def add_shuffle_key(split, iterator):
    buckets = defaultdict(list)
    for (k, v) in iterator:
        buckets[partitionFunc(k) % numPartitions].append((k, v))
    for (split, items) in buckets.iteritems():
        yield str(split)
        yield dump_pickle(Batch(items))
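# A minimal standalone sketch of the bucketing step above, assuming the
# built-in hash() as the partition function and 3 partitions. The names
# demo_partition and pairs are illustrative, not part of the original module.
from collections import defaultdict

def demo_partition(pairs, numPartitions=3, partitionFunc=hash):
    buckets = defaultdict(list)
    for (k, v) in pairs:
        buckets[partitionFunc(k) % numPartitions].append((k, v))
    return dict(buckets)

# e.g. demo_partition([("a", 1), ("b", 2), ("a", 3)]) puts both "a" pairs in
# the same bucket, which is what keeps equal keys on the same reducer.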
def main():
    split_index = read_int(sys.stdin)
    # fetch name of workdir
    spark_files_dir = load_pickle(read_with_length(sys.stdin))
    SparkFiles._root_directory = spark_files_dir
    SparkFiles._is_running_on_worker = True
    sys.path.append(spark_files_dir)
    # fetch names and values of broadcast variables
    num_broadcast_variables = read_int(sys.stdin)
    for _ in range(num_broadcast_variables):
        bid = read_long(sys.stdin)
        value = read_with_length(sys.stdin)
        _broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))
    # load the function to run and its serialization mode
    func = load_obj()
    bypassSerializer = load_obj()
    if bypassSerializer:
        dumps = lambda x: x
    else:
        dumps = dump_pickle
    iterator = read_from_pickle_file(sys.stdin)
    try:
        for obj in func(split_index, iterator):
            write_with_length(dumps(obj), old_stdout)
    except Exception as e:
        write_int(-2, old_stdout)
        write_with_length(traceback.format_exc(), old_stdout)
        sys.exit(-1)
    # Mark the beginning of the accumulators section of the output
    write_int(-1, old_stdout)
    for aid, accum in _accumulatorRegistry.items():
        write_with_length(dump_pickle((aid, accum._value)), old_stdout)
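# The worker protocol above relies on simple length-prefixed framing helpers
# (read_int, read_long, read_with_length, write_with_length, write_int).
# A minimal sketch of what such helpers could look like, assuming big-endian
# 4-byte integer framing; this is illustrative, not the exact
# pyspark.serializers implementation.
import struct

def sketch_write_int(value, stream):
    stream.write(struct.pack(">i", value))

def sketch_write_with_length(obj, stream):
    sketch_write_int(len(obj), stream)   # 4-byte length prefix
    stream.write(obj)                    # raw payload bytes

def sketch_read_int(stream):
    return struct.unpack(">i", stream.read(4))[0]

def sketch_read_with_length(stream):
    length = sketch_read_int(stream)
    return stream.read(length)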
def broadcast(self, value):
    """
    Broadcast a read-only variable to the cluster, returning a C{Broadcast}
    object for reading it in distributed functions. The variable will be
    sent to each node only once.
    """
    jbroadcast = self._jsc.broadcast(bytearray(dump_pickle(value)))
    return Broadcast(jbroadcast.id(), value, jbroadcast, self._pickled_broadcast_vars)
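# A short hedged usage sketch for broadcast(): ship a small read-only dict to
# the workers once and read it in tasks through the Broadcast object's .value.
# Assumes a local SparkContext; the master string and app name are arbitrary.
from pyspark import SparkContext

sc = SparkContext("local", "broadcast-demo")
lookup = sc.broadcast({"a": 1, "b": 2})
counts = sc.parallelize(["a", "b", "a", "c"]).map(lambda k: lookup.value.get(k, 0)).collect()
# counts == [1, 2, 1, 0]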
def main(infile, outfile):
    boot_time = time.time()
    split_index = read_int(infile)
    if split_index == -1:  # for unit tests
        return
    # fetch name of workdir
    spark_files_dir = load_pickle(read_with_length(infile))
    SparkFiles._root_directory = spark_files_dir
    SparkFiles._is_running_on_worker = True
    # fetch names and values of broadcast variables
    num_broadcast_variables = read_int(infile)
    for _ in range(num_broadcast_variables):
        bid = read_long(infile)
        value = read_with_length(infile)
        _broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))
    # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
    sys.path.append(spark_files_dir)  # *.py files that were added will be copied here
    num_python_includes = read_int(infile)
    for _ in range(num_python_includes):
        sys.path.append(os.path.join(spark_files_dir, load_pickle(read_with_length(infile))))
    # now load function
    func = load_obj(infile)
    bypassSerializer = load_obj(infile)
    if bypassSerializer:
        dumps = lambda x: x
    else:
        dumps = dump_pickle
    init_time = time.time()
    iterator = read_from_pickle_file(infile)
    try:
        for obj in func(split_index, iterator):
            write_with_length(dumps(obj), outfile)
    except Exception as e:
        write_int(-2, outfile)
        write_with_length(traceback.format_exc(), outfile)
        sys.exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    # Mark the beginning of the accumulators section of the output
    write_int(-1, outfile)
    for aid, accum in _accumulatorRegistry.items():
        write_with_length(dump_pickle((aid, accum._value)), outfile)
    write_int(-1, outfile)
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD.
    """
    numSlices = numSlices or self.defaultParallelism
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands. As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    if self.batchSize != 1:
        c = batched(c, self.batchSize)
    for x in c:
        write_with_length(dump_pickle(x), tempFile)
    tempFile.close()
    readRDDFromPickleFile = self._jvm.PythonRDD.readRDDFromPickleFile
    jrdd = readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
    return RDD(jrdd, self)
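# parallelize() above groups elements into pickled batches via batched().
# A minimal self-contained sketch of such a helper, using a trivial
# Batch-style wrapper; the real Batch/batched live in pyspark.serializers
# and may differ in detail.
class SketchBatch(object):
    def __init__(self, items):
        self.items = items

def sketch_batched(iterator, batchSize):
    items = []
    for item in iterator:
        items.append(item)
        if len(items) == batchSize:
            yield SketchBatch(items)
            items = []
    if items:
        yield SketchBatch(items)   # final partial batch

# e.g. [b.items for b in sketch_batched(range(7), 3)] -> [[0, 1, 2], [3, 4, 5], [6]]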
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD.
    """
    numSlices = numSlices or self.defaultParallelism
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands. As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False)
    atexit.register(lambda: os.unlink(tempFile.name))
    if self.batchSize != 1:
        c = batched(c, self.batchSize)
    for x in c:
        write_with_length(dump_pickle(x), tempFile)
    tempFile.close()
    jrdd = self._readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
    return RDD(jrdd, self)
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD.

    >>> sc.parallelize(range(5), 5).glom().collect()
    [[0], [1], [2], [3], [4]]
    """
    numSlices = numSlices or self.defaultParallelism
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands. As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    # Make sure we distribute data evenly if it's smaller than self.batchSize
    if "__len__" not in dir(c):
        c = list(c)  # Make it a list so we can compute its length
    batchSize = min(len(c) // numSlices, self.batchSize)
    if batchSize > 1:
        c = batched(c, batchSize)
    for x in c:
        write_with_length(dump_pickle(x), tempFile)
    tempFile.close()
    readRDDFromPickleFile = self._jvm.PythonRDD.readRDDFromPickleFile
    jrdd = readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
    return RDD(jrdd, self)
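# A tiny arithmetic illustration of the batch-size clamp above: for a
# 5-element collection split into 5 slices, len(c) // numSlices is 1, so
# batching is skipped and every slice gets one element (as the doctest shows),
# while a large collection keeps the configured batch size. The default of 10
# used here is an assumption for illustration, not read from a SparkContext.
default_batch_size = 10
for n, slices in [(5, 5), (1000, 4)]:
    batch = min(n // slices, default_batch_size)
    print("n=%d slices=%d -> batchSize=%d" % (n, slices, batch))
# n=5 slices=5 -> batchSize=1   (no batching)
# n=1000 slices=4 -> batchSize=10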
def main(infile, outfile):
    boot_time = time.time()
    split_index = read_int(infile)
    if split_index == -1:  # for unit tests
        return
    spark_files_dir = load_pickle(read_with_length(infile))
    SparkFiles._root_directory = spark_files_dir
    SparkFiles._is_running_on_worker = True
    sys.path.append(spark_files_dir)
    num_broadcast_variables = read_int(infile)
    for _ in range(num_broadcast_variables):
        bid = read_long(infile)
        value = read_with_length(infile)
        _broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))
    func = load_obj(infile)
    bypassSerializer = load_obj(infile)
    if bypassSerializer:
        dumps = lambda x: x
    else:
        dumps = dump_pickle
    init_time = time.time()
    iterator = read_from_pickle_file(infile)
    try:
        for obj in func(split_index, iterator):
            write_with_length(dumps(obj), outfile)
    except Exception as e:
        write_int(-2, outfile)
        write_with_length(traceback.format_exc(), outfile)
        sys.exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    # Mark the beginning of the accumulators section of the output
    write_int(-1, outfile)
    for aid, accum in _accumulatorRegistry.items():
        write_with_length(dump_pickle((aid, accum._value)), outfile)
    write_int(-1, outfile)
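# A hedged sketch of how a consumer of the worker's output stream could walk
# the framing that main() above produces: length-prefixed result records until
# a -1 marker, then pickled (accumulator_id, value) pairs until a final -1,
# with -2 signalling that a traceback string follows. This is illustrative of
# the protocol shown here, not the actual driver-side code; sketch_read_int is
# the same helper as in the framing sketch further up.
import pickle
import struct

def sketch_read_int(stream):
    return struct.unpack(">i", stream.read(4))[0]

def sketch_read_output(stream):
    results, accum_updates = [], []
    while True:
        length = sketch_read_int(stream)
        if length == -1:        # start of the accumulators section
            break
        if length == -2:        # worker reported an exception; traceback follows
            raise RuntimeError(stream.read(sketch_read_int(stream)))
        results.append(stream.read(length))
    while True:
        length = sketch_read_int(stream)
        if length == -1:        # end of output
            break
        accum_updates.append(pickle.loads(stream.read(length)))
    return results, accum_updates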