def value(self):
    """ Return the broadcasted value """
    if not hasattr(self, "_value") and self.path is not None:
        ser = CompressedSerializer(PickleSerializer())
        self._value = ser.load_stream(open(self.path)).next()
    return self._value
def __getattr__(self, item):
    if item == 'value' and self.path is not None:
        ser = CompressedSerializer(PickleSerializer())
        value = ser.load_stream(open(self.path)).next()
        self.value = value
        return value
    raise AttributeError(item)
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            return

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        sys.path.append(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            sys.path.append(os.path.join(spark_files_dir, filename))

        # fetch names and values of broadcast variables
        num_broadcast_variables = read_int(infile)
        ser = CompressedSerializer(pickleSer)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                value = ser._read_with_length(infile)
                _broadcastRegistry[bid] = Broadcast(bid, value)
            else:
                # a negative id means the broadcast was destroyed on the driver;
                # the registry is a dict, so drop it with pop(), not remove()
                bid = -bid - 1
                _broadcastRegistry.pop(bid)

        _accumulatorRegistry.clear()
        command = pickleSer._read_with_length(infile)
        (func, deserializer, serializer) = command
        init_time = time.time()
        iterator = deserializer.load_stream(infile)
        serializer.dump_stream(func(split_index, iterator), outfile)
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc(), outfile)
            outfile.flush()
        except IOError:
            # JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print >> sys.stderr, "PySpark worker failed with exception:"
            print >> sys.stderr, traceback.format_exc()
        exit(-1)

    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            return

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        sys.path.append(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            sys.path.append(os.path.join(spark_files_dir, filename))

        # fetch names and values of broadcast variables
        num_broadcast_variables = read_int(infile)
        ser = CompressedSerializer(pickleSer)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            value = ser._read_with_length(infile)
            _broadcastRegistry[bid] = Broadcast(bid, value)

        command = pickleSer._read_with_length(infile)
        (func, deserializer, serializer) = command
        init_time = time.time()
        iterator = deserializer.load_stream(infile)
        serializer.dump_stream(func(split_index, iterator), outfile)
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc(), outfile)
            outfile.flush()
        except IOError:
            # JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print >> sys.stderr, "PySpark worker failed with exception:"
            print >> sys.stderr, traceback.format_exc()
        exit(-1)

    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)
def broadcast(self, value):
    """
    Broadcast a read-only variable to the cluster, returning a
    L{Broadcast<pyspark.broadcast.Broadcast>} object for reading it in
    distributed functions. The variable will be sent to each node only once.
    """
    ser = CompressedSerializer(PickleSerializer())
    # passing a large object through py4j is very slow and uses a lot of memory,
    # so dump it to a temp file and let the JVM read it from there instead
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    ser.dump_stream([value], tempFile)
    tempFile.close()
    jbroadcast = self._jvm.PythonRDD.readBroadcastFromFile(self._jsc, tempFile.name)
    return Broadcast(jbroadcast.id(), None, jbroadcast,
                     self._pickled_broadcast_vars, tempFile.name)
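# Usage sketch (not part of the snippet above): assumes a live SparkContext `sc`.
# The driver dumps the value once through CompressedSerializer to a temp file,
# and each executor loads it back lazily the first time Broadcast.value is read.
lookup = {"a": 1, "b": 2}
b = sc.broadcast(lookup)
counts = sc.parallelize(["a", "b", "a"]).map(lambda k: b.value.get(k, 0)).collect()
# counts == [1, 2, 1]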
def _open_file(self):
    dirs = _get_local_dirs("objects")
    d = dirs[id(self) % len(dirs)]
    if not os.path.exists(d):
        os.makedirs(d)
    p = os.path.join(d, str(id(self)))
    self._file = open(p, "w+b", 65536)
    self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
    # unlink the path immediately: the open file handle keeps the spill data
    # alive, and the OS reclaims the space as soon as the handle is closed
    os.unlink(p)
def test_compressed_serializer(self):
    ser = CompressedSerializer(PickleSerializer())
    from io import BytesIO as StringIO
    io = StringIO()
    ser.dump_stream(["abc", u"123", range(5)], io)
    io.seek(0)
    self.assertEqual(["abc", u"123", range(5)], list(ser.load_stream(io)))
    ser.dump_stream(range(1000), io)
    io.seek(0)
    self.assertEqual(["abc", u"123", range(5)] + list(range(1000)),
                     list(ser.load_stream(io)))
    io.close()
def test_hash_serializer(self):
    hash(NoOpSerializer())
    hash(UTF8Deserializer())
    hash(CPickleSerializer())
    hash(MarshalSerializer())
    hash(AutoSerializer())
    hash(BatchedSerializer(CPickleSerializer()))
    hash(AutoBatchedSerializer(MarshalSerializer()))
    hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer()))
    hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer()))
    hash(CompressedSerializer(CPickleSerializer()))
    hash(FlattenedValuesSerializer(CPickleSerializer()))
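# Hedged illustration of why the hashes above matter: assuming the standard
# pyspark.serializers equality semantics, serializers with the same
# configuration compare and hash equal, so they can be deduplicated
# (e.g. kept in a set or used as dict keys).
s1 = BatchedSerializer(CPickleSerializer())
s2 = BatchedSerializer(CPickleSerializer())
assert s1 == s2 and hash(s1) == hash(s2)
assert len({s1, s2}) == 1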
def test_compressed_serializer(self):
    ser = CompressedSerializer(PickleSerializer())
    try:
        from StringIO import StringIO
    except ImportError:
        from io import BytesIO as StringIO
    io = StringIO()
    ser.dump_stream(["abc", u"123", range(5)], io)
    io.seek(0)
    self.assertEqual(["abc", u"123", range(5)], list(ser.load_stream(io)))
    ser.dump_stream(range(1000), io)
    io.seek(0)
    self.assertEqual(["abc", u"123", range(5)] + list(range(1000)),
                     list(ser.load_stream(io)))
    io.close()
def _compressed_serializer(self, serializer=None):
    # always use PickleSerializer to simplify implementation
    ser = PickleSerializer()
    return AutoBatchedSerializer(CompressedSerializer(ser))
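# Round-trip sketch for the serializer returned above (illustrative only):
# dump a few objects into an in-memory buffer and read them back, showing that
# AutoBatchedSerializer(CompressedSerializer(...)) is an ordinary
# dump_stream/load_stream pair.
from io import BytesIO

ser = AutoBatchedSerializer(CompressedSerializer(PickleSerializer()))
buf = BytesIO()
ser.dump_stream([{"k": 1}, {"k": 2}], buf)  # pickled, zlib-compressed batches
buf.seek(0)
assert list(ser.load_stream(buf)) == [{"k": 1}, {"k": 2}]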
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            exit(-1)

        # initialize global state
        shuffle.MemoryBytesSpilled = 0
        shuffle.DiskBytesSpilled = 0
        _accumulatorRegistry.clear()

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        add_path(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            add_path(os.path.join(spark_files_dir, filename))

        # fetch names and values of broadcast variables
        num_broadcast_variables = read_int(infile)
        ser = CompressedSerializer(pickleSer)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                value = ser._read_with_length(infile)
                _broadcastRegistry[bid] = Broadcast(bid, value)
            else:
                bid = -bid - 1
                _broadcastRegistry.pop(bid)

        _accumulatorRegistry.clear()
        command = pickleSer._read_with_length(infile)
        if isinstance(command, Broadcast):
            command = pickleSer.loads(command.value)
        (func, stats, deserializer, serializer) = command
        init_time = time.time()

        def process():
            iterator = deserializer.load_stream(infile)
            serializer.dump_stream(func(split_index, iterator), outfile)

        if stats:
            p = cProfile.Profile()
            p.runcall(process)
            st = pstats.Stats(p)
            st.stream = None  # make it picklable
            stats.add(st.strip_dirs())
        else:
            process()
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc(), outfile)
        except IOError:
            # JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print >> sys.stderr, "PySpark worker failed with exception:"
            print >> sys.stderr, traceback.format_exc()
        exit(-1)

    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    write_long(shuffle.MemoryBytesSpilled, outfile)
    write_long(shuffle.DiskBytesSpilled, outfile)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)

    # check end of stream
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        exit(-1)