Example #1
 def value(self):
     """ Return the broadcasted value
     """
     if not hasattr(self, "_value") and self.path is not None:
         ser = CompressedSerializer(PickleSerializer())
         self._value = ser.load_stream(open(self.path)).next()
     return self._value
Example #3
    def __getattr__(self, item):
        if item == 'value' and self.path is not None:
            ser = CompressedSerializer(PickleSerializer())
            value = ser.load_stream(open(self.path)).next()
            self.value = value
            return value

        raise AttributeError(item)
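Examples #1 and #3 lazily load the broadcast value from a file that was written with CompressedSerializer. Below is a minimal standalone sketch of that round trip, written for Python 3 (hence next() rather than the .next() used in these Python 2-era snippets); it assumes pyspark is importable, and the temp file and sample payload are made up for illustration. In newer PySpark versions the pickle serializer is spelled CPickleSerializer, as in Example #10.

from tempfile import NamedTemporaryFile
from pyspark.serializers import CompressedSerializer, PickleSerializer

ser = CompressedSerializer(PickleSerializer())

# write a single-element stream, as SparkContext.broadcast does in Examples #6/#7
with NamedTemporaryFile(delete=False) as tmp:
    ser.dump_stream([{"weights": list(range(10))}], tmp)
    path = tmp.name

# read the first (and only) item back, as Broadcast.value does above
with open(path, "rb") as f:
    value = next(ser.load_stream(f))
print(value)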
Example #4
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            return

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        sys.path.append(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            sys.path.append(os.path.join(spark_files_dir, filename))

        # fetch names and values of broadcast variables
        num_broadcast_variables = read_int(infile)
        ser = CompressedSerializer(pickleSer)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                value = ser._read_with_length(infile)
                _broadcastRegistry[bid] = Broadcast(bid, value)
            else:
                bid = -bid - 1
                _broadcastRegistry.pop(bid)

        _accumulatorRegistry.clear()
        command = pickleSer._read_with_length(infile)
        (func, deserializer, serializer) = command
        init_time = time.time()
        iterator = deserializer.load_stream(infile)
        serializer.dump_stream(func(split_index, iterator), outfile)
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc(), outfile)
            outfile.flush()
        except IOError:
            # the JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print >> sys.stderr, "PySpark worker failed with exception:"
            print >> sys.stderr, traceback.format_exc()
        exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)
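The broadcast loop above follows a sign convention on the id it reads: a non-negative id registers a new broadcast value, and a removal is encoded as -(id + 1). A small standalone sketch of that bookkeeping, with a plain dict and made-up names standing in for _broadcastRegistry and Broadcast:

registry = {}   # stand-in for _broadcastRegistry

def handle_broadcast_id(bid, value=None):
    if bid >= 0:
        registry[bid] = value          # register an already-deserialized value
    else:
        registry.pop(-bid - 1, None)   # decode the id and drop the entry

handle_broadcast_id(0, {"lookup": "table"})
handle_broadcast_id(-1)                # encodes removal of broadcast id 0
assert 0 not in registry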
Example #5
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            return

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        sys.path.append(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            sys.path.append(os.path.join(spark_files_dir, filename))

        # fetch names and values of broadcast variables
        num_broadcast_variables = read_int(infile)
        ser = CompressedSerializer(pickleSer)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            value = ser._read_with_length(infile)
            _broadcastRegistry[bid] = Broadcast(bid, value)

        command = pickleSer._read_with_length(infile)
        (func, deserializer, serializer) = command
        init_time = time.time()
        iterator = deserializer.load_stream(infile)
        serializer.dump_stream(func(split_index, iterator), outfile)
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc(), outfile)
            outfile.flush()
        except IOError:
            # the JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print >> sys.stderr, "PySpark worker failed with exception:"
            print >> sys.stderr, traceback.format_exc()
        exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)
Example #6
 def broadcast(self, value):
     """
     Broadcast a read-only variable to the cluster, returning a
     L{Broadcast<pyspark.broadcast.Broadcast>}
     object for reading it in distributed functions. The variable will
     be sent to each cluster only once.
     """
     ser = CompressedSerializer(PickleSerializer())
     # passing a large object through py4j is very slow and needs a lot of memory
     tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
     ser.dump_stream([value], tempFile)
     tempFile.close()
     jbroadcast = self._jvm.PythonRDD.readBroadcastFromFile(self._jsc, tempFile.name)
     return Broadcast(jbroadcast.id(), None, jbroadcast, self._pickled_broadcast_vars, tempFile.name)
Example #7
 def broadcast(self, value):
     """
     Broadcast a read-only variable to the cluster, returning a
     L{Broadcast<pyspark.broadcast.Broadcast>}
     object for reading it in distributed functions. The variable will
     be sent to each cluster only once.
     """
     ser = CompressedSerializer(PickleSerializer())
     # passing a large object through py4j is very slow and needs a lot of memory
     tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
     ser.dump_stream([value], tempFile)
     tempFile.close()
     jbroadcast = self._jvm.PythonRDD.readBroadcastFromFile(self._jsc, tempFile.name)
     return Broadcast(jbroadcast.id(), None, jbroadcast,
                      self._pickled_broadcast_vars, tempFile.name)
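Both versions of broadcast() spill the pickled, compressed value to a temp file and hand the JVM only the file name, instead of pushing the bytes through py4j. A hypothetical driver-side usage sketch, assuming a local SparkContext; the app name and dictionary are illustrative:

from pyspark import SparkContext

sc = SparkContext("local[2]", "broadcast-demo")
lookup = sc.broadcast({"a": 1, "b": 2})   # value goes to a temp file via CompressedSerializer

counts = sc.parallelize(["a", "b", "a"]).map(lambda k: lookup.value[k]).collect()
print(counts)   # [1, 2, 1]
sc.stop()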
Example #8
 def _open_file(self):
     dirs = _get_local_dirs("objects")
     d = dirs[id(self) % len(dirs)]
     if not os.path.exists(d):
         os.makedirs(d)
     p = os.path.join(d, str(id(self)))
     self._file = open(p, "w+b", 65536)
     self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
     os.unlink(p)
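The helper above opens a buffered spill file and immediately unlinks its path, so the data exists only as long as the open handle and disappears automatically when the handle is closed or the process dies. A stdlib-only sketch of that pattern on a POSIX system; the file name is made up:

import os
import tempfile

path = os.path.join(tempfile.gettempdir(), "spill-demo")
f = open(path, "w+b", 65536)   # keep the handle...
os.unlink(path)                # ...but drop the name: nothing is left on disk afterwards

f.write(b"spilled bytes")
f.seek(0)
print(f.read())                # the data is still readable through the handle
f.close()                      # the space is reclaimed here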
Example #9
 def test_compressed_serializer(self):
     ser = CompressedSerializer(PickleSerializer())
     from io import BytesIO as StringIO
     io = StringIO()
     ser.dump_stream(["abc", u"123", range(5)], io)
     io.seek(0)
     self.assertEqual(["abc", u"123", range(5)], list(ser.load_stream(io)))
     ser.dump_stream(range(1000), io)
     io.seek(0)
     self.assertEqual(["abc", u"123", range(5)] + list(range(1000)),
                      list(ser.load_stream(io)))
     io.close()
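The test above round-trips a mixed stream through CompressedSerializer. Conceptually the serializer wraps another serializer and compresses each framed record; the stdlib-only sketch below mirrors that idea and is an illustration of the technique, not the PySpark implementation:

import pickle
import zlib

def compressed_dumps(obj):
    return zlib.compress(pickle.dumps(obj))

def compressed_loads(data):
    return pickle.loads(zlib.decompress(data))

record = {"id": 7, "payload": list(range(100))}
assert compressed_loads(compressed_dumps(record)) == record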
Example #10
 def test_hash_serializer(self):
     hash(NoOpSerializer())
     hash(UTF8Deserializer())
     hash(CPickleSerializer())
     hash(MarshalSerializer())
     hash(AutoSerializer())
     hash(BatchedSerializer(CPickleSerializer()))
     hash(AutoBatchedSerializer(MarshalSerializer()))
     hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer()))
     hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer()))
     hash(CompressedSerializer(CPickleSerializer()))
     hash(FlattenedValuesSerializer(CPickleSerializer()))
Example #11
 def test_compressed_serializer(self):
     ser = CompressedSerializer(PickleSerializer())
     try:
         from StringIO import StringIO
     except ImportError:
         from io import BytesIO as StringIO
     io = StringIO()
     ser.dump_stream(["abc", u"123", range(5)], io)
     io.seek(0)
     self.assertEqual(["abc", u"123", range(5)], list(ser.load_stream(io)))
     ser.dump_stream(range(1000), io)
     io.seek(0)
     self.assertEqual(["abc", u"123", range(5)] + list(range(1000)), list(ser.load_stream(io)))
     io.close()
Example #12
def _compressed_serializer(self, serializer=None):
    # always use PickleSerializer to simplify implementation
    ser = PickleSerializer()
    return AutoBatchedSerializer(CompressedSerializer(ser))
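The helper above layers AutoBatchedSerializer on top of CompressedSerializer, so records are grouped into batches and each batch is compressed as a unit. A short sketch of using the combined serializer on an in-memory stream, assuming pyspark is installed (newer versions spell the pickle serializer CPickleSerializer):

from io import BytesIO
from pyspark.serializers import (AutoBatchedSerializer, CompressedSerializer,
                                 PickleSerializer)

ser = AutoBatchedSerializer(CompressedSerializer(PickleSerializer()))

buf = BytesIO()
ser.dump_stream((i * i for i in range(10000)), buf)   # batched, then compressed
buf.seek(0)
assert list(ser.load_stream(buf)) == [i * i for i in range(10000)]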
Example #13
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            exit(-1)

        # initialize global state
        shuffle.MemoryBytesSpilled = 0
        shuffle.DiskBytesSpilled = 0
        _accumulatorRegistry.clear()

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        add_path(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            add_path(os.path.join(spark_files_dir, filename))

        # fetch names and values of broadcast variables
        num_broadcast_variables = read_int(infile)
        ser = CompressedSerializer(pickleSer)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                value = ser._read_with_length(infile)
                _broadcastRegistry[bid] = Broadcast(bid, value)
            else:
                bid = -bid - 1
                _broadcastRegistry.pop(bid)

        _accumulatorRegistry.clear()
        command = pickleSer._read_with_length(infile)
        if isinstance(command, Broadcast):
            command = pickleSer.loads(command.value)
        (func, stats, deserializer, serializer) = command
        init_time = time.time()

        def process():
            iterator = deserializer.load_stream(infile)
            serializer.dump_stream(func(split_index, iterator), outfile)

        if stats:
            p = cProfile.Profile()
            p.runcall(process)
            st = pstats.Stats(p)
            st.stream = None  # make it picklable
            stats.add(st.strip_dirs())
        else:
            process()
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc(), outfile)
        except IOError:
            # the JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print >> sys.stderr, "PySpark worker failed with exception:"
            print >> sys.stderr, traceback.format_exc()
        exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    write_long(shuffle.MemoryBytesSpilled, outfile)
    write_long(shuffle.DiskBytesSpilled, outfile)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)

    # check end of stream
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        exit(-1)
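When stats is set, the worker above runs the task under cProfile and folds the resulting pstats into the stats object it was handed. A stdlib-only sketch of that profiling pattern; the workload function is made up:

import cProfile
import pstats

def work():
    return sum(i * i for i in range(100000))

p = cProfile.Profile()
p.runcall(work)                 # run the callable under the profiler
st = pstats.Stats(p)
st.strip_dirs().sort_stats("cumulative").print_stats(5)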