def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer):
    """
    Using py4j to send a large dataset to the jvm is really slow, so we use either a file
    or a socket if we have encryption enabled.

    :param data: the data to send; it is handed unchanged to ``serializer.dump_stream``.
    :param serializer: object whose ``dump_stream(data, stream)`` writes the serialized
        form of ``data`` to the given stream.
    :param reader_func: A function which takes a filename and reads in the data in the jvm
        and returns a JavaRDD. Only used when encryption is disabled.
    :param createRDDServer: A function which creates a PythonRDDServer in the jvm to
        accept the serialized data, for use when encryption is enabled.
    :return: ``server.getResult()`` when encryption is enabled, otherwise whatever
        ``reader_func`` returns.
    """
    if self._encryption_enabled:
        # with encryption, we open a server in java and send the data directly
        server = createRDDServer()
        (sock_file, _) = local_connect_and_auth(server.port(), server.secret())
        chunked_out = ChunkedStream(sock_file, 8192)
        serializer.dump_stream(data, chunked_out)
        chunked_out.close()
        # this call will block until the server has read all the data and processed it (or
        # throws an exception)
        return server.getResult()

    # without encryption, we serialize to a file, and we read the file in java and
    # parallelize from there.
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    try:
        try:
            serializer.dump_stream(data, tempFile)
        finally:
            # always release the handle, even when serialization fails: otherwise the
            # descriptor leaks and the unlink below would run on a still-open file.
            tempFile.close()
        return reader_func(tempFile.name)
    finally:
        # the file is read eagerly, so we can delete it right after.
        os.unlink(tempFile.name)
def _test_chunked_stream(self, data, py_buf_size):
    """
    Round-trip ``data`` through the chunked protocol and check it survives intact.

    Python writes ``data`` to a temporary file through a ``ChunkedStream`` created with
    ``py_buf_size``; the jvm's ``DechunkedInputStream.dechunkAndCopyToOutput`` then copies
    that file to a second temporary file, whose size and contents are compared against
    ``data`` one byte at a time. Both temporary files are removed afterwards.

    :param data: the bytes to write; must support ``len()`` and integer indexing.
    :param py_buf_size: second argument passed to ``ChunkedStream``.
    """
    encoded = tempfile.NamedTemporaryFile(delete=False)
    decoded = tempfile.NamedTemporaryFile(delete=False)
    # only the path of the output file is needed; the jvm opens it itself
    decoded.close()
    try:
        # python side: write the data using the chunked protocol
        writer = ChunkedStream(encoded, py_buf_size)
        writer.write(data)
        writer.close()

        # java side: read the chunked file and copy the decoded bytes out
        java_in = self._jvm.java.io.FileInputStream(encoded.name)
        java_out = self._jvm.java.io.FileOutputStream(decoded.name)
        self._jvm.DechunkedInputStream.dechunkAndCopyToOutput(java_in, java_out)

        # java should have decoded it back to the original data
        self.assertEqual(len(data), os.stat(decoded.name).st_size)
        with open(decoded.name, "rb") as result:
            # read(1) returns b"" at end of file, which stops the iteration
            single_bytes = iter(lambda: result.read(1), b"")
            for position, raw in enumerate(single_bytes):
                self.assertEqual(
                    data[position], bytearray(raw)[0], msg="idx = " + str(position))
    finally:
        for leftover in (encoded.name, decoded.name):
            os.unlink(leftover)
def __init__(self, sc=None, value=None, pickle_registry=None, path=None, sock_file=None):
    """
    Should not be called directly by users -- use L{SparkContext.broadcast()}
    instead.

    With ``sc`` given (driver side), ``value`` is written out with ``self.dump`` -- to a
    temporary file under ``sc._temp_dir``, or to a socket served by the jvm when
    ``sc._encryption_enabled`` is set -- and the jvm broadcast is created from it.
    Without ``sc`` (executor side), the value is either loaded right away from
    ``sock_file`` or ``path`` is remembered for later.

    :param sc: the context on the driver; ``None`` on an executor.
    :param value: the object handed to ``self.dump`` (driver side only).
    :param pickle_registry: stored as ``self._pickle_registry`` (driver side only).
    :param path: file holding the pickled data; required on an executor when
        ``sock_file`` is not given.
    :param sock_file: stream handed to ``self.load`` on an executor.
    """
    if sc is None:
        # we're on an executor
        self._jbroadcast = None
        self._sc = None
        self._python_broadcast = None
        if sock_file is None:
            # the jvm just dumps the pickled data in path -- we'll unpickle lazily when
            # the value is requested
            assert (path is not None)
            self._path = path
        else:
            # the jvm is doing decryption for us. Read the value immediately from the
            # sock_file
            self._value = self.load(sock_file)
        return

    # we're on the driver. We want the pickled data to end up in a file (maybe encrypted)
    backing_file = NamedTemporaryFile(delete=False, dir=sc._temp_dir)
    self._path = backing_file.name
    self._sc = sc
    self._python_broadcast = sc._jvm.PythonRDD.setupBroadcast(self._path)
    if sc._encryption_enabled:
        # with encryption, we ask the jvm to do the encryption for us, we send it data
        # over a socket
        # NOTE(review): in this branch `backing_file` is neither written to nor closed by
        # this method -- confirm whether the open handle is intentional.
        port, auth_secret = self._python_broadcast.setupEncryptionServer()
        (encryption_sock_file, _) = local_connect_and_auth(port, auth_secret)
        self.dump(value, ChunkedStream(encryption_sock_file, 8192))
        self._python_broadcast.waitTillDataReceived()
    else:
        # no encryption, we can just write pickled data directly to the file from python
        self.dump(value, backing_file)
    self._jbroadcast = sc._jsc.broadcast(self._python_broadcast)
    self._pickle_registry = pickle_registry