Esempio n. 1
0
 def _serialize_to_jvm(self, data, parallelism, serializer):
     """
     Using py4j to send a large dataset to the jvm is really slow, so we use either a file
     or a socket if we have encryption enabled.

     :param data: the records to send; handed unchanged to ``serializer.dump_stream``.
     :param parallelism: forwarded to the jvm (to ``PythonParallelizeServer`` when encryption
            is enabled, otherwise to ``PythonRDD.readRDDFromFile``).
     :param serializer: object whose ``dump_stream(data, stream)`` writes ``data`` to the
            socket or to the temporary file.
     :return: the value produced on the jvm side: ``server.getResult()`` with encryption,
            the result of ``readRDDFromFile`` without.
     """
     if self._encryption_enabled:
         # with encryption, we open a server in java and send the data directly
         server = self._jvm.PythonParallelizeServer(self._jsc.sc(), parallelism)
         (sock_file, _) = local_connect_and_auth(server.port(), server.secret())
         chunked_out = ChunkedStream(sock_file, 8192)
         serializer.dump_stream(data, chunked_out)
         chunked_out.close()
         # this call will block until the server has read all the data and processed it (or
         # throws an exception)
         return server.getResult()
     else:
         # without encryption, we serialize to a file, and we read the file in java and
         # parallelize from there.
         tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
         try:
             try:
                 serializer.dump_stream(data, tempFile)
             finally:
                 # close the file even when serialization fails: otherwise the handle is
                 # leaked, and the unlink below cannot succeed on platforms that refuse to
                 # delete a file that is still open.
                 tempFile.close()
             readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
             return readRDDFromFile(self._jsc, tempFile.name, parallelism)
         finally:
             # we eagerly read the file so we can delete right after.
             os.unlink(tempFile.name)
Esempio n. 2
0
 def _test_chunked_stream(self, data, py_buf_size):
     """
     Round-trip ``data`` through the chunked protocol: python writes it with a
     ``ChunkedStream`` using a buffer of ``py_buf_size``, the jvm dechunks it into a second
     file, and the result is compared with ``data`` byte by byte.
     """
     encoded = tempfile.NamedTemporaryFile(delete=False)
     decoded = tempfile.NamedTemporaryFile(delete=False)
     # only the path of the output file is needed; the jvm opens it on its own.
     decoded.close()
     try:
         # python side: emit the data in chunked form.
         writer = ChunkedStream(encoded, py_buf_size)
         writer.write(data)
         writer.close()

         # jvm side: strip the chunking again and copy the payload to the second file.
         java_in = self._jvm.java.io.FileInputStream(encoded.name)
         java_out = self._jvm.java.io.FileOutputStream(decoded.name)
         self._jvm.DechunkedInputStream.dechunkAndCopyToOutput(java_in, java_out)

         # the decoded file must have the same size and the same contents as the input.
         self.assertEqual(len(data), os.stat(decoded.name).st_size)
         with open(decoded.name, "rb") as result:
             for idx, byte in enumerate(iter(lambda: result.read(1), b"")):
                 self.assertEqual(data[idx], bytearray(byte)[0], msg="idx = " + str(idx))
     finally:
         for leftover in (encoded.name, decoded.name):
             os.unlink(leftover)
Esempio n. 3
0
 def _test_chunked_stream(self, data, py_buf_size):
     """
     Check that data written by python's ``ChunkedStream`` (with a buffer of ``py_buf_size``)
     is restored to the original bytes once the jvm has dechunked it.
     """
     src_file = tempfile.NamedTemporaryFile(delete=False)
     dst_file = tempfile.NamedTemporaryFile(delete=False)
     # the destination is written from the jvm, so python only keeps its name.
     dst_file.close()
     try:
         # produce the chunked representation from python.
         stream = ChunkedStream(src_file, py_buf_size)
         stream.write(data)
         stream.close()

         # let java read the chunked file and write the plain bytes to the destination.
         jvm_source = self._jvm.java.io.FileInputStream(src_file.name)
         jvm_sink = self._jvm.java.io.FileOutputStream(dst_file.name)
         self._jvm.DechunkedInputStream.dechunkAndCopyToOutput(jvm_source, jvm_sink)

         # same length first, then every single byte has to match the original data.
         self.assertEqual(len(data), os.stat(dst_file.name).st_size)
         with open(dst_file.name, "rb") as plain:
             idx = 0
             while True:
                 byte = plain.read(1)
                 if not byte:
                     break
                 self.assertEqual(data[idx],
                                  bytearray(byte)[0],
                                  msg="idx = " + str(idx))
                 idx += 1
     finally:
         os.unlink(src_file.name)
         os.unlink(dst_file.name)
Esempio n. 4
0
 def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer):
     """
     Using py4j to send a large dataset to the jvm is really slow, so we use either a file
     or a socket if we have encryption enabled.

     :param data: the records to send; handed unchanged to ``serializer.dump_stream``.
     :param serializer: object whose ``dump_stream(data, stream)`` writes ``data`` to the
            socket or to the temporary file.
     :param reader_func:  A function which takes a filename and reads in the data in the jvm and
             returns a JavaRDD. Only used when encryption is disabled.
     :param createRDDServer:  A function which creates a PythonRDDServer in the jvm to
            accept the serialized data, for use when encryption is enabled.
     :return: ``server.getResult()`` when encryption is enabled, otherwise the value
            returned by ``reader_func``.
     """
     if self._encryption_enabled:
         # with encryption, we open a server in java and send the data directly
         server = createRDDServer()
         (sock_file, _) = local_connect_and_auth(server.port(), server.secret())
         chunked_out = ChunkedStream(sock_file, 8192)
         serializer.dump_stream(data, chunked_out)
         chunked_out.close()
         # this call will block until the server has read all the data and processed it (or
         # throws an exception)
         return server.getResult()
     else:
         # without encryption, we serialize to a file, and we read the file in java and
         # parallelize from there.
         tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
         try:
             try:
                 serializer.dump_stream(data, tempFile)
             finally:
                 # close the file even when serialization fails: otherwise the handle is
                 # leaked, and the unlink below cannot succeed on platforms that refuse to
                 # delete a file that is still open.
                 tempFile.close()
             return reader_func(tempFile.name)
         finally:
             # the file is read eagerly, so we can delete it right after.
             os.unlink(tempFile.name)
Esempio n. 5
0
 def _serialize_to_jvm(self, data, serializer, reader_func,
                       createRDDServer):
     """
     Hand a serialized dataset to the jvm without pushing it through py4j, which is really
     slow for large data. A temporary file is used when encryption is disabled, a socket
     to a jvm-side server when it is enabled.

     :param data: the records to send; handed unchanged to ``serializer.dump_stream``.
     :param serializer: object whose ``dump_stream(data, stream)`` writes ``data`` to the
            file or socket.
     :param reader_func:  A function which takes a filename and reads in the data in the jvm and
             returns a JavaRDD. Only used when encryption is disabled.
     :param createRDDServer:  A function which creates a PythonRDDServer in the jvm to
            accept the serialized data, for use when encryption is enabled.
     :return: the value returned by ``reader_func`` without encryption, otherwise
            ``server.getResult()``.
     """
     if not self._encryption_enabled:
         # plain path: dump everything into a temp file and let java read that file.
         spill_file = NamedTemporaryFile(delete=False, dir=self._temp_dir)
         try:
             try:
                 serializer.dump_stream(data, spill_file)
             finally:
                 # always release the handle, whether or not the dump succeeded.
                 spill_file.close()
             return reader_func(spill_file.name)
         finally:
             # the file is read eagerly, so it can be removed right away.
             os.unlink(spill_file.name)

     # encrypted path: java opens a server and the data is streamed to it directly.
     rdd_server = createRDDServer()
     port = rdd_server.port()
     secret = rdd_server.secret()
     sock_file, _ = local_connect_and_auth(port, secret)
     stream = ChunkedStream(sock_file, 8192)
     serializer.dump_stream(data, stream)
     stream.close()
     # blocks until the server has read and processed all the data (or throws an
     # exception).
     return rdd_server.getResult()