def test_store_pandas_dataframe(self):
    """Round-trip a pandas DataFrame through the plasma object store."""
    import pyarrow.plasma as plasma
    frame = pd.DataFrame({
        'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd']),
    })

    # First serialization pass into a mock sink, purely to learn the
    # exact number of bytes the stream will occupy.
    batch = pa.RecordBatch.from_pandas(frame)
    mock_sink = pa.MockOutputStream()
    writer = pa.RecordBatchStreamWriter(mock_sink, batch.schema)
    writer.write_batch(batch)
    nbytes = mock_sink.size()

    # Allocate a plasma buffer of exactly that size under a random
    # 20-byte ID, write the real bytes, and seal to publish the object.
    object_id = plasma.ObjectID(np.random.bytes(20))
    buf = self.plasma_client.create(object_id, nbytes)
    sink = pa.FixedSizeBufferOutputStream(buf)
    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    writer.write_batch(batch)
    self.plasma_client.seal(object_id)

    # Fetch the sealed buffer back, decode it, and compare frames.
    [data] = self.plasma_client.get_buffers([object_id])
    reader = pa.RecordBatchStreamReader(pa.BufferReader(data))
    result = reader.get_next_batch().to_pandas()
    pd.util.testing.assert_frame_equal(frame, result)
def test_store_arrow_objects(self):
    """Round-trip an Arrow tensor through the plasma object store."""
    original = np.random.randn(10, 4)

    # Serialize the tensor into a freshly allocated plasma buffer
    # sized exactly for its serialized form, then seal to publish it.
    object_id = random_object_id()
    tensor = pa.Tensor.from_numpy(original)
    buf = self.plasma_client.create(object_id, pa.get_tensor_size(tensor))
    stream = pa.FixedSizeBufferOutputStream(buf)
    pa.write_tensor(tensor, stream)
    self.plasma_client.seal(object_id)

    # Read the object back out and check it matches the source array.
    [payload] = self.plasma_client.get_buffers([object_id])
    restored = pa.read_tensor(pa.BufferReader(payload)).to_numpy()
    np.testing.assert_equal(original, restored)
def put_df(df):
    """Serialize *df* into the plasma object store and return its ObjectID.

    NOTE: relies on the module-level ``client`` plasma connection being
    open; the caller is responsible for establishing it.
    """
    batch = pa.RecordBatch.from_pandas(df)

    # Dry-run serialization into a mock sink just to measure the size
    # of the record batch stream (data plus schema).
    measure = pa.MockOutputStream()
    writer = pa.RecordBatchStreamWriter(measure, batch.schema)
    writer.write_batch(batch)

    # Allocate an exactly-sized store buffer under a random 20-byte ID.
    object_id = plasma.ObjectID(np.random.bytes(20))
    buf = client.create(object_id, measure.size())

    # Second pass writes the real bytes into the fixed-size buffer;
    # sealing makes the object visible to other plasma clients.
    sink = pa.FixedSizeBufferOutputStream(buf)
    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    writer.write_batch(batch)
    client.seal(object_id)
    return object_id