Example #1
    def test_store_pandas_dataframe(self):
        import pyarrow.plasma as plasma
        d = {
            'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
            'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])
        }
        df = pd.DataFrame(d)

        # Write the DataFrame.
        record_batch = pa.RecordBatch.from_pandas(df)
        # Determine the size of the serialized batch by writing it to a mock sink first.
        s = pa.MockOutputStream()
        stream_writer = pa.RecordBatchStreamWriter(s, record_batch.schema)
        stream_writer.write_batch(record_batch)
        data_size = s.size()
        object_id = plasma.ObjectID(np.random.bytes(20))

        buf = self.plasma_client.create(object_id, data_size)
        stream = pa.FixedSizeBufferOutputStream(buf)
        stream_writer = pa.RecordBatchStreamWriter(stream, record_batch.schema)
        stream_writer.write_batch(record_batch)

        self.plasma_client.seal(object_id)

        # Read the DataFrame.
        [data] = self.plasma_client.get_buffers([object_id])
        reader = pa.RecordBatchStreamReader(pa.BufferReader(data))
        result = reader.get_next_batch().to_pandas()

        pd.util.testing.assert_frame_equal(df, result)
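
These test methods assume a Plasma object store is already running and that self.plasma_client is connected to it. A minimal setup sketch for that precondition; the socket path and store size below are assumptions chosen purely for illustration:

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.plasma as plasma

# Connect to a Plasma store that was started separately, e.g. with
#   plasma_store -m 1000000000 -s /tmp/plasma
# The socket path and memory size here are illustrative only.
plasma_client = plasma.connect("/tmp/plasma")
# Note: older pyarrow releases also required a manager socket and a
# release delay, e.g. plasma.connect("/tmp/plasma", "", 0).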
Example #2
    def test_store_arrow_objects(self):
        data = np.random.randn(10, 4)
        # Write an Arrow tensor.
        # random_object_id() is a test helper equivalent to
        # plasma.ObjectID(np.random.bytes(20)).
        object_id = random_object_id()
        tensor = pa.Tensor.from_numpy(data)
        data_size = pa.get_tensor_size(tensor)
        buf = self.plasma_client.create(object_id, data_size)
        stream = pa.FixedSizeBufferOutputStream(buf)
        pa.write_tensor(tensor, stream)
        self.plasma_client.seal(object_id)
        # Read the Arrow tensor back from the object store.
        [data_buf] = self.plasma_client.get_buffers([object_id])
        reader = pa.BufferReader(data_buf)
        array = pa.read_tensor(reader).to_numpy()
        # Assert that the round-tripped array matches the original.
        np.testing.assert_equal(data, array)
Example #3
def put_df(df):
    record_batch = pa.RecordBatch.from_pandas(df)

    # Measure the size of the serialized record batch (schema + data)
    mock_sink = pa.MockOutputStream()
    stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
    stream_writer.write_batch(record_batch)
    data_size = mock_sink.size()

    # Generate an ID and allocate a buffer in the object store for the
    # serialized DataFrame
    object_id = plasma.ObjectID(np.random.bytes(20))
    buf = client.create(object_id, data_size)

    # Write the serialized DataFrame to the object store
    sink = pa.FixedSizeBufferOutputStream(buf)
    stream_writer = pa.RecordBatchStreamWriter(sink, record_batch.schema)
    stream_writer.write_batch(record_batch)

    # Seal the object
    client.seal(object_id)

    return object_id
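
put_df returns the ObjectID needed to retrieve the DataFrame later. A companion sketch for the read side, following the same pattern as Example #1; the name get_df is illustrative, and it assumes the same module-level client used by put_df:

def get_df(object_id):
    # Fetch the sealed, immutable buffer from the object store.
    [data] = client.get_buffers([object_id])

    # Reconstruct the record batch and convert it back to pandas.
    reader = pa.RecordBatchStreamReader(pa.BufferReader(data))
    return reader.get_next_batch().to_pandas()

# Usage (illustrative):
#   object_id = put_df(df)
#   df_roundtrip = get_df(object_id)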