Example 1
import numpy as np
import pandas as pd
import pyarrow as pa


def test_get_record_batch_size():
    N = 10
    itemsize = 8  # bytes per float64 value
    df = pd.DataFrame({'foo': np.random.randn(N)})

    batch = pa.RecordBatch.from_pandas(df)
    # The serialized batch carries metadata on top of the raw data,
    # so its size must exceed N * itemsize bytes.
    assert pa.get_record_batch_size(batch) > (N * itemsize)
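
The assertion above only checks a lower bound: the serialized batch includes IPC metadata and padding on top of the 10 × 8 bytes of raw float64 data. As a minimal sketch (assuming a recent pyarrow, where the same helper is exposed as pa.ipc.get_record_batch_size), the size of a complete IPC stream can be counted with a MockOutputStream and compared against it:

import numpy as np
import pandas as pd
import pyarrow as pa

batch = pa.RecordBatch.from_pandas(pd.DataFrame({'foo': np.random.randn(10)}))

# Size of the serialized batch message alone (metadata + body + padding).
batch_size = pa.ipc.get_record_batch_size(batch)

# Size of a complete IPC stream (schema message + batch + end-of-stream marker),
# counted by a sink that discards the bytes written to it.
mock = pa.MockOutputStream()
with pa.ipc.new_stream(mock, batch.schema) as writer:
    writer.write_batch(batch)

assert mock.size() >= batch_size > 10 * 8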
Example 2
    def test_store_pandas_dataframe(self):
        d = {
            'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
            'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])
        }
        df = pd.DataFrame(d)

        # Write the DataFrame: allocate a Plasma buffer sized to the
        # serialized record batch and stream the batch into it.
        record_batch = pa.RecordBatch.from_pandas(df)
        data_size = pa.get_record_batch_size(record_batch)
        object_id = plasma.ObjectID(np.random.bytes(20))

        buf = self.plasma_client.create(object_id, data_size)
        stream = plasma.FixedSizeBufferOutputStream(buf)
        stream_writer = pa.RecordBatchStreamWriter(stream, record_batch.schema)
        stream_writer.write_batch(record_batch)

        # Seal the object so other Plasma clients can read it.
        self.plasma_client.seal(object_id)

        # Read the DataFrame back from the store and check the roundtrip.
        [data] = self.plasma_client.get([object_id])
        reader = pa.RecordBatchStreamReader(pa.BufferReader(data))
        result = reader.read_next_batch().to_pandas()

        pd.util.testing.assert_frame_equal(df, result)
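
The Plasma pieces aside, the write/read portion of this test is plain Arrow IPC. A minimal non-Plasma sketch of the same roundtrip, assuming record_batch and df as built above and the pa.ipc stream helpers of recent pyarrow:

sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, record_batch.schema) as writer:
    writer.write_batch(record_batch)

# open_stream accepts the in-memory Buffer returned by getvalue() directly.
result = pa.ipc.open_stream(sink.getvalue()).read_next_batch().to_pandas()
assert result.equals(df)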
Example 3
    def write_recordbatchfile(self):
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, self.pa_schema)

        # Append batches until the accumulated serialized size reaches
        # maxfilesize (compared in MiB) or checkcount() signals a stop.
        batches_size = 0
        while (batches_size // 1024**2) < self.maxfilesize:
            batch = self.write_batch_arrow()
            batches_size += pa.get_record_batch_size(batch)
            writer.write_batch(batch)
            if self.checkcount():
                break

        # Close the writer to finalize the file footer, then return the
        # in-memory Buffer holding the complete Arrow file.
        writer.close()
        buf = sink.getvalue()
        return buf
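
For completeness, the Buffer returned by write_recordbatchfile can be read back with the Arrow file reader; a minimal sketch, assuming buf is the value returned above:

# Open the in-memory Arrow file and materialize every batch into a Table.
reader = pa.ipc.open_file(buf)
table = reader.read_all()
print(reader.num_record_batches, table.num_rows)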