def test_get_record_batch_size():
    """The serialized size of a record batch must exceed its raw payload.

    A batch of ``num_rows`` float64 values occupies ``num_rows * 8`` bytes of
    data; the serialized form additionally carries schema/metadata, so
    ``get_record_batch_size`` should report strictly more than the raw bytes.
    """
    num_rows = 10
    bytes_per_value = 8  # float64
    frame = pd.DataFrame({'foo': np.random.randn(num_rows)})
    record_batch = pa.RecordBatch.from_pandas(frame)
    assert pa.get_record_batch_size(record_batch) > (num_rows * bytes_per_value)
def test_get_record_batch_size_duplicate():
    """Verify get_record_batch_size exceeds the raw data payload size.

    NOTE(review): the original file defined two functions with the identical
    name ``test_get_record_batch_size``; the second definition silently
    shadowed the first at module level, so pytest collected and ran only one
    of them. Renamed so both tests are discovered and executed.
    """
    N = 10
    itemsize = 8  # bytes per float64 element
    df = pd.DataFrame({'foo': np.random.randn(N)})
    batch = pa.RecordBatch.from_pandas(df)
    # Serialized size includes schema/metadata, so it must exceed N * itemsize.
    assert pa.get_record_batch_size(batch) > (N * itemsize)
def test_store_pandas_dataframe(self):
    """Round-trip a pandas DataFrame through the plasma object store.

    Serializes the frame as an Arrow record batch into a fixed-size plasma
    buffer, seals it, reads it back, and checks the frames are equal.
    """
    series_map = {
        'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd']),
    }
    frame = pd.DataFrame(series_map)

    # Write the DataFrame into the store.
    batch = pa.RecordBatch.from_pandas(frame)
    nbytes = pa.get_record_batch_size(batch)
    oid = plasma.ObjectID(np.random.bytes(20))
    plasma_buf = self.plasma_client.create(oid, nbytes)
    sink = plasma.FixedSizeBufferOutputStream(plasma_buf)
    stream_writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    stream_writer.write_batch(batch)
    self.plasma_client.seal(oid)

    # Read it back and verify the round trip preserved the data.
    [payload] = self.plasma_client.get([oid])
    stream_reader = pa.RecordBatchStreamReader(pa.BufferReader(payload))
    round_tripped = stream_reader.read_next_batch().to_pandas()
    pd.util.testing.assert_frame_equal(frame, round_tripped)
def write_recordbatchfile(self):
    """Serialize record batches into an Arrow file format buffer.

    Batches produced by ``self.write_batch_arrow()`` are appended until their
    accumulated serialized size (in MiB) reaches ``self.maxfilesize`` or
    ``self.checkcount()`` signals an early stop (semantics defined elsewhere
    in the class — presumably a record-count limit; confirm at definition).

    Returns the in-memory buffer containing the finished Arrow file.
    """
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, self.pa_schema)
    accumulated = 0
    # Integer-divide by 1024**2 to compare accumulated bytes against the
    # MiB-denominated maxfilesize threshold.
    while (accumulated // 1024**2) < self.maxfilesize:
        batch = self.write_batch_arrow()
        accumulated += pa.get_record_batch_size(batch)
        writer.write_batch(batch)
        if self.checkcount():
            break
    writer.close()
    return sink.getvalue()