def store_dataframe(self, data):
    # Serialize the DataFrame to a single Arrow record batch.
    record_batch = RecordBatch.from_pandas(data)

    # Generate a random object ID for the plasma store entry.
    object_id = plasma.ObjectID(np.random.bytes(20))

    # First pass: write to a mock sink only to measure the serialized size.
    mock_sink = MockOutputStream()
    with RecordBatchStreamWriter(mock_sink, record_batch.schema) as stream_writer:
        stream_writer.write_batch(record_batch)
    data_size = mock_sink.size()

    # Second pass: allocate a plasma buffer of the measured size and write into it.
    buf = self.client.create(object_id, data_size)
    stream = FixedSizeBufferWriter(buf)
    with RecordBatchStreamWriter(stream, record_batch.schema) as stream_writer:
        stream_writer.write_batch(record_batch)

    # Seal the object so other plasma clients can read it.
    self.client.seal(object_id)
    return object_id
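
# For symmetry, a minimal retrieval sketch assuming the same `self.client`
# plasma connection used above. `retrieve_dataframe` is a hypothetical helper,
# not part of the original code; it only relies on plasma's `get_buffers`
# call and the Arrow stream reader.
def retrieve_dataframe(self, object_id):
    # Fetch the sealed buffer from the plasma store.
    [buf] = self.client.get_buffers([object_id])
    # Read the single record batch back and convert it to a pandas DataFrame.
    reader = RecordBatchStreamReader(buf)
    return reader.read_all().to_pandas()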
def test_iterate_over_int64_chunk():
    stream = BytesIO()
    field_foo = pyarrow.field("column_foo", pyarrow.int64(), True)
    field_bar = pyarrow.field("column_bar", pyarrow.int64(), True)
    schema = pyarrow.schema([field_foo, field_bar])
    column_meta = [
        ("column_foo", "FIXED", None, 0, 0, 0, 0),
        ("column_bar", "FIXED", None, 0, 0, 0, 0),
    ]

    column_size = 2
    batch_row_count = 10
    batch_count = 10

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            for k in range(batch_row_count):
                # Randomly interleave NULLs with small random integers.
                data = None if bool(random.getrandbits(1)) else random.randint(-100, 100)
                column_data.append(data)
            column_arrays.append(column_data)
            py_arrays.append(pyarrow.array(column_data))
        expected_data.append(column_arrays)

        rb = RecordBatch.from_arrays(py_arrays, ["column_foo", "column_bar"])
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)

    reader = RecordBatchStreamReader(stream)
    it = PyArrowChunkIterator()
    for rb in reader:
        it.add_record_batch(rb)

    count = 0
    while True:
        try:
            val = next(it)
            assert val[0] == expected_data[int(count / 10)][0][count % 10]
            assert val[1] == expected_data[int(count / 10)][1][count % 10]
            count += 1
        except StopIteration:
            assert count == 100
            break
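
# For reference, a pure-pyarrow sketch of the row order the chunk iterator
# is expected to produce: walk the stream batch by batch and yield one tuple
# per row. This is only an illustration, not part of the connector API.
def _iter_rows(stream):
    stream.seek(0)
    reader = RecordBatchStreamReader(stream)
    for rb in reader:
        # Convert each column to a Python list and zip them into row tuples.
        columns = [col.to_pylist() for col in rb.columns]
        for row in zip(*columns):
            yield row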
def generate_data(pyarrow_type, column_meta, source_data_generator, batch_count, batch_row_count):
    stream = BytesIO()
    assert len(pyarrow_type) == len(column_meta)

    column_size = len(pyarrow_type)
    fields = []
    for i in range(column_size):
        fields.append(pyarrow.field("column_{}".format(i), pyarrow_type[i], True, column_meta[i]))
    schema = pyarrow.schema(fields)

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
            # Regenerate the column until it contains at least one non-NULL value.
            while not_none_cnt == 0:
                column_data = []
                for _ in range(batch_row_count):
                    data = None if bool(random.getrandbits(1)) else source_data_generator()
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            column_arrays.append(column_data)
            py_arrays.append(pyarrow.array(column_data, type=pyarrow_type[j]))
        expected_data.append(column_arrays)

        column_names = ["column_{}".format(i) for i in range(column_size)]
        rb = RecordBatch.from_arrays(py_arrays, column_names)
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)
    return stream, expected_data
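
# Hypothetical usage sketch for generate_data: build two nullable int64
# columns and check that the stream round-trips through a plain pyarrow
# reader. The metadata values are placeholders in the FIXED style used by
# the other tests here, not taken from the original file.
def _example_generate_data_roundtrip():
    stream, expected = generate_data(
        pyarrow_type=[pyarrow.int64(), pyarrow.int64()],
        column_meta=[{"logicalType": "FIXED"}, {"logicalType": "FIXED"}],
        source_data_generator=lambda: random.randint(-100, 100),
        batch_count=3,
        batch_row_count=5,
    )
    table = RecordBatchStreamReader(stream).read_all()
    assert table.num_rows == 3 * 5
    assert len(expected) == 3  # one list of per-column values per batch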
def iterate_over_test_chunk(pyarrow_type, column_meta, source_data_generator, expected_data_transformer=None):
    stream = BytesIO()
    assert len(pyarrow_type) == len(column_meta)

    column_size = len(pyarrow_type)
    batch_row_count = 10
    batch_count = 9

    fields = []
    for i in range(column_size):
        fields.append(pyarrow.field("column_{}".format(i), pyarrow_type[i], True, column_meta[i]))
    schema = pyarrow.schema(fields)

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
            # Regenerate the column until it contains at least one non-NULL value.
            while not_none_cnt == 0:
                column_data = []
                for _ in range(batch_row_count):
                    data = None if bool(random.getrandbits(1)) else source_data_generator()
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            column_arrays.append(column_data)
            py_arrays.append(pyarrow.array(column_data, type=pyarrow_type[j]))

        # Optionally transform the expected values (e.g. Decimal -> int).
        if expected_data_transformer:
            for i in range(len(column_arrays)):
                column_arrays[i] = [
                    expected_data_transformer(_data) if _data is not None else None
                    for _data in column_arrays[i]
                ]
        expected_data.append(column_arrays)

        column_names = ["column_{}".format(i) for i in range(column_size)]
        rb = RecordBatch.from_arrays(py_arrays, column_names)
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)

    context = ArrowConverterContext()
    it = PyArrowIterator(None, stream, context, False, False)
    it.init(ROW_UNIT)

    count = 0
    while True:
        try:
            val = next(it)
            for i in range(column_size):
                batch_index = int(count / batch_row_count)
                assert val[i] == expected_data[batch_index][i][count - batch_row_count * batch_index]
            count += 1
        except StopIteration:
            assert count == (batch_count * batch_row_count)
            break
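
# Hypothetical driver sketch showing how iterate_over_test_chunk could be
# invoked for plain int64 columns; the FIXED metadata mirrors the decimal
# test below and is an assumption, not taken from the original file.
def test_iterate_over_int64_via_helper():
    column_meta = {"logicalType": "FIXED", "precision": "38", "scale": "0"}
    iterate_over_test_chunk(
        [pyarrow.int64(), pyarrow.int64()],
        [column_meta, column_meta],
        lambda: random.randint(-100, 100),
    )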
def test_iterate_over_decimal_chunk():
    # TODO: add more test cases to cover as much code as possible,
    # e.g. Decimal(19, 0) for Int64, Decimal(9, 0) for Int32,
    #      Decimal(4, 0) for Int16, Decimal(2, 0) for Int8
    def get_random_decimal(precision, scale):
        data = []
        for i in range(precision):
            data.append(str(random.randint(1, 9)))
        if scale:
            data.insert(-scale, '.')
        return decimal.Decimal("".join(data))

    stream = BytesIO()
    column_meta = [
        {"logicalType": "FIXED", "precision": "10", "scale": "3"},
        {"logicalType": "FIXED", "precision": "38", "scale": "0"},
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.decimal128(10, 3), True, column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.decimal128(38, 0), True, column_meta[1])
    schema = pyarrow.schema([field_foo, field_bar])

    column_size = 2
    batch_row_count = 10
    batch_count = 10

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
            # Regenerate the column until it contains at least one non-NULL value.
            while not_none_cnt == 0:
                column_data = []
                for k in range(batch_row_count):
                    data = None if bool(random.getrandbits(1)) else get_random_decimal(
                        10 if j % 2 == 0 else 38, 3 if j % 2 == 0 else 0)
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            # Scale-0 decimals (column_bar) are expected back as Python ints.
            column_arrays.append(
                column_data if j % 2 == 0
                else [int(data) if data is not None else None for data in column_data])
            py_arrays.append(pyarrow.array(column_data))
        expected_data.append(column_arrays)

        rb = RecordBatch.from_arrays(py_arrays, ["column_foo", "column_bar"])
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)

    reader = RecordBatchStreamReader(stream)
    context = ArrowConverterContext()
    it = PyArrowChunkIterator(reader, context)

    count = 0
    while True:
        try:
            val = next(it)
            assert val[0] == expected_data[int(count / 10)][0][count % 10]
            assert type(val[0]) == type(expected_data[int(count / 10)][0][count % 10])  # Decimal type or NoneType
            assert val[1] == expected_data[int(count / 10)][1][count % 10]
            assert type(val[1]) == type(expected_data[int(count / 10)][1][count % 10])  # Int type or NoneType
            count += 1
        except StopIteration:
            assert count == 100
            break
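
# A hedged sketch of the extension hinted at in the TODO above: reuse
# iterate_over_test_chunk for scale-0 decimals of several precisions,
# transforming expectations to int on the assumption that scale-0 FIXED
# columns come back as Python ints (as column_bar does in the test above).
# The precision list and generator are illustrative choices, not from the
# original file.
def test_iterate_over_scale_zero_decimal_chunks():
    for precision in (2, 4, 9, 19):
        column_meta = {"logicalType": "FIXED", "precision": str(precision), "scale": "0"}
        iterate_over_test_chunk(
            [pyarrow.decimal128(precision, 0)],
            [column_meta],
            lambda: decimal.Decimal(random.randint(1, 10 ** precision - 1)),
            expected_data_transformer=int,
        )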