def test_convert_from_binary():
    """Round-trip a TEXT column and a BINARY column through ColumnConverter.

    Builds one string column and one bytes column (each value randomly None),
    wraps them in a RecordBatch, and asserts the converter returns the
    original Python value for every row.
    """
    column_foo = ("foo", "TEXT", None, None, 1000, 0, True)
    column_bar = ("bar", "BINARY", None, None, 1000, 0, True)
    column_metas = [column_foo, column_bar]
    expected_val = []
    array_len = 1000

    # TEXT column: random strings, ~half None.
    string_val = []
    for _ in range(array_len):
        data = None if bool(random.getrandbits(1)) else generate_random_string()
        string_val.append(data)
    expected_val.append(string_val)

    # BINARY column: random utf-8 encoded bytes, ~half None.
    binary_val = []
    for _ in range(array_len):
        data = (None if bool(random.getrandbits(1))
                else generate_random_string().encode('utf-8'))
        binary_val.append(data)
    # BUG FIX: previously `string_val` was appended a second time here and
    # `binary_val` was never used, so the BINARY path was never exercised.
    expected_val.append(binary_val)

    rb = RecordBatch.from_arrays(
        [pyarrow.array(expected_val[0]), pyarrow.array(expected_val[1])],
        ['col_foo', 'col_bar'])

    for i, col_array in enumerate(rb):
        converter = ColumnConverter(col_array, column_metas[i])
        for j in range(array_len):
            py_val = converter.to_python_native(j)
            assert py_val == expected_val[i][j]
def test_iterate_over_int64_chunk():
    """Stream 10 batches x 10 rows of nullable int64 data through
    PyArrowChunkIterator and verify every row comes back unchanged.
    """
    stream = BytesIO()
    field_foo = pyarrow.field("column_foo", pyarrow.int64(), True)
    field_bar = pyarrow.field("column_bar", pyarrow.int64(), True)
    schema = pyarrow.schema([field_foo, field_bar])
    # column_meta is unused by this test but kept to mirror the server-side
    # metadata shape used elsewhere in this file.
    column_meta = [("column_foo", "FIXED", None, 0, 0, 0, 0),
                   ("column_bar", "FIXED", None, 0, 0, 0, 0)]

    column_size = 2
    batch_row_count = 10
    batch_count = 10

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            for k in range(batch_row_count):
                data = None if bool(random.getrandbits(1)) else random.randint(
                    -100, 100)
                column_data.append(data)
            column_arrays.append(column_data)
            # BUG FIX: pin the arrow type explicitly. Without it, a batch
            # column that happens to be all-None infers the `null` type and
            # mismatches the declared int64 stream schema, making the test
            # randomly flaky.
            py_arrays.append(pyarrow.array(column_data, type=pyarrow.int64()))
        expected_data.append(column_arrays)
        rb = RecordBatch.from_arrays(py_arrays, ["column_foo", "column_bar"])
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)
    reader = RecordBatchStreamReader(stream)

    it = PyArrowChunkIterator()
    for rb in reader:
        it.add_record_batch(rb)

    count = 0
    while True:
        try:
            val = next(it)
            assert val[0] == expected_data[int(count / 10)][0][count % 10]
            assert val[1] == expected_data[int(count / 10)][1][count % 10]
            count += 1
        except StopIteration:
            # 10 batches x 10 rows must have been yielded.
            assert count == 100
            break
def generate_data(pyarrow_type, column_meta, source_data_generator,
                  batch_count, batch_row_count):
    """Serialize random record batches into an in-memory Arrow stream.

    Writes `batch_count` batches of `batch_row_count` rows, one column per
    entry in `pyarrow_type`/`column_meta`, where each cell is randomly None
    or a value from `source_data_generator`. Every column is regenerated
    until it holds at least one non-None value (an all-None column would not
    exercise the conversion path). Returns the stream rewound to the start
    plus the nested expected values: expected_data[batch][column][row].
    """
    assert len(pyarrow_type) == len(column_meta)
    num_cols = len(pyarrow_type)

    buf = BytesIO()
    fields = [
        pyarrow.field("column_{}".format(c), pyarrow_type[c], True,
                      column_meta[c])
        for c in range(num_cols)
    ]
    writer = RecordBatchStreamWriter(buf, pyarrow.schema(fields))

    expected_data = []
    for _ in range(batch_count):
        batch_columns = []
        arrow_arrays = []
        for c in range(num_cols):
            values = []
            # Regenerate until the column contains at least one real value.
            while not any(v is not None for v in values):
                values = []
                for _ in range(batch_row_count):
                    cell = (None if bool(random.getrandbits(1))
                            else source_data_generator())
                    values.append(cell)
            batch_columns.append(values)
            arrow_arrays.append(pyarrow.array(values, type=pyarrow_type[c]))
        expected_data.append(batch_columns)
        names = ["column_{}".format(c) for c in range(num_cols)]
        writer.write_batch(RecordBatch.from_arrays(arrow_arrays, names))

    writer.close()
    # Rewind so callers can immediately read what was just written.
    buf.seek(0)
    return buf, expected_data
def test_convert_from_fixed():
    """FixedColumnConverter must round-trip a nullable int column exactly."""
    meta_foo = ("foo", "FIXED", None, None, 1000, 0, True)
    array_len = 1000

    # Random ints in [-1000, 1000], roughly half replaced with None.
    expected_val = [
        None if bool(random.getrandbits(1)) else random.randint(-1000, 1000)
        for _ in range(array_len)
    ]

    batch = RecordBatch.from_arrays([pyarrow.array(expected_val)],
                                    ['column_foo'])
    for col_array in batch:
        converter = FixedColumnConverter(col_array, meta_foo)
        for idx, want in enumerate(expected_val):
            assert converter.to_python_native(idx) == want
def iterate_over_test_chunk(pyarrow_type, column_meta, source_data_generator,
                            expected_data_transformer=None):
    """Round-trip randomly generated batches through PyArrowIterator.

    Serializes 9 batches of 10 rows into an Arrow stream, then drives
    PyArrowIterator over it and asserts each yielded row matches the data
    that was written. Columns are regenerated until they contain at least
    one non-None cell. When `expected_data_transformer` is given, it is
    applied to every non-None expected value (the raw values still go into
    the stream) — used when the iterator is expected to convert types.
    """
    assert len(pyarrow_type) == len(column_meta)
    num_cols = len(pyarrow_type)
    rows_per_batch = 10
    num_batches = 9

    buf = BytesIO()
    schema = pyarrow.schema([
        pyarrow.field("column_{}".format(c), pyarrow_type[c], True,
                      column_meta[c])
        for c in range(num_cols)
    ])
    writer = RecordBatchStreamWriter(buf, schema)

    expected_data = []
    for _ in range(num_batches):
        batch_cols = []
        arrow_cols = []
        for c in range(num_cols):
            values = []
            # Keep rolling until at least one cell is non-None.
            while not any(v is not None for v in values):
                values = []
                for _ in range(rows_per_batch):
                    cell = (None if bool(random.getrandbits(1))
                            else source_data_generator())
                    values.append(cell)
            batch_cols.append(values)
            arrow_cols.append(pyarrow.array(values, type=pyarrow_type[c]))

        if expected_data_transformer:
            # Transform expected values only; the stream keeps the raw data.
            batch_cols = [
                [expected_data_transformer(v) if v is not None else None
                 for v in col]
                for col in batch_cols
            ]
        expected_data.append(batch_cols)

        names = ["column_{}".format(c) for c in range(num_cols)]
        writer.write_batch(RecordBatch.from_arrays(arrow_cols, names))

    writer.close()
    # Rewind so the iterator reads from the start of the stream.
    buf.seek(0)

    context = ArrowConverterContext()
    it = PyArrowIterator(None, buf, context, False, False)
    it.init(ROW_UNIT)

    total = 0
    while True:
        try:
            row = next(it)
        except StopIteration:
            assert total == num_batches * rows_per_batch
            break
        batch_idx, row_idx = divmod(total, rows_per_batch)
        for c in range(num_cols):
            assert row[c] == expected_data[batch_idx][c][row_idx]
        total += 1
def test_iterate_over_decimal_chunk():
    """Stream decimal128 columns through PyArrowChunkIterator and verify
    both value and exact Python type of every yielded cell.

    Column 0 is Decimal(10, 3) (expected back as decimal.Decimal); column 1
    is Decimal(38, 0) (expected back as int). None cells stay None.
    """
    # TODO: add more test cases to cover as much code as possible,
    # e.g. Decimal(19, 0) for Int64, Decimal(9, 0) for Int32,
    # Decimal(4, 0) for Int16, Decimal(2, 0) for Int8.
    def get_random_decimal(precision, scale):
        # Build `precision` random non-zero digits, then place the decimal
        # point `scale` digits from the right (scale == 0 => integral value).
        digits = []
        for _ in range(precision):
            digits.append(str(random.randint(1, 9)))
        if scale:
            digits.insert(-scale, '.')
        return decimal.Decimal("".join(digits))

    stream = BytesIO()
    column_meta = [
        {"logicalType": "FIXED", "precision": "10", "scale": "3"},
        {"logicalType": "FIXED", "precision": "38", "scale": "0"},
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.decimal128(10, 3), True,
                              column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.decimal128(38, 0), True,
                              column_meta[1])
    schema = pyarrow.schema([field_foo, field_bar])

    column_size = 2
    batch_row_count = 10
    batch_count = 10

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
            # Regenerate until at least one non-None value is present.
            while not_none_cnt == 0:
                column_data = []
                for k in range(batch_row_count):
                    data = (None if bool(random.getrandbits(1))
                            else get_random_decimal(10 if j % 2 == 0 else 38,
                                                    3 if j % 2 == 0 else 0))
                    # BUG FIX: was `data != None` — use identity comparison
                    # for None per PEP 8.
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            # Scale-0 decimals come back from the iterator as Python ints,
            # so the expected values for the second column are converted.
            if j % 2 == 0:
                column_arrays.append(column_data)
            else:
                column_arrays.append(
                    [int(d) if d is not None else None for d in column_data])
            # BUG FIX: pin the arrow type so per-batch inferred decimal
            # precision can never disagree with the declared stream schema.
            py_arrays.append(pyarrow.array(
                column_data,
                type=field_foo.type if j % 2 == 0 else field_bar.type))
        expected_data.append(column_arrays)
        rb = RecordBatch.from_arrays(py_arrays, ["column_foo", "column_bar"])
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)
    reader = RecordBatchStreamReader(stream)
    context = ArrowConverterContext()
    it = PyArrowChunkIterator(reader, context)

    count = 0
    while True:
        try:
            val = next(it)
            assert val[0] == expected_data[int(count / 10)][0][count % 10]
            # Exact type must match: Decimal or NoneType.
            assert type(val[0]) == type(
                expected_data[int(count / 10)][0][count % 10])
            assert val[1] == expected_data[int(count / 10)][1][count % 10]
            # Exact type must match: int or NoneType.
            assert type(val[1]) == type(
                expected_data[int(count / 10)][1][count % 10])
            count += 1
        except StopIteration:
            assert count == 100
            break