Example #1
    def store_dataframe(self, data):
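        # Convert the pandas DataFrame into a single Arrow record batch and
        # draw a random 20-byte Plasma object ID for it.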
        record_batch = RecordBatch.from_pandas(data)
        object_id = plasma.ObjectID(np.random.bytes(20))
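        # First pass: write the batch to a MockOutputStream, which only
        # counts bytes, to learn the exact size of the IPC payload.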
        mock_sink = MockOutputStream()
        with RecordBatchStreamWriter(mock_sink,
                                     record_batch.schema) as stream_writer:
            stream_writer.write_batch(record_batch)
        data_size = mock_sink.size()
        buf = self.client.create(object_id, data_size)

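        # Second pass: write the same batch into the fixed-size Plasma
        # buffer, then seal the object so other clients can read it.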
        stream = FixedSizeBufferWriter(buf)
        with RecordBatchStreamWriter(stream,
                                     record_batch.schema) as stream_writer:
            stream_writer.write_batch(record_batch)
        self.client.seal(object_id)
        return object_id
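
For context, here is a minimal sketch of the matching read path, assuming the same IPC framing as above and an already connected Plasma client. Note that pyarrow.plasma was deprecated and later removed from pyarrow, so this only applies to releases that still ship it; fetch_dataframe is a hypothetical helper, not part of any library.

import pyarrow as pa


def fetch_dataframe(client, object_id):
    # client: a connected PlasmaClient, e.g. from
    # pyarrow.plasma.connect("/tmp/plasma") on pyarrow versions that ship it.
    # Look up the sealed buffer and replay the IPC stream into a DataFrame.
    [buf] = client.get_buffers([object_id])
    reader = pa.RecordBatchStreamReader(pa.BufferReader(buf))
    return reader.read_all().to_pandas()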
Example #2
def test_iterate_over_int64_chunk():
    stream = BytesIO()
    field_foo = pyarrow.field("column_foo", pyarrow.int64(), True)
    field_bar = pyarrow.field("column_bar", pyarrow.int64(), True)
    schema = pyarrow.schema([field_foo, field_bar])
    column_meta = [("column_foo", "FIXED", None, 0, 0, 0, 0),
                   ("column_bar", "FIXED", None, 0, 0, 0, 0)]

    column_size = 2
    batch_row_count = 10
    batch_count = 10
    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

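    # Build ten batches of two nullable int64 columns; each value is NULL
    # with probability 1/2.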
    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            for k in range(batch_row_count):
                data = (None if bool(random.getrandbits(1)) else
                        random.randint(-100, 100))
                column_data.append(data)
            column_arrays.append(column_data)
            py_arrays.append(pyarrow.array(column_data))

        expected_data.append(column_arrays)
        rb = RecordBatch.from_arrays(py_arrays, ["column_foo", "column_bar"])
        writer.write_batch(rb)

    writer.close()

    # seek the stream to the beginning so that we can read from it
    stream.seek(0)
    reader = RecordBatchStreamReader(stream)
    it = PyArrowChunkIterator()
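    # Feed every record batch from the stream into the chunk iterator.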
    for rb in reader:
        it.add_record_batch(rb)

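    # Row `count` lives in batch count // 10 at offset count % 10.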
    count = 0
    while True:
        try:
            val = next(it)
            assert val[0] == expected_data[count // 10][0][count % 10]
            assert val[1] == expected_data[count // 10][1][count % 10]
            count += 1
        except StopIteration:
            assert count == 100
            break
Example #3
def generate_data(pyarrow_type, column_meta, source_data_generator,
                  batch_count, batch_row_count):
    stream = BytesIO()

    assert len(pyarrow_type) == len(column_meta)

    column_size = len(pyarrow_type)
    fields = []
    for i in range(column_size):
        fields.append(
            pyarrow.field("column_{}".format(i), pyarrow_type[i], True,
                          column_meta[i]))
    schema = pyarrow.schema(fields)

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
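            # Regenerate until the column holds at least one non-NULL value,
            # so no column is entirely NULL.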
            while not_none_cnt == 0:
                column_data = []
                for _ in range(batch_row_count):
                    data = (None if bool(random.getrandbits(1)) else
                            source_data_generator())
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            column_arrays.append(column_data)
            py_arrays.append(pyarrow.array(column_data, type=pyarrow_type[j]))

        expected_data.append(column_arrays)

        column_names = ["column_{}".format(i) for i in range(column_size)]
        rb = RecordBatch.from_arrays(py_arrays, column_names)
        writer.write_batch(rb)

    writer.close()

    # seek the stream to the beginning so that we can read from it
    stream.seek(0)

    return stream, expected_data
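
A hedged usage sketch of the helper above; the metadata keys mirror Example #5 below, and the int64 generator and parameter values are assumptions for illustration, not part of the original tests.

import random

import pyarrow

stream, expected_data = generate_data(
    pyarrow_type=[pyarrow.int64(), pyarrow.int64()],
    column_meta=[{"logicalType": "FIXED"}, {"logicalType": "FIXED"}],
    source_data_generator=lambda: random.randint(-100, 100),
    batch_count=3,
    batch_row_count=10)
# The returned stream is already rewound, ready for RecordBatchStreamReader.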
Example #4
def iterate_over_test_chunk(pyarrow_type,
                            column_meta,
                            source_data_generator,
                            expected_data_transformer=None):
    stream = BytesIO()

    assert len(pyarrow_type) == len(column_meta)

    column_size = len(pyarrow_type)
    batch_row_count = 10
    batch_count = 9

    fields = []
    for i in range(column_size):
        fields.append(
            pyarrow.field("column_{}".format(i), pyarrow_type[i], True,
                          column_meta[i]))
    schema = pyarrow.schema(fields)

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
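            # Regenerate until the column holds at least one non-NULL value,
            # so no column is entirely NULL.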
            while not_none_cnt == 0:
                column_data = []
                for _ in range(batch_row_count):
                    data = (None if bool(random.getrandbits(1)) else
                            source_data_generator())
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            column_arrays.append(column_data)
            py_arrays.append(pyarrow.array(column_data, type=pyarrow_type[j]))

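        # Optionally map the expected values through the caller-supplied
        # transformer, leaving NULLs untouched.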
        if expected_data_transformer:
            for col_idx in range(len(column_arrays)):
                column_arrays[col_idx] = [
                    expected_data_transformer(_data)
                    if _data is not None else None
                    for _data in column_arrays[col_idx]
                ]
        expected_data.append(column_arrays)

        column_names = ["column_{}".format(i) for i in range(column_size)]
        rb = RecordBatch.from_arrays(py_arrays, column_names)
        writer.write_batch(rb)

    writer.close()

    # seek the stream to the beginning so that we can read from it
    stream.seek(0)
    context = ArrowConverterContext()
    it = PyArrowIterator(None, stream, context, False, False)
    it.init(ROW_UNIT)

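    # Verify row by row; expected_data is indexed as [batch][column][row].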
    count = 0
    while True:
        try:
            val = next(it)
            for i in range(column_size):
                batch_index = count // batch_row_count
                row_index = count % batch_row_count
                assert val[i] == expected_data[batch_index][i][row_index]
            count += 1
        except StopIteration:
            assert count == (batch_count * batch_row_count)
            break
Example #5
def test_iterate_over_decimal_chunk():
    # TODO: add more test cases to cover as much code as possible,
    #       e.g. Decimal(19, 0) for Int64, Decimal(9, 0) for Int32,
    #       Decimal(4, 0) for Int16, Decimal(2, 0) for Int8

    def get_random_decimal(precision, scale):
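        # Build `precision` random digits (1-9, so no leading zero) and place
        # the decimal point `scale` digits from the right.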
        data = []
        for i in range(precision):
            data.append(str(random.randint(1, 9)))

        if scale:
            data.insert(-scale, '.')
        return decimal.Decimal("".join(data))

    stream = BytesIO()
    column_meta = [
        {"logicalType": "FIXED", "precision": "10", "scale": "3"},
        {"logicalType": "FIXED", "precision": "38", "scale": "0"},
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.decimal128(10, 3), True,
                              column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.decimal128(38, 0), True,
                              column_meta[1])
    schema = pyarrow.schema([field_foo, field_bar])

    column_size = 2
    batch_row_count = 10
    batch_count = 10
    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

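    # column_foo carries Decimal(10, 3) values; column_bar carries
    # Decimal(38, 0) values, which the iterator returns as Python ints.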
    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
            while not_none_cnt == 0:
                column_data = []
                for k in range(batch_row_count):
                    data = (None if bool(random.getrandbits(1)) else
                            get_random_decimal(10 if j % 2 == 0 else 38,
                                               3 if j % 2 == 0 else 0))
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            # column_bar has scale 0 and is read back as Python int, so the
            # expected values for the odd columns are converted up front.
            column_arrays.append(
                column_data if j % 2 == 0 else
                [int(data) if data is not None else None
                 for data in column_data])
            py_arrays.append(pyarrow.array(column_data))

        expected_data.append(column_arrays)
        rb = RecordBatch.from_arrays(py_arrays, ["column_foo", "column_bar"])
        writer.write_batch(rb)

    writer.close()

    # seek the stream to the beginning so that we can read from it
    stream.seek(0)
    reader = RecordBatchStreamReader(stream)
    context = ArrowConverterContext()
    it = PyArrowChunkIterator(reader, context)

    count = 0
    while True:
        try:
            val = next(it)
            assert val[0] == expected_data[count // 10][0][count % 10]
            assert type(val[0]) == type(expected_data[count // 10][0][count % 10])  # Decimal or NoneType
            assert val[1] == expected_data[count // 10][1][count % 10]
            assert type(val[1]) == type(expected_data[count // 10][1][count % 10])  # int or NoneType
            count += 1
        except StopIteration:
            assert count == 100
            break