Example #1
def time_write_binary_table_uncompressed(self):
    out = pa.BufferOutputStream()
    pq.write_table(self.table, out, compression='none')
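The write benchmarks in this listing (Examples #1, #13, #14 and #30) assume a prebuilt self.table / self.table_df. A minimal, hypothetical setup sketch (class name, column names and sizes are illustrative only):

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

class WriteBinaryTableBench:
    # Hypothetical asv-style fixture; the real suite builds the table elsewhere.
    def setup(self):
        self.table_df = pd.DataFrame(
            {f"c{i}": np.random.randn(10_000) for i in range(8)})
        self.table = pa.table(self.table_df)

    def time_write_binary_table_uncompressed(self):
        out = pa.BufferOutputStream()
        pq.write_table(self.table, out, compression='none')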
Example #2
def _serialize_pyarrow_table(table):
    output_stream = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(output_stream, schema=table.schema) as wr:
        wr.write_table(table)
    return output_stream.getvalue()  # This will also close the stream.
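For reference, a minimal sketch of the matching read path (hypothetical helper name), reading the buffer produced by _serialize_pyarrow_table back into a table:

import pyarrow as pa

def _deserialize_pyarrow_table(buf):
    # The buffer holds an Arrow IPC stream; read it back into a pa.Table.
    reader = pa.ipc.open_stream(pa.BufferReader(buf))
    return reader.read_all()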
Example #3
def _simple_table_write_read(table, use_legacy_dataset):
    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()
    return pq.read_table(pa.BufferReader(contents),
                         use_legacy_dataset=use_legacy_dataset)
Example #4
def _schema2bytes(schema: SchemaWrapper) -> bytes:
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf, version="2.0", coerce_timestamps="us")
    return buf.getvalue().to_pybytes()
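A small sketch of the inverse operation (hypothetical helper name), recovering the Arrow schema from the serialized Parquet metadata produced by _schema2bytes:

import pyarrow as pa
import pyarrow.parquet as pq

def _bytes2schema(data: bytes) -> pa.Schema:
    # pq.read_schema() parses the Parquet metadata and returns a pyarrow.Schema.
    return pq.read_schema(pa.BufferReader(data))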
Example #5
def test_empty_file():
    buf = b''
    with pytest.raises(pa.ArrowInvalid):
        pa.open_file(pa.BufferReader(buf))


def test_file_simple_roundtrip(file_fixture):
    file_fixture._check_roundtrip(as_table=False)


def test_file_write_table(file_fixture):
    file_fixture._check_roundtrip(as_table=True)


@pytest.mark.parametrize(
    "sink_factory", [lambda: io.BytesIO(), lambda: pa.BufferOutputStream()])
def test_file_read_all(sink_factory):
    fixture = FileFormatFixture(sink_factory)

    _, batches = fixture.write_batches()
    file_contents = pa.BufferReader(fixture.get_source())

    reader = pa.open_file(file_contents)

    result = reader.read_all()
    expected = pa.Table.from_batches(batches)
    assert result.equals(expected)


def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
Example #6
def make_serialized(schema, batches):
    with pa.BufferOutputStream() as sink:
        with pa.ipc.new_stream(sink, schema) as out:
            for batch in batches:
                out.write(batch)
        return sink.getvalue()
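The inverse of make_serialized, sketched here with a hypothetical name, recovers the schema and the batches in write order:

import pyarrow as pa

def read_serialized(buf):
    # pa.ipc.open_stream() accepts the buffer directly; iterating the reader
    # yields the record batches one at a time.
    reader = pa.ipc.open_stream(buf)
    return reader.schema, list(reader)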
Example #7
def open_output_stream(self, path, metadata):
    if "notfound" in path:
        raise FileNotFoundError(path)
    return pa.BufferOutputStream()
Example #8
    def test_validation(self):
        print("Simulate production")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            # Following puts the menu and config to the datastore
            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)

            # Multiple streams
            store.new_partition(dataset.uuid, "key1")
            store.new_partition(dataset.uuid, "key2")
            store.new_partition(dataset.uuid, "key3")

            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            ids_ = []
            parts = store.list_partitions(dataset.uuid)
            # reload menu and config
            newmenu = Menu_pb()
            store.get(menu_uuid, newmenu)
            newconfig = Configuration()
            store.get(config_uuid, newconfig)
            print(parts)
            for _ in range(10):
                job_id = store.new_job(dataset.uuid)

                for key in parts:
                    ids_.append(
                        store.register_content(
                            buf,
                            fileinfo,
                            dataset_id=dataset.uuid,
                            job_id=job_id,
                            partition_key=key,
                        ).uuid)
                    store.put(ids_[-1], buf)

            for id_ in ids_:
                buf = pa.py_buffer(store.get(id_))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)

            # Save the store, reload
            store.save_store()
            newstore = BaseObjectStore(str(_path),
                                       store._name,
                                       store_uuid=store.store_uuid)
            for id_ in ids_:
                print("Get object %s", id_)
                print(type(id_))
                buf = pa.py_buffer(newstore.get(id_))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)
            print(newmenu)
            print(newconfig)
            print("Simulation Test Done ===========================")
Example #9
    def write_branches_to_arrow(self, transformer, topic_name, file_id,
                                request_id):
        from .scratch_file_writer import ScratchFileWriter

        tick = time.time()
        scratch_writer = None
        total_messages = 0

        for pa_table in transformer.arrow_table():
            if self.object_store:
                if not scratch_writer:
                    scratch_writer = ScratchFileWriter(
                        file_format=self.file_format)
                    scratch_writer.open_scratch_file(pa_table)

                scratch_writer.append_table_to_scratch(pa_table)

            if self.messaging:
                batches = pa_table.to_batches(
                    max_chunksize=transformer.chunk_size)

                for batch in batches:
                    messaging_tick = time.time()

                    # Just need to make key unique to shard messages across brokers
                    key = str.encode(transformer.file_path + "-" +
                                     str(total_messages))

                    sink = pa.BufferOutputStream()
                    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
                    writer.write_batch(batch)
                    writer.close()
                    self.messaging.publish_message(topic_name, key,
                                                   sink.getvalue())

                    self.avg_cell_size.append(
                        len(sink.getvalue().to_pybytes()) /
                        len(transformer.attr_name_list) / batch.num_rows)
                    total_messages += 1
                    self.messaging_timings.append(time.time() - messaging_tick)

        if self.object_store:
            object_store_tick = time.time()
            scratch_writer.close_scratch_file()

            print("Writing parquet to ", request_id, " as ",
                  transformer.file_path.replace('/', ':'))

            self.object_store.upload_file(
                request_id, transformer.file_path.replace('/', ':'),
                scratch_writer.file_path)

            scratch_writer.remove_scratch_file()
            self.object_store_timing = time.time() - object_store_tick

        tock = time.time()

        if self.messaging:
            avg_avg_cell_size = sum(self.avg_cell_size) / len(self.avg_cell_size) \
                if len(self.avg_cell_size) else 0

            print("Wrote " + str(total_messages) + " events  to " + topic_name,
                  "Avg Cell Size = " + str(avg_avg_cell_size) + " bytes")

        print("Real time: " + str(round(tock - tick / 60.0, 2)) + " minutes")
Example #10
    def test_dir_glob(self):
        print("Testing directory globbing")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        store_id = str(uuid.uuid4())
        mystore = CronusObjectStore()
        mystore.name = "test"
        mystore.uuid = str(store_id)
        mystore.parent_uuid = ""  # top level store

        with tempfile.TemporaryDirectory() as dirpath:
            mystore.address = dirpath + "/test"
            _path = Path(mystore.address)
            _path.mkdir()
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            path = dirpath + "/test/dummy2.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())

            objs_ = store.register_content(
                mystore.address,
                fileinfo,
                glob="*arrow",
                dataset_id=dataset.uuid,
                partition_key="key",
            )
            for obj_ in objs_:
                print(obj_.uuid, store[obj_.uuid].address)
                buf = pa.py_buffer(store.get(obj_.uuid))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)

            ds = store.list(suffix="dataset")
            for d in ds:
                p = d.uuid + ".part_key"
                f = store.list(prefix=p, suffix="arrow")
                print(f)
        print("Test Done ===========================")
Example #11
    def test_register_dataset(self):

        # Create a fake dataset
        # from a menu_id and menu msg
        # from a config_id and config msg
        # add files
        # add tables

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        store_id = str(uuid.uuid4())
        mystore = CronusObjectStore()
        mystore.name = "test"
        mystore.uuid = str(store_id)
        mystore.parent_uuid = ""  # top level store

        print("Testing directory globbing")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        # schema = batch.schema.to_pybytes()
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.num_columns = 3

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(str(_path), "test")
            store_id = store.store_uuid
            print(store.store_info.created.ToDatetime())

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            print(menu_uuid)
            print(config_uuid)
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            job_id = store.new_job(dataset.uuid)
            store.register_content(
                buf,
                fileinfo,
                dataset_id=dataset.uuid,
                partition_key="key",
                job_id=job_id,
            )

            ds = store.list(suffix="dataset")
            print(ds)
Example #12
    def test_identical_files(self):
        print("Testing add file from path")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            id_ = store.register_content(path,
                                         fileinfo,
                                         dataset_id=dataset.uuid,
                                         partition_key="key").uuid
            print(id_, store[id_].address)
            buf = pa.py_buffer(store._get_object(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

            path = dirpath + "/test/dummy2.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            id_ = store.register_content(path,
                                         fileinfo,
                                         dataset_id=dataset.uuid,
                                         partition_key="key").uuid
            print(id_, store[id_].address)
            buf = pa.py_buffer(store.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)
        print("Test Done ===========================")
Example #13
def time_convert_pandas_and_write_binary_table(self):
    out = pa.BufferOutputStream()
    pq.write_table(pa.table(self.table_df), out)
Example #14
def time_write_binary_table_no_dictionary(self):
    out = pa.BufferOutputStream()
    pq.write_table(self.table, out, use_dictionary=False)
Example #15
def test_field_id_metadata():
    # ARROW-7080
    field_id = b'PARQUET:field_id'
    inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
    middle = pa.field('middle',
                      pa.struct([inner]),
                      metadata={field_id: b'101'})
    fields = [
        pa.field('basic',
                 pa.int32(),
                 metadata={
                     b'other': b'abc',
                     field_id: b'1'
                 }),
        pa.field('list',
                 pa.list_(
                     pa.field('list-inner',
                              pa.int32(),
                              metadata={field_id: b'10'})),
                 metadata={field_id: b'11'}),
        pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
        pa.field('no-metadata', pa.int32()),
        pa.field('non-integral-field-id',
                 pa.int32(),
                 metadata={field_id: b'xyz'}),
        pa.field('negative-field-id',
                 pa.int32(),
                 metadata={field_id: b'-1000'})
    ]
    arrs = [[] for _ in fields]
    table = pa.table(arrs, schema=pa.schema(fields))

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    pf = pq.ParquetFile(pa.BufferReader(contents))
    schema = pf.schema_arrow

    assert schema[0].metadata[field_id] == b'1'
    assert schema[0].metadata[b'other'] == b'abc'

    list_field = schema[1]
    assert list_field.metadata[field_id] == b'11'

    list_item_field = list_field.type.value_field
    assert list_item_field.metadata[field_id] == b'10'

    struct_field = schema[2]
    assert struct_field.metadata[field_id] == b'102'

    struct_middle_field = struct_field.type[0]
    assert struct_middle_field.metadata[field_id] == b'101'

    struct_inner_field = struct_middle_field.type[0]
    assert struct_inner_field.metadata[field_id] == b'100'

    assert schema[3].metadata is None
    # Invalid input is passed through (ok) but does not
    # have field_id in parquet (not tested)
    assert schema[4].metadata[field_id] == b'xyz'
    assert schema[5].metadata[field_id] == b'-1000'
Example #16
def test_query_indices_external(store, metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {
                "files": {
                    "core_data": "file.parquest"
                }
            },
            "part_2": {
                "files": {
                    "core_data": "file2.parquest"
                }
            },
        },
        "indices": {
            "product_id":
            "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(expected).encode("utf-8"),
    )
    df = pd.DataFrame({
        "product_id": [1, 2, 100, 34],
        "partition": [
            np.array(["part_1"], dtype=object),
            np.array(["part_2"], dtype=object),
            np.array(["part_1", "part_2"], dtype=object),
            np.array(["part_1"], dtype=object),
        ],
    })
    schema = pa.schema([
        pa.field("partition", pa.list_(pa.string())),
        pa.field("product_id", pa.int64()),
    ])
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
        buf.getvalue().to_pybytes(),
    )
    store_schema_metadata(
        make_meta(df, origin="core"),
        "uuid+namespace-attribute12_underscored",
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(
        "uuid+namespace-attribute12_underscored", store)

    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]
    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    assert dmd.query(product_id=2, location_id=2,
                     something_else="bla") == ["part_2"]

    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]})
    assert dmd.query(indices=[additional_index],
                     another_column="1",
                     product_id=2,
                     location_id=2) == ["part_2"]
Example #17
def test_empty_file():
    buf = b''
    with pytest.raises(pa.ArrowInvalid):
        pa.ipc.open_file(pa.BufferReader(buf))


def test_file_simple_roundtrip(file_fixture):
    file_fixture._check_roundtrip(as_table=False)


def test_file_write_table(file_fixture):
    file_fixture._check_roundtrip(as_table=True)


@pytest.mark.parametrize("sink_factory", [
    lambda: io.BytesIO(),
    lambda: pa.BufferOutputStream()
])
def test_file_read_all(sink_factory):
    fixture = FileFormatFixture(sink_factory)

    batches = fixture.write_batches()
    file_contents = pa.BufferReader(fixture.get_source())

    reader = pa.ipc.open_file(file_contents)

    result = reader.read_all()
    expected = pa.Table.from_batches(batches)
    assert result.equals(expected)


def test_open_file_from_buffer(file_fixture):
Example #18
    def from_record_batches(
        cls,
        record_batches,
        output_types,
        output_shapes=None,
        columns=None,
        batch_size=None,
        batch_mode="keep_remainder",
    ):
        """Create an ArrowDataset directly from Arrow record batches.
        This constructor requires pyarrow to be installed.

        Args:
            record_batches: An Arrow record batch or sequence of record batches
            output_types: Tensor dtypes of the output tensors
            output_shapes: TensorShapes of the output tensors or None to
                            infer partial
            batch_size: Batch size of output tensors, setting a batch size here
                        will create batched tensors from Arrow memory and can be more
                        efficient than using tf.data.Dataset.batch().
                        NOTE: batch_size does not need to be set if batch_mode='auto'
            batch_mode: Mode of batching, supported strings:
                        "keep_remainder" (default, keeps partial batch data),
                        "drop_remainder" (discard partial batch data),
                        "auto" (size to number of records in Arrow record batch)
            columns: A list of column indices to be used in the Dataset
        """
        import pyarrow as pa  # pylint: disable=import-outside-toplevel

        if isinstance(record_batches, pa.RecordBatch):
            record_batches = [record_batches]
        if columns is None:
            columns = tuple(range(record_batches[0].num_columns))
        assert record_batches
        if tf.executing_eagerly():
            sink = pa.BufferOutputStream()
            writer = pa.RecordBatchFileWriter(sink, record_batches[0].schema)
            for batch in record_batches:
                writer.write_batch(batch)
            writer.close()
            serialized_batches = None
            arrow_buffer = sink.getvalue()
        else:
            buf = io.BytesIO()
            writer = pa.RecordBatchFileWriter(buf, record_batches[0].schema)
            for batch in record_batches:
                writer.write_batch(batch)
            writer.close()
            serialized_batches = tf.convert_to_tensor(
                buf.getvalue(), dtype=dtypes.string, name="serialized_batches"
            )
            arrow_buffer = None
        return cls(
            serialized_batches,
            columns,
            output_types,
            output_shapes,
            batch_size=batch_size,
            batch_mode=batch_mode,
            arrow_buffer=arrow_buffer,
        )
Example #19
def test_buffer_readwrite_with_bad_writeoptions():
    from pyarrow import orc
    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    table = pa.table({"int64": a})

    # batch_size must be a positive integer
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            batch_size=0,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            batch_size=-100,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            batch_size=1024.23,
        )

    # file_version must be 0.11 or 0.12
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            file_version=0.13,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            file_version='1.1',
        )

    # stripe_size must be a positive integer
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            stripe_size=0,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            stripe_size=-400,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            stripe_size=4096.73,
        )

    # compression must be among the given options
    with pytest.raises(TypeError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression=0,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression='none',
        )
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression='zlid',
        )

    # compression_block_size must be a positive integer
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression_block_size=0,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression_block_size=-200,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression_block_size=1096.73,
        )

    # compression_strategy must be among the given options
    with pytest.raises(TypeError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression_strategy=0,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression_strategy='no',
        )
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            compression_strategy='large',
        )

    # row_index_stride must be a positive integer
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            row_index_stride=0,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            row_index_stride=-800,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            row_index_stride=3096.29,
        )

    # padding_tolerance must be possible to cast to float
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            padding_tolerance='cat',
        )

    # dictionary_key_size_threshold must be possible to cast to
    # float between 0.0 and 1.0
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            dictionary_key_size_threshold='arrow',
        )
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            dictionary_key_size_threshold=1.2,
        )
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            dictionary_key_size_threshold=-3.2,
        )

    # bloom_filter_columns must be convertible to a list containing
    # nonnegative integers
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            bloom_filter_columns="string",
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            bloom_filter_columns=[0, 1.4],
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            bloom_filter_columns={0, 2, -1},
        )

    # bloom_filter_fpp must be convertible to a float between 0.0 and 1.0
    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            bloom_filter_fpp='arrow',
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            bloom_filter_fpp=1.1,
        )

    with pytest.raises(ValueError):
        orc.write_table(
            table,
            buffer_output_stream,
            bloom_filter_fpp=-0.1,
        )
Example #20
def write_branches_to_arrow(messaging,
                            topic_name,
                            file_path,
                            file_id,
                            attr_name_list,
                            chunk_size,
                            server_endpoint,
                            event_limit=None,
                            object_store=None):

    scratch_writer = None

    event_iterator = XAODEvents(file_path, attr_name_list)
    transformer = XAODTransformer(event_iterator)

    batch_number = 0
    total_events = 0
    total_bytes = 0
    for pa_table in transformer.arrow_table(chunk_size, event_limit):
        if object_store:
            if not scratch_writer:
                scratch_writer = _open_scratch_file(args.result_format,
                                                    pa_table)
            _append_table_to_scratch(args.result_format, scratch_writer,
                                     pa_table)

        total_events = total_events + pa_table.num_rows
        batches = pa_table.to_batches(chunksize=chunk_size)

        for batch in batches:
            if messaging:
                key = file_path + "-" + str(batch_number)

                sink = pa.BufferOutputStream()
                writer = pa.RecordBatchStreamWriter(sink, batch.schema)
                writer.write_batch(batch)
                writer.close()
                messaging.publish_message(topic_name, key, sink.getvalue())

                total_bytes = total_bytes + len(sink.getvalue().to_pybytes())

                avg_cell_size = len(sink.getvalue().to_pybytes()) / len(
                    attr_name_list) / batch.num_rows
                print(
                    "Batch number " + str(batch_number) + ", " +
                    str(batch.num_rows) + " events published to " + topic_name,
                    "Avg Cell Size = " + str(avg_cell_size) + " bytes")
                batch_number += 1

    if object_store:
        _close_scratch_file(args.result_format, scratch_writer)
        print("Writing parquet to ", args.request_id, " as ",
              file_path.replace('/', ':'))
        object_store.upload_file(args.request_id, file_path.replace('/', ':'),
                                 "/tmp/out")
        os.remove("/tmp/out")

    print("===> Total Events ", total_events)
    print("===> Total Bytes ", total_bytes)

    if server_endpoint:
        post_status_update(server_endpoint, "File " + file_path + " complete")

    put_file_complete(server_endpoint,
                      file_path,
                      file_id,
                      "success",
                      num_messages=batch_number,
                      total_time="??",
                      total_events=total_events,
                      total_bytes=total_bytes)
Example #21
    def enqueue(self, uri, **data):
        sink = pa.BufferOutputStream()
        field_list = []
        data_list = []
        for key, value in data.items():

            if isinstance(value, str):
                # str value will be considered as image path
                field = pa.field(key, pa.string())
                data = self.encode_image(value)
                # b = bytes(data, "utf-8")
                data = pa.array([data])
                # ba = pa.array(b, type=pa.binary())
                field_list.append(field)
                data_list.append(data)

            elif isinstance(value, np.ndarray):
                # ndarray value will be considered as tensor
                indices_field = pa.field("indiceData", pa.list_(pa.int32()))
                indices_shape_field = pa.field("indiceShape",
                                               pa.list_(pa.int32()))
                data_field = pa.field("data", pa.list_(pa.float32()))
                shape_field = pa.field("shape", pa.list_(pa.int32()))
                tensor_type = pa.struct([
                    indices_field, indices_shape_field, data_field, shape_field
                ])
                field = pa.field(key, tensor_type)

                shape = np.array(value.shape)
                d = value.astype("float32").flatten()
                # data = pa.array([{'data': d}, {'shape': shape}, {}],
                #                 type=tensor_type)
                data = pa.array([{
                    'indiceData': []
                }, {
                    'indiceShape': []
                }, {
                    'data': d
                }, {
                    'shape': shape
                }],
                                type=tensor_type)
                field_list.append(field)
                data_list.append(data)

            elif isinstance(value, list):
                # list will be considered as sparse tensor
                assert len(value) == 3, "Sparse Tensor must have a list of ndarray " \
                    "with length 3, which represent indices, values, shape respectively"
                indices_field = pa.field("indiceData", pa.list_(pa.int32()))
                indices_shape_field = pa.field("indiceShape",
                                               pa.list_(pa.int32()))
                value_field = pa.field("data", pa.list_(pa.float32()))
                shape_field = pa.field("shape", pa.list_(pa.int32()))
                sparse_tensor_type = pa.struct([
                    indices_field, indices_shape_field, value_field,
                    shape_field
                ])
                field = pa.field(key, sparse_tensor_type)

                shape = value[2]
                values = value[1]
                indices = value[0].astype("float32").flatten()
                indices_shape = value[0].shape
                data = pa.array([{
                    'indiceData': indices
                }, {
                    'indiceShape': indices_shape
                }, {
                    'data': values
                }, {
                    'shape': shape
                }],
                                type=sparse_tensor_type)
                field_list.append(field)
                data_list.append(data)
            else:
                raise TypeError("Your request does not match any schema, "
                                "please check.")
        schema = pa.schema(field_list)
        batch = pa.RecordBatch.from_arrays(data_list, schema)

        writer = pa.RecordBatchStreamWriter(sink, batch.schema)
        writer.write_batch(batch)
        writer.close()
        buf = sink.getvalue()
        b = buf.to_pybytes()
        b64str = self.base64_encode_image(b)
        d = {"uri": uri, "data": b64str}
        self.__enqueue_data(d)
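A receiver-side sketch for the payload built by enqueue(). It assumes base64_encode_image is a plain base64 encoding; the helper name is hypothetical:

import base64
import pyarrow as pa

def decode_enqueued_payload(b64str):
    raw = base64.b64decode(b64str)
    # The payload is an Arrow IPC stream containing a single record batch.
    reader = pa.ipc.open_stream(pa.py_buffer(raw))
    return reader.read_next_batch()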
Example #22
def _serialize(data: Any) -> Tuple[pa.Buffer, Serialization]:
    """Serializes an object to a ``pa.Buffer``.

    The way the object is serialized depends on the nature of the
    object: ``pa.RecordBatch`` and ``pa.Table`` are serialized using
    ``pyarrow`` functions. All other cases are serialized through the
    ``pickle`` library.

    Args:
        data: The object/data to be serialized.

    Returns:
        Tuple of the serialized data (in ``pa.Buffer`` format) and the
        :class:`Serialization` that was used.

    Raises:
        SerializationError: If the data could not be serialized.

    Note:
        ``pickle`` does not include the code of custom functions or
        classes, it only pickles their names. Following to the official
        `Python Docs
        <https://docs.python.org/3/library/pickle.html#what-can-be-pickled-and-unpickled>`_:
        "Thus the defining module must be importable in the unpickling
        environment, and the module must contain the named object,
        otherwise an exception will be raised."

    """
    if isinstance(data, (pa.RecordBatch, pa.Table)):
        # Use the intended pyarrow functionalities when possible.
        if isinstance(data, pa.Table):
            serialization = Serialization.ARROW_TABLE
        else:
            serialization = Serialization.ARROW_BATCH

        output_buffer = pa.BufferOutputStream()
        try:
            writer = pa.RecordBatchStreamWriter(output_buffer, data.schema)
            writer.write(data)
            writer.close()
        except pa.ArrowSerializationError:
            raise error.SerializationError(
                f"Could not serialize data of type {type(data)}.")

        serialized = output_buffer.getvalue()

    else:
        # All other cases use the pickle library.
        serialization = Serialization.PICKLE

        # Use the best protocol possible, for reference see:
        # https://docs.python.org/3/library/pickle.html#pickle-protocols
        try:
            serialized = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
        except pickle.PicklingError:
            raise error.SerializationError(
                f"Could not pickle data of type {type(data)}.")

        # NOTE: zero-copy view on the bytes.
        serialized = pa.py_buffer(serialized)

    return serialized, serialization
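A sketch of the matching read side (hypothetical helper names), one per serialization path used above:

import pickle
import pyarrow as pa

def _deserialize_arrow(serialized: pa.Buffer) -> pa.Table:
    # Inverse of the Arrow branch: read the IPC stream back into a table.
    # For an ARROW_BATCH payload the table holds a single batch
    # (table.to_batches()[0]).
    reader = pa.ipc.open_stream(serialized)
    return reader.read_all()

def _deserialize_pickled(serialized: pa.Buffer):
    # Inverse of the pickle branch: pa.Buffer supports the buffer protocol,
    # so pickle can load from a memoryview without copying.
    return pickle.loads(memoryview(serialized))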
Example #23
def test_native_file_write_reject_unicode():
    # ARROW-3227
    nf = pa.BufferOutputStream()
    with pytest.raises(TypeError):
        nf.write(u'foo')
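For contrast, writing bytes (or encoding the text first) is accepted:

import pyarrow as pa

nf = pa.BufferOutputStream()
nf.write(b'foo')
assert nf.getvalue().to_pybytes() == b'foo'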
Example #24
    def map(
        self,
        function,
        with_indices: bool = False,
        batched: bool = False,
        batch_size: Optional[int] = 1000,
        remove_columns: Optional[List[str]] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: bool = True,
        cache_file_name: Optional[str] = None,
        writer_batch_size: Optional[int] = 1000,
        arrow_schema: Optional[pa.Schema] = None,
        disable_nullable: bool = True,
    ):
        """ Apply a function to all the elements in the table (individually or in batches)
            and update the table (if function does updated examples).

            Args:
                `function` (`callable`): with one of the following signature:
                    - `function(example: Dict) -> Union[Dict, Any]` if `batched=False` and `with_indices=False`
                    - `function(example: Dict, indices: int) -> Union[Dict, Any]` if `batched=False` and `with_indices=True`
                    - `function(batch: Dict[List]) -> Union[Dict, Any]` if `batched=True` and `with_indices=False`
                    - `function(batch: Dict[List], indices: List[int]) -> Union[Dict, Any]` if `batched=True` and `with_indices=True`
                `with_indices` (`bool`, default: `False`): Provide example indices to `function`
                `batched` (`bool`, default: `False`): Provide batch of examples to `function`
                `batch_size` (`Optional[int]`, default: `1000`): Number of examples per batch provided to `function` if `batched=True`
                    `batch_size <= 0` or `batch_size == None`: Provide the full dataset as a single batch to `function`
                `remove_columns` (`Optional[List[str]]`, default: `None`): Remove a selection of columns while doing the mapping.
                    Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
                    columns with names in `remove_columns`, these columns will be kept.
                `keep_in_memory` (`bool`, default: `False`): Keep the dataset in memory instead of writing it to a cache file.
                `load_from_cache_file` (`bool`, default: `True`): If a cache file storing the current computation from `function`
                    can be identified, use it instead of recomputing.
                `cache_file_name` (`Optional[str]`, default: `None`): Provide the name of a cache file to use to store the
                    results of the computation instead of the automatically generated cache file name.
                `writer_batch_size` (`int`, default: `1000`): Number of rows per write operation for the cache file writer.
                    Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`.
                `arrow_schema` (`Optional[pa.Schema]`, default: `None`): Use a specific Apache Arrow Schema to store the cache file
                    instead of the automatically generated one.
                `disable_nullable` (`bool`, default: `True`): Disallow null values in the table.
        """
        # If the array is empty we do nothing
        if len(self) == 0:
            return self

        # Select the columns (arrow columns) to process
        if remove_columns is not None and any(
                col not in self._data.column_names for col in remove_columns):
            raise ValueError(
                "Column to remove {} not in the dataset. Current columns in the dataset: {}"
                .format(
                    list(
                        filter(lambda col: col not in self._data.column_names,
                               remove_columns)),
                    self._data.column_names,
                ))

        # If we do batch computation but no batch size is provided, default to the full dataset
        if batched and (batch_size is None or batch_size <= 0):
            batch_size = self._data.num_rows

        # Check if the function returns updated examples
        def does_function_return_dict(inputs, indices):
            """ Does the function returns a dict. """
            processed_inputs = function(
                inputs, indices) if with_indices else function(inputs)
            does_return_dict = isinstance(processed_inputs, Mapping)

            if does_return_dict is False and processed_inputs is not None:
                raise TypeError(
                    "Provided `function` which is applied to all elements of table returns a variable of type {}. Make sure provided `function` returns a variable of type `dict` to update the dataset or `None` if you are only interested in side effects."
                    .format(type(processed_inputs)))
            elif isinstance(test_indices, list) and does_return_dict is True:
                all_dict_values_are_lists = all(
                    isinstance(value, list)
                    for value in processed_inputs.values())
                if all_dict_values_are_lists is False:
                    raise TypeError(
                        "Provided `function` which is applied to all elements of table returns a `dict` of types {}. When using `batched=True`, make sure provided `function` returns a `dict` of types `list`."
                        .format([type(x) for x in processed_inputs.values()]))

            return does_return_dict

        # We only update the data table (and use the cache) if the function returns a dict.
        # Test it on the first element or a small batch (0, 1) for batched inputs
        test_inputs = self[:2] if batched else self[0]
        test_indices = [0, 1] if batched else 0
        update_data = does_function_return_dict(test_inputs, test_indices)

        def apply_function_on_filtered_inputs(inputs, indices):
            """ Utility to apply the function on a selection of columns. """
            processed_inputs = function(
                inputs, indices) if with_indices else function(inputs)
            if not update_data:
                return None  # Nothing to update, let's move on
            if remove_columns is not None:
                for column in remove_columns:
                    inputs.pop(column)
            if self._format_type is not None:
                inputs = self._getitem(
                    key=(indices if isinstance(indices, int) else slice(
                        indices[0], indices[-1])),
                    format_type=None,
                    format_columns=None,
                )
            inputs.update(processed_inputs)
            return inputs

        # Find the output schema if none is given
        test_inputs = self[:2] if batched else self[0]
        test_indices = [0, 1] if batched else 0
        test_output = apply_function_on_filtered_inputs(
            test_inputs, test_indices)
        if arrow_schema is None and update_data:
            if not batched:
                test_output = self._nest(test_output)
            test_output = convert_tuples_in_lists(test_output)
            arrow_schema = pa.Table.from_pydict(test_output).schema
            if disable_nullable:
                arrow_schema = pa.schema(
                    pa.field(field.name, field.type, nullable=False)
                    for field in arrow_schema)

        # Check if we've already cached this computation (indexed by a hash)
        if self._data_files and update_data:
            if cache_file_name is None:
                # we create a unique hash from the function, current dataset file and the mapping args
                cache_kwargs = {
                    "with_indices": with_indices,
                    "batched": batched,
                    "batch_size": batch_size,
                    "remove_columns": remove_columns,
                    "keep_in_memory": keep_in_memory,
                    "load_from_cache_file": load_from_cache_file,
                    "cache_file_name": cache_file_name,
                    "writer_batch_size": writer_batch_size,
                    "arrow_schema": arrow_schema,
                    "disable_nullable": disable_nullable,
                }
                cache_file_name = self._get_cache_file_path(
                    function, cache_kwargs)
            if os.path.exists(cache_file_name) and load_from_cache_file:
                logger.info("Loading cached processed dataset at %s",
                            cache_file_name)
                return Dataset.from_file(cache_file_name)

        # Prepare output buffer and batched writer in memory or on file if we update the table
        if update_data:
            if keep_in_memory or not self._data_files:
                buf_writer = pa.BufferOutputStream()
                writer = ArrowWriter(schema=arrow_schema,
                                     stream=buf_writer,
                                     writer_batch_size=writer_batch_size)
            else:
                buf_writer = None
                logger.info("Caching processed dataset at %s", cache_file_name)
                writer = ArrowWriter(schema=arrow_schema,
                                     path=cache_file_name,
                                     writer_batch_size=writer_batch_size)

        # Loop over single examples or batches and write to buffer/file if examples are to be updated
        if not batched:
            for i, example in tqdm(enumerate(self)):
                example = apply_function_on_filtered_inputs(example, i)
                if update_data:
                    writer.write(example)
        else:
            for i in tqdm(range(0, len(self), batch_size)):
                batch = self[i:i + batch_size]
                indices = list(
                    range(*(slice(i, i +
                                  batch_size).indices(self._data.num_rows)
                            )))  # Something simpler?
                batch = apply_function_on_filtered_inputs(batch, indices)
                if update_data:
                    writer.write_batch(batch)

        if update_data:
            writer.finalize()  # close_stream=bool(buf_writer is None)  # we only close if writing to a file

            # Create new Dataset from buffer or file
            if buf_writer is None:
                return Dataset.from_file(cache_file_name)
            else:
                return Dataset.from_buffer(buf_writer.getvalue())
        else:
            return self
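A minimal usage sketch for map() as documented above; dset stands for an instance of the dataset class this method belongs to, and "text" is a hypothetical column name:

# Per-example mapping: the function returns a dict of new/updated columns.
dset = dset.map(lambda example: {"length": len(example["text"])})

# Batched mapping: the function receives and must return a dict of lists.
dset = dset.map(
    lambda batch: {"length": [len(t) for t in batch["text"]]},
    batched=True,
    batch_size=1000,
)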
Example #25
def open_append_stream(self, path):
    if "notfound" in path:
        raise FileNotFoundError(path)
    return pa.BufferOutputStream()
Example #26
def __init__(self):
    self.buf = pa.BufferOutputStream()
Example #27
def _get_sink(self):
    return pa.BufferOutputStream()
Example #28
def ipc_write_batch(batch):
    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
    writer.write_batch(batch)
    writer.close()
    return stream.getvalue()
Example #29
def _simple_table_roundtrip(table, use_legacy_dataset=False, **write_kwargs):
    stream = pa.BufferOutputStream()
    _write_table(table, stream, **write_kwargs)
    buf = stream.getvalue()
    return _read_table(buf, use_legacy_dataset=use_legacy_dataset)
Example #30
def time_write_binary_table(self):
    out = pa.BufferOutputStream()
    pq.write_table(self.table, out)