Example #1
    def load_cache(cache_path) -> "DiskBackedDataset":
        """
        Loads a cached DiskBackedDataset contained in the cache_path directory.
        Fields will be loaded into memory, but the Example data will be
        memory-mapped, avoiding unnecessary memory usage.

        Parameters
        ----------
        cache_path: str
            Path to the cache directory containing a previously saved
            DiskBackedDataset. The whole directory is treated as the cache
            and will be deleted when `delete_cache` is called.

        Returns
        -------
        DiskBackedDataset
            the DiskBackedDataset loaded from the passed cache directory.
        """
        # load fields
        fields_file_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME)
        with open(fields_file_path, "rb") as fields_cache_file:
            fields = pickle.load(fields_cache_file)

        # load dataset as memory mapped arrow table
        table_file_path = os.path.join(cache_path, CACHE_TABLE_FILENAME)
        mmapped_file = pa.memory_map(table_file_path)
        table = pa.RecordBatchFileReader(mmapped_file).read_all()
        return DiskBackedDataset(table, fields, cache_path, mmapped_file)
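
The memory-mapping pattern that `load_cache` relies on can be reproduced with plain pyarrow. A minimal sketch, assuming a made-up file name and columns chosen purely for illustration:

import pyarrow as pa

# Hypothetical cache file used only for this sketch.
path = "example_cache.arrow"

# Write a small table in the Arrow IPC file format.
table = pa.table({"raw": ["a", "b"], "tokenized": [["a"], ["b"]]})
with pa.OSFile(path, "wb") as sink:
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)

# Re-open the file memory mapped; read_all() returns a Table backed by the
# mapping, so the row data is not copied into process memory up front.
mmapped_file = pa.memory_map(path)
loaded = pa.RecordBatchFileReader(mmapped_file).read_all()
assert loaded.equals(table)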
Example #2
def decompress(pybytes: bytes) -> pa.Array:
    buf = io.BytesIO()
    buf.write(brotli.decompress(pybytes))
    buf.seek(0)
    reader = pa.RecordBatchFileReader(buf)
    rb = reader.get_batch(0)
    return rb.column(0)
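
For context, here is a sketch of the compression step such a helper could invert. It is an assumption, not code from the original source; the column name and the use of RecordBatch.from_arrays are arbitrary choices.

import brotli
import pyarrow as pa

def compress(arr: pa.Array) -> bytes:
    # Hypothetical inverse of decompress(): wrap the array in a single-column
    # record batch, serialize it in the Arrow IPC file format, then
    # brotli-compress the resulting bytes.
    batch = pa.RecordBatch.from_arrays([arr], ["col"])
    sink = pa.BufferOutputStream()
    with pa.RecordBatchFileWriter(sink, batch.schema) as writer:
        writer.write_batch(batch)
    return brotli.compress(sink.getvalue().to_pybytes())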
Example #3
def restore_topic(args: argparse.Namespace, archive: str) -> None:
    """Restore the messages from an Arrow archive file to a Kafka topic."""
    topic = os.path.splitext(archive)[0]
    log.info(f"Restoring messages to topic {topic}")

    producer = kafka.KafkaProducer(
        bootstrap_servers=[f"{args.kafkahost}:{args.port}"], retries=3
    )

    def on_error(excp: Exception) -> None:
        log.error(f"ERROR: Failed to send message {excp}")
        sys.exit(1)

    local = pyarrow.fs.LocalFileSystem()

    with local.open_input_file(f"{topic}.arrow") as f:
        with pyarrow.RecordBatchFileReader(f) as reader:
            table = reader.read_all()

    for i in range(0, table.num_rows):
        key = table["key"][i].as_py()
        value = table["value"][i].as_py()
        timestamp = table["timestamp"][i].value

        producer.send(
            f"{topic}", key=key, value=value, timestamp_ms=timestamp
        ).add_errback(on_error)

    producer.flush()
    log.info(f"Restored {table.num_rows} rows to topic {topic}")
Example #4
    def read(self, schema):
        address = self.reader.readArrowBatchAddress(schema)
        size = ctypes.c_int32.from_address(address).value
        arrowData = (ctypes.c_byte * size).from_address(address + 4)
        rawData = bytes(arrowData)
        self.reader.freeArrowBatchMemory(address)
        reader = pa.RecordBatchFileReader(pa.BufferReader(rawData))
        data = reader.read_all()
        return data
Example #5
    def getArrowSchema(self):
        if self.url_path.scheme == 's3a':
            buf = CarbonSchemaReader().readSchema(
                self.path, True, self.configuration.conf).tostring()
        else:
            buf = CarbonSchemaReader().readSchema(self.path, True).tostring()

        reader = pa.RecordBatchFileReader(pa.BufferReader(bytes(buf)))
        return reader.read_all().schema
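
When the bytes hold a serialized schema rather than a whole IPC file, pyarrow can also parse the schema directly. A small round-trip sketch, independent of CarbonSchemaReader:

import pyarrow as pa

# Round-trip a schema through its serialized IPC form.
schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
buf = schema.serialize()            # schema as an IPC message buffer
restored = pa.ipc.read_schema(buf)  # parse it back without any record batches
assert restored.equals(schema)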
Example #6
def arrowfile_to_numpy(file_format, column_index):
    '''Convert Arrow IPC file bytes to a numpy ndarray, or to a list of
    numpy ndarrays when several column names are given.
    '''
    df = pa.RecordBatchFileReader(
        pa.BufferReader(file_format)).read_all().to_pandas()

    if isinstance(column_index, str):
        return df[column_index].values
    else:
        return [df[k].values for k in column_index]
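
A short usage sketch for the helper above; the table contents are made up, and `buf` stands in for the `file_format` argument:

import numpy as np
import pyarrow as pa

# Build an in-memory IPC file to feed to arrowfile_to_numpy.
table = pa.table({"x": [1, 2, 3], "y": [4.0, 5.0, 6.0]})
sink = pa.BufferOutputStream()
with pa.RecordBatchFileWriter(sink, table.schema) as writer:
    writer.write_table(table)
buf = sink.getvalue()

x = arrowfile_to_numpy(buf, "x")           # single column -> ndarray
xy = arrowfile_to_numpy(buf, ["x", "y"])   # several columns -> list of ndarrays
assert isinstance(x, np.ndarray) and len(xy) == 2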
Example #7
def print_archive(args: argparse.Namespace) -> None:
    """Print the decoded representation of messages in an archive file."""

    # Why not 'printer = pprint.pprint if args.pretty_print else print'?
    # Because mypy exits with 'error: Cannot call function of unknown type'
    def printer(obj: typing.Any) -> None:
        if args.pretty_print:
            pprint.pprint(obj)
        else:
            print(json.dumps(obj))

    topic = args.topic

    with open("schemas.json") as fd:
        schemas = json.load(fd)

    try:
        key_schema = schemas[f"{topic}-key"]["schema"]
        value_schema = schemas[f"{topic}-value"]["schema"]
        if args.print_schemas:
            printer(json.loads(key_schema))
            printer(json.loads(value_schema))
    except KeyError as e:
        log.error("Failed to locate schema for topic: %s", e)
        raise

    key_reader = avro.io.DatumReader(avro.schema.parse(key_schema))
    value_reader = avro.io.DatumReader(avro.schema.parse(value_schema))

    arrow_file = pathlib.Path(f"{topic}.arrow")
    if not arrow_file.exists():
        log.error(f"No arrow file for topic {topic}")
        sys.exit(1)

    local = pyarrow.fs.LocalFileSystem()
    with local.open_input_file(f"{topic}.arrow") as f:
        with pyarrow.RecordBatchFileReader(f) as reader:
            table = reader.read_all()

    for i in range(args.offset, min(table.num_rows, args.offset + args.count)):
        key = table["key"][i].as_py()
        value = table["value"][i].as_py()
        timestamp = table["timestamp"][i].value

        # Strip the first 5 bytes, as they are added by Confluent Platform and are not part of the
        # actual serialized data
        decoded = {
            "key": decode(key_reader, key[5:]) if key else None,
            "timestamp": timestamp,
            "value": decode(value_reader, value[5:]) if value else None,
        }

        printer(decoded)
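
This example and the next rely on a `decode()` helper that is not part of the snippet. A plausible sketch, assuming the standard `avro` package; the body is an assumption based on the call sites:

import io
import avro.io

def decode(reader: avro.io.DatumReader, data: bytes):
    # Plausible implementation of the decode() helper used above (assumed, not
    # taken from the original code): deserialize one Avro-encoded payload with
    # the given DatumReader.
    decoder = avro.io.BinaryDecoder(io.BytesIO(data))
    return reader.read(decoder)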
Example #8
def summarize_archive(archive: str) -> None:
    """Print interesting statistics about an archived Kafka topic."""

    with open("schemas.json") as fd:
        schemas = json.load(fd)

    topic = os.path.splitext(archive)[0]
    try:
        key_schema = schemas[f"{topic}-key"]["schema"]
        value_schema = schemas[f"{topic}-value"]["schema"]
    except KeyError as e:
        log.error("Failed to locate schema for topic %s", e)
        raise

    key_reader = avro.io.DatumReader(avro.schema.parse(key_schema))
    value_reader = avro.io.DatumReader(avro.schema.parse(value_schema))

    local = pyarrow.fs.LocalFileSystem()
    with local.open_input_file(archive) as f:
        with pyarrow.RecordBatchFileReader(f) as reader:
            table = reader.read_all()

    num_messages = table.num_rows
    key_bytes = 0
    value_bytes = 0
    num_ops: typing.DefaultDict[str, int] = collections.defaultdict(int)

    for i in range(0, table.num_rows):
        key = table["key"][i].as_py()
        value = table["value"][i].as_py()
        timestamp = table["timestamp"][i].value

        if key:
            key_bytes += len(key)
        if value:
            value_bytes += len(value)

        # Strip the first 5 bytes, as they are added by Confluent Platform and are not part of the
        # actual serialized data
        decoded = {
            "key": decode(key_reader, key[5:]),
            "timestamp": timestamp,
            "value": decode(value_reader, value[5:]),
        }

        op = decoded["value"]["op"]
        num_ops[op] += 1

    print(
        f"{topic},{num_messages},{key_bytes},{value_bytes},{num_ops['c']},{num_ops['u']},{num_ops['d']}"
    )
Example #9
def test_write_empty_ipc_file():
    # ARROW-3894: IPC file was not being properly initialized when no record
    # batches are being written
    schema = pa.schema([('field', pa.int64())])

    sink = pa.BufferOutputStream()
    with pa.ipc.new_file(sink, schema):
        pass

    buf = sink.getvalue()
    with pa.RecordBatchFileReader(pa.BufferReader(buf)) as reader:
        table = reader.read_all()
    assert len(table) == 0
    assert table.schema.equals(schema)
Example #10
def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
    _, batches = file_fixture.write_batches()
    source = file_fixture.get_source()

    reader1 = pa.open_file(source)
    reader2 = pa.open_file(pa.BufferReader(source))
    reader3 = pa.RecordBatchFileReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)
Example #11
    def loads(self, obj):
        """
        Deserialize an ArrowRecordBatch to an Arrow table and return it as a
        list of pandas.Series, followed by a dictionary containing the length
        of the loaded batches.
        """
        import pyarrow as pa
        reader = pa.RecordBatchFileReader(pa.BufferReader(obj))
        batches = [reader.get_batch(i) for i in range(reader.num_record_batches)]
        # NOTE: a 0-parameter pandas_udf will produce an empty batch that can have num_rows set
        num_rows = sum(batch.num_rows for batch in batches)
        table = pa.Table.from_batches(batches)
        return [c.to_pandas() for c in table.itercolumns()] + [{"length": num_rows}]
Example #12
def deserialize_pandas(buf, nthreads=1):
    """Deserialize a buffer protocol compatible object into a pandas DataFrame.

    Parameters
    ----------
    buf : buffer
        An object compatible with the buffer protocol
    nthreads : int, optional
        The number of threads to use to convert the buffer to a DataFrame.

    Returns
    -------
    df : pandas.DataFrame
    """
    buffer_reader = pa.BufferReader(buf)
    reader = pa.RecordBatchFileReader(buffer_reader)
    table = reader.read_all()
    return table.to_pandas(nthreads=nthreads)
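
For reference, a simplified sketch of the matching serialization direction; this is not pyarrow's own serialize_pandas (which has extra options), just the minimal inverse of the function above:

import pyarrow as pa

def serialize_pandas_sketch(df):
    # Minimal counterpart to deserialize_pandas() above: convert the DataFrame
    # to an Arrow table and write it as an IPC file into an in-memory buffer.
    table = pa.Table.from_pandas(df)
    sink = pa.BufferOutputStream()
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)
    return sink.getvalue()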
Example #13
def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
    file_fixture.write_batches()
    source = file_fixture.get_source()

    reader1 = pa.ipc.open_file(source)
    reader2 = pa.ipc.open_file(pa.BufferReader(source))
    reader3 = pa.RecordBatchFileReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)

    st1 = reader1.stats
    assert st1.num_messages == 6
    assert st1.num_record_batches == 5
    assert reader2.stats == st1
    assert reader3.stats == st1
Example #14
    def from_examples(
        fields: Union[Dict[str, Field], List[Field], Tuple[Field]],
        examples: Iterable[Example],
        cache_path: Optional[str] = None,
        data_types: Dict[str, Tuple[pa.DataType, pa.DataType]] = None,
        chunk_size=1024,
    ) -> "DiskBackedDataset":
        """
        Creates a DiskBackedDataset from the provided Examples.

        Parameters
        ----------
        fields: Union[Dict[str, Field], List[Field]]
            Dict or List of Fields used to create the Examples.

        examples: Iterable[Example]
            Iterable of examples.

        cache_path: Optional[str]
            Path to the directory where the cache file will be saved.
            The whole directory will be used as the cache and will be deleted
            when `delete_cache` is called. It is recommended to create a new
            directory to use exclusively as the cache, or to leave this as None.

            If None, a temporary directory will be created.

        data_types: Dict[str, Tuple[pyarrow.DataType, pyarrow.DataType]]
            Dictionary mapping field names to pyarrow data types. This is required when a
            field can have missing data and the data type can't be inferred. The data type
            tuple has two values, corresponding to the raw and tokenized data types in an
            example. None can be used as a wildcard data type and will be overridden by an
            inferred data type if possible.

        chunk_size: int
            Maximum number of examples to be loaded before dumping to the on-disk cache
            file. Use a lower number if memory usage is an issue while loading.

        Returns
        -------
        DiskBackedDataset
            DiskBackedDataset instance created from the passed Examples.
        """

        fields = unpack_fields(fields)

        if cache_path is None:
            cache_path = tempfile.mkdtemp(prefix=TEMP_CACHE_FILENAME_PREFIX)

        # dump dataset table
        cache_table_path = os.path.join(cache_path, CACHE_TABLE_FILENAME)

        # TODO handle the case when a cache is already present

        chunks_iter = _chunkify(examples, chunk_size)

        # get first chunk to infer schema
        first_chunk = next(chunks_iter)
        record_batch = DiskBackedDataset._examples_to_recordbatch(
            first_chunk, fields, data_types
        )
        inferred_data_types = DiskBackedDataset._schema_to_data_types(record_batch.schema)

        # check for missing data types in inferred schema
        DiskBackedDataset._check_for_missing_data_types(fields, inferred_data_types)

        # write cache file to disk
        with pa.OSFile(cache_table_path, "wb") as f:
            with pa.RecordBatchFileWriter(f, schema=record_batch.schema) as writer:
                writer.write(record_batch)  # write first chunk
                for examples_chunk in chunks_iter:  # write rest of chunks
                    record_batch = DiskBackedDataset._examples_to_recordbatch(
                        examples_chunk, fields, inferred_data_types
                    )
                    writer.write(record_batch)

        mmapped_file = pa.memory_map(cache_table_path)
        table = pa.RecordBatchFileReader(mmapped_file).read_all()

        return DiskBackedDataset(
            table, fields, cache_path, mmapped_file, inferred_data_types
        )
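
The `_chunkify` helper used above is not included in the snippet. A plausible sketch of such a helper, assumed from how it is called rather than taken from the original library:

import itertools

def _chunkify(iterable, chunk_size):
    # Plausible implementation of the _chunkify() helper referenced above:
    # yield lists of at most chunk_size consecutive items from the iterable.
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk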
Example #15
def read_table(filename):
    filename = filename.numpy().decode("utf-8")
    reader = pa.RecordBatchFileReader(filename)
    return reader.read_all()
Example #16
#!/usr/bin/env python

import pyarrow as pa

with pa.OSFile("/tmp/filtered.arrow", "rb") as source:
    reader = pa.RecordBatchFileReader(source)
    print(reader.get_record_batch(0).to_pandas())
Example #17
def from_file(_):
    reader = pa.RecordBatchFileReader(f.name)
    t = reader.read_all()
    tio = IOTensor.from_arrow(t, spec=spec)
    return tio(column).to_tensor()
Example #18
# Stats schema
stats_schema = pa.schema(
    [pa.field('stats', pa.uint64(), nullable=False)]
).add_metadata({
    b'fletcher_mode': b'write',
    b'fletcher_name': b'Stats',
})

pa.output_stream('stats.as').write(stats_schema.serialize())

# If a recordbatch is provided as test case input, trim it and pass it to
# fletchgen instead of the schema.
if len(sys.argv) > 1:
    with open(sys.argv[1], 'rb') as fil:
        tab = pa.RecordBatchFileReader(fil).read_all()
    if len(sys.argv) > 2:
        tab = tab.slice(0, int(sys.argv[2]))
    with open('pages.rb', 'wb') as fil:
        with pa.RecordBatchFileWriter(fil, pages_schema) as writer:
            writer.write_table(tab)
    pages_args = ['-r', 'pages.rb', '-s', 'vhdl/memory.srec']
else:
    pages_args = ['-i', 'pages.as']

# Run fletchgen
subprocess.run(
    ['fletchgen'] + pages_args +
    ['-i', 'result.as', '-i', 'stats.as', '-n', 'WordMatch', '--sim', '--axi'])
Example #19
def load_pyarrow_table(filePath):
    with pa.OSFile(filePath, 'rb') as f:
        reader = pa.RecordBatchFileReader(f)
        return reader.read_all()
Example #20
def arrowfile_to_dataframe(file_format):
    ''' Arrow file format to Pandas DataFrame
    '''
    return pa.RecordBatchFileReader(
        pa.BufferReader(file_format)).read_all().to_pandas()
Example #21
import pyarrow as pa
import pyfletcher as pf
import numpy as np
import timeit
import sys
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("recordbatch_path")
    args = parser.parse_args()

    # Set up a RecordBatch reader and read the RecordBatch.
    reader = pa.RecordBatchFileReader(args.recordbatch_path)
    batch = reader.get_batch(0)

    platform = pf.Platform()  # Create an interface to an auto-detected FPGA Platform.
    platform.init()  # Initialize the Platform.

    context = pf.Context(platform)  # Create a Context for our data on the Platform.
    context.queue_record_batch(batch)  # Queue the RecordBatch to the Context.
    context.enable()  # Enable the Context (potentially transferring the data to the FPGA).

    kernel = pf.Kernel(context)  # Set up an interface to the Kernel, supplying the Context.
    kernel.start()  # Start the kernel.
Example #22
    def loads(self, obj):
        import pyarrow as pa
        reader = pa.RecordBatchFileReader(pa.BufferReader(obj))
        return reader.read_all()
Example #23
def arrow_data_collection(path):
    """Read and return the first record batch from the Arrow file at path."""
    with pa.RecordBatchFileReader(path) as reader:
        batch = reader.get_batch(0)
    return batch