def load_cache(cache_path) -> "DiskBackedDataset":
    """
    Restores a DiskBackedDataset from an existing cache directory.

    The pickled Fields are read fully into memory, while the Example data
    is opened as a memory mapped arrow file and read through that mapping.

    Parameters
    ----------
    cache_path: str
        Path to the directory holding the cache files (the pickled fields
        file and the arrow table file).

    Returns
    -------
    DiskBackedDataset
        the DiskBackedDataset loaded from the passed cache directory.
    """
    # restore the pickled fields
    # NOTE: unpickling can execute arbitrary code; only load caches from a
    # trusted location.
    fields_location = os.path.join(cache_path, CACHE_FIELDS_FILENAME)
    with open(fields_location, "rb") as pickled_fields:
        fields = pickle.load(pickled_fields)

    # open the arrow file through a memory map and read the table from it
    table_location = os.path.join(cache_path, CACHE_TABLE_FILENAME)
    mmapped_file = pa.memory_map(table_location)
    table_reader = pa.RecordBatchFileReader(mmapped_file)
    table = table_reader.read_all()

    return DiskBackedDataset(table, fields, cache_path, mmapped_file)
def decompress(pybytes: bytes) -> pa.Array:
    """Decode a brotli-compressed Arrow IPC file and return its first column.

    Only the first record batch (index 0) of the decompressed file is read,
    and only column 0 of that batch is returned.
    """
    # BytesIO created from bytes starts at position 0, ready for reading.
    stream = io.BytesIO(brotli.decompress(pybytes))
    first_batch = pa.RecordBatchFileReader(stream).get_batch(0)
    return first_batch.column(0)
def restore_topic(args: argparse.Namespace, archive: str) -> None:
    """Replay the messages stored in an Arrow archive into a Kafka topic.

    The topic name is ``archive`` with its extension stripped, and the rows
    are read from ``<topic>.arrow``. Each row's ``key``, ``value`` and
    ``timestamp`` columns are sent to the broker at
    ``args.kafkahost:args.port``.
    """
    topic, _extension = os.path.splitext(archive)
    log.info(f"Restoring messages to topic {topic}")

    def on_error(excp: Exception) -> None:
        # Attached as errback to every send future.
        log.error(f"ERROR: Failed to send message {excp}")
        sys.exit(1)

    broker = f"{args.kafkahost}:{args.port}"
    producer = kafka.KafkaProducer(bootstrap_servers=[broker], retries=3)

    filesystem = pyarrow.fs.LocalFileSystem()
    with filesystem.open_input_file(f"{topic}.arrow") as archive_file:
        with pyarrow.RecordBatchFileReader(archive_file) as reader:
            table = reader.read_all()

    for row in range(table.num_rows):
        key = table["key"][row].as_py()
        value = table["value"][row].as_py()
        # `.value` is the raw integer stored in the timestamp scalar.
        timestamp = table["timestamp"][row].value
        pending = producer.send(
            topic, key=key, value=value, timestamp_ms=timestamp
        )
        pending.add_errback(on_error)

    # Block until every queued message has been handed to the broker.
    producer.flush()
    log.info(f"Restored {table.num_rows} rows to topic {topic}")
def read(self, schema):
    """Read one Arrow batch from the native reader and return it as a table.

    The native reader returns a memory address. A 32-bit integer read at
    that address gives the payload size, and the payload itself starts four
    bytes later. The payload is copied into a Python ``bytes`` object
    before the native memory is handed back via ``freeArrowBatchMemory``.
    """
    base_address = self.reader.readArrowBatchAddress(schema)

    # Length prefix, then the Arrow file bytes right after it.
    payload_size = ctypes.c_int32.from_address(base_address).value
    payload_type = ctypes.c_byte * payload_size
    payload = bytes(payload_type.from_address(base_address + 4))

    # The copy above is independent of the native buffer, so release it now.
    self.reader.freeArrowBatchMemory(base_address)

    return pa.RecordBatchFileReader(pa.BufferReader(payload)).read_all()
def getArrowSchema(self):
    """Return the Arrow schema obtained from ``CarbonSchemaReader``.

    For ``s3a`` URLs the configuration object is passed to ``readSchema``
    as an extra argument. The serialized result is parsed as an Arrow IPC
    file and the schema of the resulting table is returned.
    """
    needs_conf = self.url_path.scheme == 's3a'
    schema_reader = CarbonSchemaReader()
    if needs_conf:
        serialized = schema_reader.readSchema(
            self.path, True, self.configuration.conf)
    else:
        serialized = schema_reader.readSchema(self.path, True)

    raw = bytes(serialized.tostring())
    table = pa.RecordBatchFileReader(pa.BufferReader(raw)).read_all()
    return table.schema
def arrowfile_to_numpy(file_format, column_index):
    '''
    Convert Arrow file format bytes to numpy data.

    Returns a single ndarray when `column_index` is a string, otherwise a
    list with one ndarray per entry of `column_index`.
    '''
    source = pa.BufferReader(file_format)
    frame = pa.RecordBatchFileReader(source).read_all().to_pandas()

    if not isinstance(column_index, str):
        return [frame[name].values for name in column_index]
    return frame[column_index].values
def print_archive(args: argparse.Namespace) -> None:
    """Print the decoded representation of messages in an archive file.

    Reads the key/value schemas for ``args.topic`` from ``schemas.json``,
    then decodes and prints up to ``args.count`` rows of
    ``<topic>.arrow`` starting at row ``args.offset``. Exits with status 1
    when the arrow file does not exist.
    """

    # Why not 'printer = pprint.pprint if args.pretty_print else print'?
    # Because mypy exits with 'error: Cannot call function of unknown type'
    def printer(obj: typing.Any) -> None:
        if not args.pretty_print:
            print(json.dumps(obj))
            return
        pprint.pprint(obj)

    topic = args.topic

    with open("schemas.json") as schema_file:
        schemas = json.load(schema_file)

    try:
        key_schema = schemas[f"{topic}-key"]["schema"]
        value_schema = schemas[f"{topic}-value"]["schema"]
        if args.print_schemas:
            for schema_text in (key_schema, value_schema):
                printer(json.loads(schema_text))
    except KeyError as e:
        log.error("Failed to locate schema for topic: %s", e)
        raise

    key_reader = avro.io.DatumReader(avro.schema.parse(key_schema))
    value_reader = avro.io.DatumReader(avro.schema.parse(value_schema))

    if not pathlib.Path(f"{topic}.arrow").exists():
        log.error(f"No arrow file for topic {topic}")
        sys.exit(1)

    filesystem = pyarrow.fs.LocalFileSystem()
    with filesystem.open_input_file(f"{topic}.arrow") as archive_file:
        with pyarrow.RecordBatchFileReader(archive_file) as reader:
            table = reader.read_all()

    first_row = args.offset
    stop_row = min(table.num_rows, args.offset + args.count)
    for row in range(first_row, stop_row):
        key = table["key"][row].as_py()
        value = table["value"][row].as_py()
        timestamp = table["timestamp"][row].value

        # Strip the first 5 bytes, as they are added by Confluent Platform
        # and are not part of the actual serialized data
        decoded_key = decode(key_reader, key[5:]) if key else None
        decoded_value = decode(value_reader, value[5:]) if value else None

        printer(
            {
                "key": decoded_key,
                "timestamp": timestamp,
                "value": decoded_value,
            }
        )
def summarize_archive(archive: str) -> None:
    """Print interesting statistics about an archived Kafka topic.

    The topic name is ``archive`` with its extension stripped; its key and
    value schemas are looked up in ``schemas.json``. One comma-separated
    line is printed containing: topic, number of rows, total key bytes,
    total value bytes, and the number of decoded values whose ``op`` field
    is ``c``, ``u`` and ``d`` respectively.

    Rows with an empty or missing key/value are still counted as messages,
    but the missing part is not decoded and a row without a value does not
    contribute to the ``op`` counts (the same guard ``print_archive``
    applies before decoding).

    Raises
    ------
    KeyError
        If ``schemas.json`` has no key or value schema for the topic.
    """
    with open("schemas.json") as fd:
        schemas = json.load(fd)

    topic = os.path.splitext(archive)[0]

    try:
        key_schema = schemas[f"{topic}-key"]["schema"]
        value_schema = schemas[f"{topic}-value"]["schema"]
    except KeyError as e:
        log.error("Failed to locate schema for topic %s", e)
        raise

    key_reader = avro.io.DatumReader(avro.schema.parse(key_schema))
    value_reader = avro.io.DatumReader(avro.schema.parse(value_schema))

    local = pyarrow.fs.LocalFileSystem()
    with local.open_input_file(archive) as f:
        with pyarrow.RecordBatchFileReader(f) as reader:
            table = reader.read_all()

    num_messages = table.num_rows
    key_bytes = 0
    value_bytes = 0
    num_ops: typing.DefaultDict[str, int] = collections.defaultdict(int)

    for i in range(0, table.num_rows):
        key = table["key"][i].as_py()
        value = table["value"][i].as_py()
        timestamp = table["timestamp"][i].value

        if key:
            key_bytes += len(key)
        if value:
            value_bytes += len(value)

        # Strip the first 5 bytes, as they are added by Confluent Platform
        # and are not part of the actual serialized data.
        # A None/empty key or value cannot be sliced or decoded, so it is
        # left as None instead of raising TypeError.
        decoded = {
            "key": decode(key_reader, key[5:]) if key else None,
            "timestamp": timestamp,
            "value": decode(value_reader, value[5:]) if value else None,
        }

        # Without a decoded value there is no operation to count.
        if decoded["value"] is not None:
            op = decoded["value"]["op"]
            num_ops[op] += 1

    print(
        f"{topic},{num_messages},{key_bytes},{value_bytes},{num_ops['c']},{num_ops['u']},{num_ops['d']}"
    )
def test_write_empty_ipc_file():
    # ARROW-3894: an IPC file written without any record batch must still
    # be properly initialized and readable.
    expected_schema = pa.schema([('field', pa.int64())])

    out = pa.BufferOutputStream()
    # Open and immediately close the writer: no batches are written.
    with pa.ipc.new_file(out, expected_schema):
        pass

    serialized = out.getvalue()
    with pa.RecordBatchFileReader(pa.BufferReader(serialized)) as reader:
        result = reader.read_all()

    assert len(result) == 0
    assert result.schema.equals(expected_schema)
def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
    _, _ = file_fixture.write_batches()
    source = file_fixture.get_source()

    # Three ways of opening the same data: raw source, explicit
    # BufferReader, and the reader class constructor.
    readers = [
        pa.open_file(source),
        pa.open_file(pa.BufferReader(source)),
        pa.RecordBatchFileReader(source),
    ]

    reference, *others = [reader.read_all() for reader in readers]
    for other in others:
        assert reference.equals(other)
def loads(self, obj):
    """
    Deserialize an ArrowRecordBatch to an Arrow table and return as a list of
    pandas.Series followed by a dictionary containing length of the loaded
    batches.

    :param obj: buffer-like object holding data in the Arrow file format.
    :return: one pandas.Series per table column, followed by a dict
        ``{"length": <total number of rows over all batches>}``.
    """
    import pyarrow as pa
    reader = pa.RecordBatchFileReader(pa.BufferReader(obj))
    # `range` rather than `xrange`: the latter only exists on Python 2 (or
    # where the module aliases it), while `range` works on both.
    batches = [
        reader.get_batch(i) for i in range(reader.num_record_batches)
    ]
    # NOTE: a 0-parameter pandas_udf will produce an empty batch that can
    # have num_rows set, so the length is taken from the batches themselves.
    num_rows = sum(batch.num_rows for batch in batches)
    table = pa.Table.from_batches(batches)
    return [c.to_pandas() for c in table.itercolumns()] + [{
        "length": num_rows
    }]
def deserialize_pandas(buf, nthreads=1):
    """Turn a buffer holding an Arrow file into a pandas DataFrame.

    Parameters
    ----------
    buf : buffer
        An object compatible with the buffer protocol
    nthreads : int, optional
        Forwarded to ``Table.to_pandas`` as the number of threads to use
        for the conversion.

    Returns
    -------
    df : pandas.DataFrame
    """
    table = pa.RecordBatchFileReader(pa.BufferReader(buf)).read_all()
    return table.to_pandas(nthreads=nthreads)
def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
    file_fixture.write_batches()
    source = file_fixture.get_source()

    # Three ways of opening the same data: raw source, explicit
    # BufferReader, and the reader class constructor.
    readers = [
        pa.ipc.open_file(source),
        pa.ipc.open_file(pa.BufferReader(source)),
        pa.RecordBatchFileReader(source),
    ]

    reference, *others = [reader.read_all() for reader in readers]
    for other in others:
        assert reference.equals(other)

    # Read statistics must agree across all three readers.
    reference_stats = readers[0].stats
    assert reference_stats.num_messages == 6
    assert reference_stats.num_record_batches == 5
    for reader in readers[1:]:
        assert reader.stats == reference_stats
def from_examples(
    fields: Union[Dict[str, Field], List[Field], Tuple[Field]],
    examples: Iterable[Example],
    cache_path: Optional[str] = None,
    data_types: Dict[str, Tuple[pa.DataType, pa.DataType]] = None,
    chunk_size=1024,
) -> "DiskBackedDataset":
    """
    Builds a DiskBackedDataset by writing the passed Examples to an on-disk
    arrow file and memory mapping the result.

    Parameters
    ----------
    fields: Union[Dict[str, Field], List[Field]]
        Dict or List of Fields used to create the Examples.

    examples: Iterable[Example]
        Iterable of examples.

    cache_path: Optional[str]
        Path to the directory where the cache file will be saved.
        The whole directory will be used as the cache and will be deleted
        when `delete_cache` is called. It is recommended to create a new
        directory to use exclusively as the cache, or to leave this as None.

        If None, a temporary directory will be created.

    data_types: Dict[str, Tuple[pyarrow.DataType, pyarrow.DataType]]
        Dictionary mapping field names to pyarrow data types. This is
        required when a field can have missing data and the data type can't
        be inferred. The data type tuple has two values, corresponding to
        the raw and tokenized data types in an example. None can be used as
        a wildcard data type and will be overridden by an inferred data type
        if possible.

    chunk_size: int
        Maximum number of examples to be loaded before dumping to the
        on-disk cache file. Use lower number if memory usage is an issue
        while loading.

    Returns
    -------
    DiskBackedDataset
        DiskBackedDataset instance created from the passed Examples.
    """
    fields = unpack_fields(fields)

    if cache_path is None:
        cache_path = tempfile.mkdtemp(prefix=TEMP_CACHE_FILENAME_PREFIX)

    # location of the dataset table inside the cache directory
    # TODO handle the case when a cache is already present
    table_path = os.path.join(cache_path, CACHE_TABLE_FILENAME)

    chunks = _chunkify(examples, chunk_size)

    # the first chunk is converted up front because its schema is the one
    # used for the whole file
    # NOTE(review): presumably raises StopIteration when `examples` is
    # empty - confirm against `_chunkify`
    head_batch = DiskBackedDataset._examples_to_recordbatch(
        next(chunks), fields, data_types
    )
    schema = head_batch.schema
    inferred_data_types = DiskBackedDataset._schema_to_data_types(schema)

    # check for missing data types in inferred schema
    DiskBackedDataset._check_for_missing_data_types(fields, inferred_data_types)

    # write cache file to disk, one record batch per chunk
    with pa.OSFile(table_path, "wb") as sink, pa.RecordBatchFileWriter(
        sink, schema=schema
    ) as writer:
        writer.write(head_batch)
        for chunk in chunks:
            # remaining chunks reuse the data types inferred from the first
            writer.write(
                DiskBackedDataset._examples_to_recordbatch(
                    chunk, fields, inferred_data_types
                )
            )

    # reopen the written file as a memory mapped table
    mmapped_file = pa.memory_map(table_path)
    table = pa.RecordBatchFileReader(mmapped_file).read_all()

    return DiskBackedDataset(
        table, fields, cache_path, mmapped_file, inferred_data_types
    )
def read_table(filename):
    """Read the Arrow file named by `filename` into a pyarrow table.

    `filename` must expose `.numpy()` returning UTF-8 encoded bytes
    (presumably an eager string tensor - confirm against the caller).
    """
    path = filename.numpy().decode("utf-8")
    return pa.RecordBatchFileReader(path).read_all()
#!/usr/bin/env python
import pyarrow as pa

# Print the first record batch of /tmp/filtered.arrow as a pandas DataFrame.
with pa.OSFile("/tmp/filtered.arrow") as source:
    reader = pa.RecordBatchFileReader(source)
    first_batch = reader.get_record_batch(0)
    print(first_batch.to_pandas())
def from_file(_):
    """Load the Arrow file `f.name` and return `column` as a tensor.

    The positional argument is ignored; `f`, `spec` and `column` are taken
    from the enclosing scope.
    """
    table = pa.RecordBatchFileReader(f.name).read_all()
    tensor_io = IOTensor.from_arrow(table, spec=spec)
    selected = tensor_io(column)
    return selected.to_tensor()
# Stats schema
stats_schema = pa.schema([pa.field('stats', pa.uint64(), nullable=False)]).add_metadata({
    b'fletcher_mode': b'write',
    b'fletcher_name': b'Stats'
})
# Write the serialized schema through a context manager so the stream is
# closed (and therefore flushed) deterministically before fletchgen runs,
# instead of relying on garbage collection of an unnamed stream object.
with pa.output_stream('stats.as') as stats_file:
    stats_file.write(stats_schema.serialize())

# If a recordbatch is provided as test case input, trim it and pass it to
# fletchgen instead of the schema.
if len(sys.argv) > 1:
    # First argument: path to an Arrow file holding the input recordbatch.
    with open(sys.argv[1], 'rb') as fil:
        tab = pa.RecordBatchFileReader(fil).read_all()
    # Optional second argument: number of leading rows to keep.
    if len(sys.argv) > 2:
        tab = tab.slice(0, int(sys.argv[2]))
    with open('pages.rb', 'wb') as fil:
        with pa.RecordBatchFileWriter(fil, pages_schema) as writer:
            writer.write_table(tab)
    pages_args = ['-r', 'pages.rb', '-s', 'vhdl/memory.srec']
else:
    pages_args = ['-i', 'pages.as']

# Run fletchgen
# NOTE(review): the exit status of fletchgen is not checked here.
subprocess.run(
    ['fletchgen'] + pages_args +
    ['-i', 'result.as', '-i', 'stats.as', '-n', 'WordMatch', '--sim', '--axi'])
def load_pyarrow_table(filePath):
    """Read the Arrow file at `filePath` and return its contents as a table."""
    # Read everything while the file is still open.
    with pa.OSFile(filePath, 'rb') as source:
        table = pa.RecordBatchFileReader(source).read_all()
        return table
def arrowfile_to_dataframe(file_format):
    '''
    Convert Arrow file format bytes to a Pandas DataFrame
    '''
    source = pa.BufferReader(file_format)
    table = pa.RecordBatchFileReader(source).read_all()
    return table.to_pandas()
# limitations under the License. import pyarrow as pa import pyfletcher as pf import numpy as np import timeit import sys import argparse if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("recordbatch_path") args = parser.parse_args() # Set up a RecordBatch reader and read the RecordBatch. reader = pa.RecordBatchFileReader(args.recordbatch_path) batch = reader.get_batch(0) platform = pf.Platform( ) # Create an interface to an auto-detected FPGA Platform. platform.init() # Initialize the Platform. context = pf.Context( platform) # Create a Context for our data on the Platform. context.queue_record_batch(batch) # Queue the RecordBatch to the Context. context.enable( ) # Enable the Context, (potentially transferring the data to FPGA). kernel = pf.Kernel( context) # Set up an interface to the Kernel, supplying the Context. kernel.start() # Start the kernel.
def loads(self, obj):
    """Deserialize Arrow file format data in `obj` into a pyarrow table."""
    import pyarrow as pa
    source = pa.BufferReader(obj)
    table = pa.RecordBatchFileReader(source).read_all()
    return table
def arrow_data_collection(path):
    """Return the first record batch of the Arrow file at `path`."""
    with pa.RecordBatchFileReader(path) as reader:
        return reader.get_batch(0)