def test_input_stream_native_file():
    """Check how ``pa.input_stream`` treats an existing ``pa.BufferReader``."""
    payload = b"some test data\n" * 10 + b"eof\n"
    compressed = gzip_compress(payload)

    # With no compression argument the very same reader object comes back.
    native = pa.BufferReader(compressed)
    assert pa.input_stream(native) is native

    # With compression='gzip' reading yields the original, uncompressed bytes.
    native = pa.BufferReader(compressed)
    decoded = pa.input_stream(native, compression='gzip')
    assert decoded.read() == payload
def test_input_stream_buffer():
    """Check ``pa.input_stream`` on a pyarrow buffer and on memoryviews."""
    payload = b"some test data\n" * 10 + b"eof\n"

    # Both buffer flavours must read back the exact bytes they wrap.
    for source in (pa.py_buffer(payload), memoryview(payload)):
        plain = pa.input_stream(source)
        assert plain.read() == payload

    compressed = gzip_compress(payload)

    # No compression argument: the gzip bytes are returned as they are.
    raw = pa.input_stream(memoryview(compressed))
    assert raw.read() == compressed

    # Explicit gzip: the original payload is returned.
    decoded = pa.input_stream(memoryview(compressed), compression='gzip')
    assert decoded.read() == payload
def test_input_stream_file_path(tmpdir):
    """Open the same on-disk file via three different path representations."""
    payload = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'input_stream'
    with open(str(file_path), 'wb') as sink:
        sink.write(payload)

    # The tmpdir path object, a plain string and a pathlib.Path must all work.
    path_forms = (
        file_path,
        str(file_path),
        pathlib.Path(str(file_path)),
    )
    for source in path_forms:
        opened = pa.input_stream(source)
        assert opened.read() == payload
def test_input_stream_file_path_compressed_and_buffered(tmpdir):
    """Read a gzip file from disk through streams opened with a buffer size."""
    payload = b"some test data\n" * 100 + b"eof\n"
    compressed = gzip_compress(payload)
    file_path = tmpdir / 'input_stream_compressed_and_buffered.gz'
    with open(str(file_path), 'wb') as sink:
        sink.write(compressed)

    # Only the first case names the codec; the other two pass no compression
    # argument and are still expected to return the uncompressed payload.
    cases = (
        (file_path, {'buffer_size': 32, 'compression': 'gzip'}),
        (str(file_path), {'buffer_size': 64}),
        (pathlib.Path(str(file_path)), {'buffer_size': 1024}),
    )
    for source, options in cases:
        opened = pa.input_stream(source, **options)
        assert opened.read() == payload
def test_input_stream_file_path_compressed(tmpdir):
    """Check reads of a ``.gz`` file with default, explicit and no compression."""
    payload = b"some test data\n" * 10 + b"eof\n"
    compressed = gzip_compress(payload)
    file_path = tmpdir / 'input_stream.gz'
    with open(str(file_path), 'wb') as sink:
        sink.write(compressed)

    # Default arguments: every path flavour yields the uncompressed payload.
    for source in (file_path, str(file_path), pathlib.Path(str(file_path))):
        opened = pa.input_stream(source)
        assert opened.read() == payload

    # Naming the codec explicitly gives the same result.
    explicit = pa.input_stream(file_path, compression='gzip')
    assert explicit.read() == payload

    # compression=None returns the gzip bytes exactly as stored on disk.
    untouched = pa.input_stream(file_path, compression=None)
    assert untouched.read() == compressed
def test_input_stream_python_file(tmpdir):
    """Check ``pa.input_stream`` on Python file objects (BytesIO and open())."""
    payload = b"some test data\n" * 10 + b"eof\n"

    # Plain in-memory file object.
    memory_file = BytesIO(payload)
    plain = pa.input_stream(memory_file)
    assert plain.read() == payload

    # In-memory file holding gzip bytes: returned unchanged by default ...
    compressed = gzip_compress(payload)
    memory_file = BytesIO(compressed)
    raw = pa.input_stream(memory_file)
    assert raw.read() == compressed

    # ... and decompressed when gzip is requested. The previous read consumed
    # the BytesIO, so it has to be rewound first.
    memory_file.seek(0)
    decoded = pa.input_stream(memory_file, compression='gzip')
    assert decoded.read() == payload

    # A real file opened in binary read mode.
    file_path = tmpdir / 'input_stream'
    with open(str(file_path), 'wb') as sink:
        sink.write(payload)
    with open(str(file_path), 'rb') as handle:
        from_disk = pa.input_stream(handle)
        assert from_disk.read() == payload
def test_input_stream_errors(tmpdir):
    """Check the exceptions ``pa.input_stream`` raises for invalid arguments."""
    # Unknown compression name.
    empty_view = memoryview(b"")
    with pytest.raises(ValueError):
        pa.input_stream(empty_view, compression="foo")

    # Source objects that are rejected with TypeError.
    rejected_sources = [bytearray(), StringIO()]
    for source in rejected_sources:
        with pytest.raises(TypeError):
            pa.input_stream(source)

    # Path that does not exist.
    with pytest.raises(IOError):
        pa.input_stream("non_existent_file")

    # A file object opened for writing only is rejected.
    write_only_path = str(tmpdir / 'new_file')
    with open(write_only_path, 'wb') as handle, \
            pytest.raises(TypeError, match="readable file expected"):
        pa.input_stream(handle)
def test_input_stream_duck_typing():
    """Check that an object with file-like methods is accepted as a source."""

    class DuckReader:
        # Exposes only close(), closed and read().

        def read(self, nbytes=None):
            return b'hello'

        @property
        def closed(self):
            return False

        def close(self):
            pass

    duck = DuckReader()
    wrapped = pa.input_stream(duck)
    assert wrapped.read(5) == b'hello'
def test_input_stream_file_path_buffered(tmpdir):
    """Check the ``buffer_size`` argument of ``pa.input_stream`` on file paths."""
    payload = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'input_stream.buffered'
    with open(str(file_path), 'wb') as sink:
        sink.write(payload)

    # Positive buffer sizes: the content reads back intact for each path form.
    buffered_cases = (
        (file_path, 32),
        (str(file_path), 64),
        (pathlib.Path(str(file_path)), 1024),
    )
    for source, size in buffered_cases:
        buffered = pa.input_stream(source, buffer_size=size)
        assert buffered.read() == payload

    # buffer_size=0 returns a pa.OSFile instance.
    unbuffered_stream = pa.input_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    # Negative sizes are rejected with a specific message ...
    expected_message = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=expected_message):
        pa.input_stream(file_path, buffer_size=-1)

    # ... and non-integer sizes with a TypeError.
    with pytest.raises(TypeError):
        pa.input_stream(file_path, buffer_size='million')
def _in_memory_arrow_table_from_file(filename: str) -> pa.Table:
    """Read every record batch of an Arrow IPC stream file into a table.

    Args:
        filename: Path of the file, passed to ``pa.input_stream``.

    Returns:
        The table produced by ``read_all()`` on the IPC stream reader.
    """
    # The context manager closes the underlying file handle as soon as the
    # batches have been read, instead of leaving it open until the stream
    # object happens to be garbage-collected.
    with pa.input_stream(filename) as in_memory_stream:
        opened_stream = pa.ipc.open_stream(in_memory_stream)
        pa_table = opened_stream.read_all()
    return pa_table
#! /usr/bin/env python3 # NOTE: This script is also embedded in doc/cli/vast-export-arrow.md. # When updating this file, please also update the embedded snippet. # Example usage: # vast -N export arrow '#type ~ /suricata.*/' | ./scripts/print-arrow.py import sys import pyarrow # Open stdin in binary mode. istream = pyarrow.input_stream(sys.stdin.buffer) batch_count = 0 row_count = 0 # An Arrow reader consumes a stream of batches with the same schema. When # reading the result for a query that returns multiple schemas, VAST will use # multiple writers. Hence, we try to open record batch readers until an # exception occurs. try: while True: print("open next reader") reader = pyarrow.ipc.RecordBatchStreamReader(istream) try: while True: batch = reader.read_next_batch() batch_count += 1 row_count += batch.num_rows print(str(batch.schema)) except StopIteration:
def memory_and_io_interfaces_example():
    """Walk through pyarrow's buffer, memory-pool and file/stream interfaces.

    Prints the result of each step. As a side effect it writes the files
    ``./example.gz``, ``./example1.dat``, ``./example2.dat`` and
    ``./example3.dat`` in the current working directory.

    Every handle opened on a file is managed by a ``with`` block so that it
    is closed deterministically rather than left to the garbage collector.
    """
    import gzip

    # pyarrow.Buffer.
    data = b"abcdefghijklmnopqrstuvwxyz"

    # Creating a Buffer in this way does not allocate any memory; it is a
    # zero-copy view on the memory exported from the data bytes object.
    buf = pa.py_buffer(data)
    # External memory, under the form of a raw pointer and size, can also be
    # referenced using pa.foreign_buffer(address, size).

    print("buf = {}.".format(buf))
    print("buf.size = {}.".format(buf.size))
    print("memoryview(buf) = {}.".format(memoryview(buf)))
    print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

    #--------------------
    # Memory pools.

    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))
    buf = pa.allocate_buffer(1024, resizable=True)
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))
    buf.resize(2048)
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))
    # Drop the only reference so the allocation can be released.
    buf = None
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

    #--------------------
    # Input and output streams.

    buf = memoryview(b"some data")
    stream = pa.input_stream(buf)
    print("stream.read(4) = {}.".format(stream.read(4)))

    with gzip.open("./example.gz", "wb") as f:
        f.write(b"some data\n" * 3)

    with pa.input_stream("./example.gz") as stream:
        print("stream.read() = {}.".format(stream.read()))

    with pa.output_stream("./example1.dat") as stream:
        stream.write(b"some data")

    # Read the file back with a plain Python file object.
    with open("./example1.dat", "rb") as f:
        print("f.read() = {}.".format(f.read()))

    #--------------------
    # On-disk and memory mapped files.

    # Write using regular Python, read back through pyarrow's OSFile class.
    with open("./example2.dat", "wb") as f:
        f.write(b"some example data")

    with pa.OSFile("./example2.dat") as file_obj:
        print("file_obj.read(4) = {}.".format(file_obj.read(4)))

    # Write using pyarrow's OSFile class, read back through a memory map.
    with pa.OSFile("./example3.dat", "wb") as f:
        f.write(b"some example data")

    with pa.memory_map("./example3.dat") as mmap:
        print("mmap.read(4) = {}.".format(mmap.read(4)))

        # Rewind so that read_buffer() starts from the beginning again.
        mmap.seek(0)
        buf = mmap.read_buffer(4)
        print("buf = {}.".format(buf))
        print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

    #--------------------
    # In-memory reading and writing.

    writer = pa.BufferOutputStream()
    writer.write(b"hello, friends")

    buf = writer.getvalue()
    print("buf = {}.".format(buf))
    print("buf.size = {}.".format(buf.size))

    reader = pa.BufferReader(buf)
    reader.seek(7)
    print("reader.read(7) = {}.".format(reader.read(7)))
# this allocates a resizable Buffer from the default memory pool (just like malloc and free in C)
buf = pa.allocate_buffer(1024, resizable=True)
print(pa.total_allocated_bytes())  # 1024
buf.resize(2048)
print(pa.total_allocated_bytes())  # 2048
buf = None  # the buffer will be garbage-collected, and all of the memory is freed
print(pa.total_allocated_bytes())  # 0

# High-Level API for instantiating streams
# 1) Input Streams: input_stream([buf]):
# this allows creating a readable NativeFile from various kinds of sources
# a) if passed a Buffer or a memoryview object:
data = b'some data'
buf = pa.py_buffer(data)  # a Buffer: this does not allocate any memory
# alternatively, a memoryview object: buf = memoryview(b"some data")
stream = pa.input_stream(buf)
print(stream.read(4))  # b'some'

# b) if passed a string or file path, it will open the given file on disk for reading, creating a OSFile
import gzip
with gzip.open('example.gz', 'wb') as fout:
    fout.write(b'some data\n' * 3)
# the with-block closes the file handle once the content has been read
with pa.input_stream('example.gz') as stream:
    print(stream.read())  # b'some data\nsome data\nsome data\n'

# 2) Output Streams: output_stream([]):
# this is the equivalent function for output streams and allows creating a writable NativeFile
# just like input_stream(), it is able to write to buffers or do on-the-fly compression
with pa.output_stream('example1.dat') as stream:
    stream.write(b'some data')
# read the file back with plain Python, closing the handle when done
with open('example1.dat', 'rb') as fin:
    print(fin.read())  # b'some data'
def load(self, f):
    """Return the buffer read from *f*.

    If *f* has a callable ``read_buffer`` attribute it is called directly;
    otherwise *f* is handed to ``pyarrow.input_stream`` and the resulting
    stream's ``read_buffer()`` is returned.
    """
    reads_buffers_itself = hasattr(f, "read_buffer") and callable(f.read_buffer)
    if not reads_buffers_itself:
        # Let pyarrow open/wrap the source first.
        return pyarrow.input_stream(f).read_buffer()
    return f.read_buffer()