Example #1
File: test_io.py Project: rok/arrow
def test_input_stream_native_file():
    data = b"some test data\n" * 10 + b"eof\n"
    gz_data = gzip_compress(data)
    reader = pa.BufferReader(gz_data)
    stream = pa.input_stream(reader)
    assert stream is reader
    reader = pa.BufferReader(gz_data)
    stream = pa.input_stream(reader, compression='gzip')
    assert stream.read() == data
Example #2
File: test_io.py Project: rok/arrow
def test_input_stream_buffer():
    data = b"some test data\n" * 10 + b"eof\n"
    for arg in [pa.py_buffer(data), memoryview(data)]:
        stream = pa.input_stream(arg)
        assert stream.read() == data

    gz_data = gzip_compress(data)
    stream = pa.input_stream(memoryview(gz_data))
    assert stream.read() == gz_data
    stream = pa.input_stream(memoryview(gz_data), compression='gzip')
    assert stream.read() == data
Example #3
File: test_io.py Project: rok/arrow
def test_input_stream_file_path(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'input_stream'
    with open(str(file_path), 'wb') as f:
        f.write(data)

    stream = pa.input_stream(file_path)
    assert stream.read() == data
    stream = pa.input_stream(str(file_path))
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)))
    assert stream.read() == data
Example #4
File: test_io.py Project: rok/arrow
def test_input_stream_file_path_compressed_and_buffered(tmpdir):
    data = b"some test data\n" * 100 + b"eof\n"
    gz_data = gzip_compress(data)
    file_path = tmpdir / 'input_stream_compressed_and_buffered.gz'
    with open(str(file_path), 'wb') as f:
        f.write(gz_data)

    stream = pa.input_stream(file_path, buffer_size=32, compression='gzip')
    assert stream.read() == data
    stream = pa.input_stream(str(file_path), buffer_size=64)
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024)
    assert stream.read() == data
Example #5
File: test_io.py Project: rok/arrow
def test_input_stream_file_path_compressed(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    gz_data = gzip_compress(data)
    file_path = tmpdir / 'input_stream.gz'
    with open(str(file_path), 'wb') as f:
        f.write(gz_data)

    stream = pa.input_stream(file_path)
    assert stream.read() == data
    stream = pa.input_stream(str(file_path))
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)))
    assert stream.read() == data

    stream = pa.input_stream(file_path, compression='gzip')
    assert stream.read() == data
    stream = pa.input_stream(file_path, compression=None)
    assert stream.read() == gz_data
Example #6
File: test_io.py Project: rok/arrow
def test_input_stream_python_file(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    bio = BytesIO(data)

    stream = pa.input_stream(bio)
    assert stream.read() == data

    gz_data = gzip_compress(data)
    bio = BytesIO(gz_data)
    stream = pa.input_stream(bio)
    assert stream.read() == gz_data
    bio.seek(0)
    stream = pa.input_stream(bio, compression='gzip')
    assert stream.read() == data

    file_path = tmpdir / 'input_stream'
    with open(str(file_path), 'wb') as f:
        f.write(data)
    with open(str(file_path), 'rb') as f:
        stream = pa.input_stream(f)
        assert stream.read() == data
Example #7
File: test_io.py Project: rok/arrow
def test_input_stream_errors(tmpdir):
    buf = memoryview(b"")
    with pytest.raises(ValueError):
        pa.input_stream(buf, compression="foo")

    for arg in [bytearray(), StringIO()]:
        with pytest.raises(TypeError):
            pa.input_stream(arg)

    with pytest.raises(IOError):
        pa.input_stream("non_existent_file")

    with open(str(tmpdir / 'new_file'), 'wb') as f:
        with pytest.raises(TypeError, match="readable file expected"):
            pa.input_stream(f)
Example #8
def test_input_stream_duck_typing():
    # Accept objects having the right file-like methods...
    class DuckReader:
        def close(self):
            pass

        @property
        def closed(self):
            return False

        def read(self, nbytes=None):
            return b'hello'

    stream = pa.input_stream(DuckReader())
    assert stream.read(5) == b'hello'
Example #9
File: test_io.py Project: rok/arrow
def test_input_stream_file_path_buffered(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'input_stream.buffered'
    with open(str(file_path), 'wb') as f:
        f.write(data)

    stream = pa.input_stream(file_path, buffer_size=32)
    assert stream.read() == data
    stream = pa.input_stream(str(file_path), buffer_size=64)
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024)
    assert stream.read() == data

    unbuffered_stream = pa.input_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        pa.input_stream(file_path, buffer_size=-1)
    with pytest.raises(TypeError):
        pa.input_stream(file_path, buffer_size='million')
Example #10
def _in_memory_arrow_table_from_file(filename: str) -> pa.Table:
    in_memory_stream = pa.input_stream(filename)
    opened_stream = pa.ipc.open_stream(in_memory_stream)
    pa_table = opened_stream.read_all()
    return pa_table
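The helper above expects the file to contain a single Arrow IPC stream. A minimal, hedged sketch of producing such a file and reading it back (the path "table.arrows" is purely illustrative):

import pyarrow as pa

table = pa.table({"x": [1, 2, 3]})
with pa.output_stream("table.arrows") as sink:
    # Write one IPC stream containing the whole table.
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)

assert _in_memory_arrow_table_from_file("table.arrows").equals(table)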
Example #11
#! /usr/bin/env python3

# NOTE: This script is also embedded in doc/cli/vast-export-arrow.md.
# When updating this file, please also update the embedded snippet.

# Example usage:
# vast -N export arrow '#type ~ /suricata.*/' | ./scripts/print-arrow.py

import sys
import pyarrow

# Open stdin in binary mode.
istream = pyarrow.input_stream(sys.stdin.buffer)
batch_count = 0
row_count = 0

# An Arrow reader consumes a stream of batches with the same schema. When
# reading the result for a query that returns multiple schemas, VAST will use
# multiple writers. Hence, we try to open record batch readers until an
# exception occurs.
try:
    while True:
        print("open next reader")
        reader = pyarrow.ipc.RecordBatchStreamReader(istream)
        try:
            while True:
                batch = reader.read_next_batch()
                batch_count += 1
                row_count += batch.num_rows
                print(str(batch.schema))
        except StopIteration:
            # This reader is exhausted; loop around and open the next one.
            pass
except pyarrow.lib.ArrowInvalid:
    # No further reader could be opened: the input stream is fully consumed.
    pass

print(f"read {batch_count} batch(es) totaling {row_count} row(s)")
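A hedged sketch of producing input this script can consume: two back-to-back IPC streams with different schemas written to one file (the file name is purely illustrative), mirroring what the comment above describes VAST doing with multiple writers:

import pyarrow as pa

with pa.output_stream("two_streams.arrows") as sink:
    for tbl in (pa.table({"a": [1, 2]}), pa.table({"b": ["x", "y"]})):
        # Each new_stream() writes its own schema and end-of-stream marker.
        with pa.ipc.new_stream(sink, tbl.schema) as writer:
            writer.write_table(tbl)

# ./print-arrow.py < two_streams.arrows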
Example #12
def memory_and_io_interfaces_example():
	# pyarrow.Buffer.

	data = b"abcdefghijklmnopqrstuvwxyz"

	# Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the data bytes object.
	buf = pa.py_buffer(data)
	# External memory, in the form of a raw pointer and size, can also be
	# referenced with foreign_buffer(address, size, base); note that it takes
	# a raw address rather than a bytes object (see the sketch at the end of
	# this example).

	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	print("memoryview(buf) = {}.".format(memoryview(buf)))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# Memory pools.

	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = pa.allocate_buffer(1024, resizable=True)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf.resize(2048)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = None
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

	#--------------------
	# Input and output streams.

	buf = memoryview(b"some data")
	stream = pa.input_stream(buf)

	print("stream.read(4) = {}.".format(stream.read(4)))

	import gzip
	with gzip.open("./example.gz", "wb") as f:
		f.write(b"some data\n" * 3)

	stream = pa.input_stream("./example.gz")
	print("stream.read() = {}.".format(stream.read()))

	with pa.output_stream("./example1.dat") as stream:
		stream.write(b"some data")

	f = open("./example1.dat", "rb")
	print("f.read() = {}.".format(f.read()))

	#--------------------
	# On-disk and memory mapped files.

	# Write the file with regular Python, then read it back with pyarrow's OSFile.
	with open("./example2.dat", "wb") as f:
		f.write(b"some example data")

	file_obj = pa.OSFile("./example2.dat")
	print("file_obj.read(4) = {}.".format(file_obj.read(4)))

	# Write the file with pyarrow's OSFile class, then memory-map it.
	with pa.OSFile("./example3.dat", "wb") as f:
		f.write(b"some example data")

	mmap = pa.memory_map("./example3.dat")
	print("mmap.read(4) = {}.".format(mmap.read(4)))

	mmap.seek(0)
	buf = mmap.read_buffer(4)
	print("buf = {}.".format(buf))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# In-memory reading and writing.

	writer = pa.BufferOutputStream()
	writer.write(b"hello, friends")
	buf = writer.getvalue()
	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	reader = pa.BufferReader(buf)
	reader.seek(7)
	print("reader.read(7) = {}.".format(reader.read(7)))
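A minimal sketch of the foreign_buffer() call mentioned above: it wraps memory given as a raw address and size; passing the owner as base keeps that memory alive for the lifetime of the Buffer:

import ctypes
import pyarrow as pa

raw = ctypes.create_string_buffer(b"abcdef")
# Reference the ctypes buffer's memory without copying it.
buf = pa.foreign_buffer(ctypes.addressof(raw), 6, base=raw)
print(buf.to_pybytes())  # b'abcdef'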
Example #13
#   this allocates a resizable Buffer from the default memory pool (just like malloc and free in C)
buf = pa.allocate_buffer(1024, resizable=True)
print(pa.total_allocated_bytes())  # 1024
buf.resize(2048)
print(pa.total_allocated_bytes())  # 2048
buf = None  # the buffer will be garbage-collected, and all of its memory is freed
print(pa.total_allocated_bytes())  # 0

# High-level API for instantiating streams
# 1) Input streams: input_stream(source):
#   this allows creating a readable NativeFile from various kinds of sources
# a) if passed a Buffer or a memoryview object:
data = b'some data'
buf = pa.py_buffer(data)  # a Buffer: this does not allocate any memory
#buf = memoryview(b"some data") # a memoryview object
stream = pa.input_stream(buf)
print(stream.read(4))  # b'some'
# b) if passed a string or file path, it will open the given file on disk for reading, creating an OSFile
import gzip
with gzip.open('example.gz', 'wb') as fout:
    fout.write(b'some data\n' * 3)
stream = pa.input_stream('example.gz')
print(stream.read())  # b'some data\nsome data\nsome data\n'

# 2) Output streams: output_stream(sink):
#   this is the equivalent function for output streams and creates a writable NativeFile
#   just like input_stream(), it can write to buffers or do on-the-fly compression (see the sketch below)
with pa.output_stream('example1.dat') as stream:
    stream.write(b'some data')
fin = open('example1.dat', 'rb')
print(fin.read())  # b'some data'
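As mentioned above, output_stream() can also compress on the fly. A hedged sketch (the file name 'example2.gz' is illustrative; for .gz paths the compression is also inferred from the suffix):

with pa.output_stream('example2.gz', compression='gzip') as stream:
    stream.write(b'some data\n' * 3)
# input_stream() detects the .gz suffix and decompresses transparently.
print(pa.input_stream('example2.gz').read())  # b'some data\nsome data\nsome data\n'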
Example #14
def load(self, f):
    # If f already exposes read_buffer() (e.g. a pyarrow NativeFile), use it
    # directly; otherwise wrap it with input_stream() first.
    if hasattr(f, "read_buffer") and callable(f.read_buffer):
        return f.read_buffer()
    else:
        return pyarrow.input_stream(f).read_buffer()
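A hedged usage sketch for load() above: pyarrow NativeFile objects take the fast path via read_buffer(), while anything else input_stream() accepts is wrapped first. The Loader class is a hypothetical container for the method:

import io
import pyarrow

loader = Loader()  # hypothetical class holding the load() method above
buf = loader.load(pyarrow.BufferReader(b"payload"))  # has read_buffer(): used directly
buf2 = loader.load(io.BytesIO(b"payload"))           # wrapped with input_stream() first
assert buf.to_pybytes() == buf2.to_pybytes() == b"payload"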