def create_writer(url, schema, compression=None):
        parts = urllib.parse.urlparse(url)

        # S3
        if parts.scheme == 's3':
            bucket = parts.netloc
            key = parts.path[1:]
            buf = pyarrow.BufferOutputStream()
            stream = pyarrow.output_stream(buf, compression=compression)
            writer = pyarrow.RecordBatchStreamWriter(stream, schema)

            try:
                yield writer
            except GeneratorExit:
                writer.close()
                stream.close()
                Driver.s3_client().put_object(Body=buf.getvalue().to_pybytes(),
                                              Bucket=bucket,
                                              Key=key)

        # File System
        elif parts.scheme == 'file':
            path = os.path.join(parts.netloc, parts.path)
            stream = pyarrow.output_stream(path, compression=compression)
            writer = pyarrow.ipc.RecordBatchStreamWriter(stream, schema)

            try:
                yield writer
            except GeneratorExit:
                writer.close()
                stream.close()

        else:
            raise Exception('URL {} not supported'.format(url))
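Because the function above yields its writer, nothing is finalized until the generator itself is closed: the except GeneratorExit blocks only run when .close() is called on (or garbage collection finalizes) the suspended generator. One way it could be driven, sketched for the file:// branch only; the URL, schema and data below are made up for illustration, and Driver is an external helper that is not shown here.

import pyarrow

schema = pyarrow.schema([pyarrow.field('x', pyarrow.int64())])
batch = pyarrow.RecordBatch.from_arrays([pyarrow.array([1, 2, 3])], schema=schema)

gen = create_writer('file:///tmp/example.arrows', schema)  # hypothetical URL
writer = next(gen)        # runs the body up to the yield
writer.write_batch(batch)
gen.close()               # raises GeneratorExit at the yield, closing writer and stream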
Example #2
def test_output_stream_buffer():
    data = b"some test data\n" * 10 + b"eof\n"
    buf = bytearray(len(data))
    stream = pa.output_stream(pa.py_buffer(buf))
    stream.write(data)
    assert buf == data

    buf = bytearray(len(data))
    stream = pa.output_stream(memoryview(buf))
    stream.write(data)
    assert buf == data
Example #4
def generate_schema_files(num_parsers):
    files = []
    for i in range(0, num_parsers):
        file_in = "schemas/in_{:02}.as".format(i)
        file_out = "schemas/out_{:02}.as".format(i)
        schema_in = input_schema(i)
        schema_out = output_schema(i)
        pa.output_stream(file_in).write(schema_in.serialize())
        pa.output_stream(file_out).write(schema_out.serialize())
        files.append(file_in)
        files.append(file_out)

    return files
Example #5
 def __iter__(self):
     stream = pa.output_stream(self.filename)
     for item in self.parent:
         tensor = pa.Tensor.from_numpy(item)
         pa.write_tensor(tensor, stream)
         yield
     stream.close()
Example #6
def test_output_stream_file_path_buffered(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream.buffered'

    def check_data(file_path, data, **kwargs):
        with pa.output_stream(file_path, **kwargs) as stream:
            if kwargs.get('buffer_size', 0) > 0:
                assert isinstance(stream, pa.BufferedOutputStream)
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    unbuffered_stream = pa.output_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        assert check_data(file_path, data, buffer_size=-128) == data

    assert check_data(file_path, data, buffer_size=32) == data
    assert check_data(file_path, data, buffer_size=1024) == data
    assert check_data(str(file_path), data, buffer_size=32) == data

    result = check_data(pathlib.Path(str(file_path)), data, buffer_size=32)
    assert result == data
Example #7
 def check_data(file_path, data, **kwargs):
     with pa.output_stream(file_path, **kwargs) as stream:
         if kwargs.get('buffer_size', 0) > 0:
             assert isinstance(stream, pa.BufferedOutputStream)
         stream.write(data)
     with open(str(file_path), 'rb') as f:
         return f.read()
Example #8
 def check_data(file_path, data, **kwargs):
     stream = pa.output_stream(file_path, **kwargs)
     stream.write(data)
     del stream
     gc.collect()
     with open(str(file_path), 'rb') as f:
         return f.read()
Example #9
 def check_data(data, **kwargs):
     # XXX cannot use BytesIO because stream.close() is necessary
     # to finish writing compressed data, but it will also close the
     # underlying BytesIO
     fn = str(tmpdir / 'output_stream_file')
     with open(fn, 'wb') as f:
         with pa.output_stream(f, **kwargs) as stream:
             stream.write(data)
     with open(fn, 'rb') as f:
         return f.read()
Example #11
    def time_buffered_writes(self, latency):
        test_data = b'x' * self.increment
        bytes_written = 0
        out = pa.BufferOutputStream()
        slow_out = HighLatencyWriter(out, latency)
        buffered_out = pa.output_stream(slow_out, buffer_size=self.buffer_size)

        while bytes_written < self.total_size:
            buffered_out.write(test_data)
            bytes_written += self.increment
        buffered_out.flush()
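HighLatencyWriter, self.increment, self.total_size and self.buffer_size belong to the surrounding benchmark class and are not shown here. A wrapper of that kind only needs the file-like methods pa.output_stream() duck-types against (compare the DuckWriter in Example #17); the class below is a hypothetical stand-in that sleeps before forwarding each write.

import time

class HighLatencyWriter:
    """Hypothetical slow sink: adds a fixed delay in front of every write."""

    def __init__(self, raw, latency):
        self.raw = raw          # underlying writable stream, e.g. pa.BufferOutputStream()
        self.latency = latency  # delay per write, in seconds

    @property
    def closed(self):
        return self.raw.closed

    def write(self, data):
        time.sleep(self.latency)
        self.raw.write(data)

    def flush(self):
        self.raw.flush()

    def close(self):
        self.raw.close()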
Example #12
def test_tensor_ipc_read_from_compressed(tempdir):
    # ARROW-5910
    data = np.random.randn(10, 4)
    tensor = pa.Tensor.from_numpy(data)

    path = tempdir / 'tensor-compressed-file'

    out_stream = pa.output_stream(path, compression='gzip')
    pa.ipc.write_tensor(tensor, out_stream)
    out_stream.close()

    result = pa.ipc.read_tensor(pa.input_stream(path, compression='gzip'))
    assert result.equals(tensor)
Example #13
def test_message_read_from_compressed(example_messages):
    # Part of ARROW-5910
    _, messages = example_messages
    for message in messages:
        raw_out = pa.BufferOutputStream()
        with pa.output_stream(raw_out, compression='gzip') as compressed_out:
            message.serialize_to(compressed_out)

        compressed_buf = raw_out.getvalue()

        result = pa.ipc.read_message(
            pa.input_stream(compressed_buf, compression='gzip'))
        assert result.equals(message)
Example #14
 def write_columns(self, name, type, columns):
     path = self.datadir + '/' + self.outputs[name]
     self.write_type(path, type)
     schema = pa.schema(
         [pa.field(name, a.type) for (name, a) in columns.items()])
     t = pa.Table.from_arrays(list(columns.values()), schema=schema)
     with pa.output_stream(path + '/data.arrow') as sink:
         writer = pa.RecordBatchFileWriter(sink, t.schema)
         batches = t.to_batches(max_chunksize=len(t))
         if batches:
             assert len(batches) == 1
             writer.write_batch(batches[0])
         writer.close()
     with open(path + '/_SUCCESS', 'w'):
         pass
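The Arrow IPC file written above can be read back with the IPC file reader; a small sketch, assuming the same path variable as in write_columns:

import pyarrow as pa

# Memory-map the file and load the single batch back into a Table.
with pa.memory_map(path + '/data.arrow') as source:
    table = pa.ipc.open_file(source).read_all()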
Example #15
def test_ipc_format(tempdir):
    table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
                      'b': pa.array([.1, .2, .3], type="float64")})

    path = str(tempdir / 'test.arrow')
    with pa.output_stream(path) as sink:
        writer = pa.RecordBatchFileWriter(sink, table.schema)
        writer.write_batch(table.to_batches()[0])
        writer.close()

    dataset = ds.dataset(path, format=ds.IpcFileFormat())
    result = dataset.to_table()
    assert result.equals(table)

    dataset = ds.dataset(path, format="ipc")
    result = dataset.to_table()
    assert result.equals(table)
Example #16
def generate_schema_files(num_parsers):
    files = []
    file_out_hw = "schemas/out_.as"
    pa.output_stream(file_out_hw).write(hw_output_schema.serialize())
    files.append(file_out_hw)
    file_out_sw = "schemas/out_sw.as"
    pa.output_stream(file_out_sw).write(sw_output_schema.serialize())
    for i in range(0, num_parsers):
        file_in = "schemas/in_{:02}.as".format(i)
        schema_in = input_schema(i)
        pa.output_stream(file_in).write(schema_in.serialize())
        files.append(file_in)
    return files
Example #17
def test_output_stream_duck_typing():
    # Accept objects having the right file-like methods...
    class DuckWriter:
        def __init__(self):
            self.buf = pa.BufferOutputStream()

        def close(self):
            pass

        @property
        def closed(self):
            return False

        def write(self, data):
            self.buf.write(data)

    duck_writer = DuckWriter()
    stream = pa.output_stream(duck_writer)
    assert stream.write(b'hello')
    assert duck_writer.buf.getvalue().to_pybytes() == b'hello'
Example #18
def test_output_stream_errors(tmpdir):
    buf = memoryview(bytearray())
    with pytest.raises(ValueError):
        pa.output_stream(buf, compression="foo")

    for arg in [bytearray(), StringIO()]:
        with pytest.raises(TypeError):
            pa.output_stream(arg)

    fn = str(tmpdir / 'new_file')
    with open(fn, 'wb') as f:
        pass
    with open(fn, 'rb') as f:
        with pytest.raises(TypeError, match="writable file expected"):
            pa.output_stream(f)
Example #20
def test_output_stream_file_path_buffered(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream.buffered'

    def check_data(file_path, data, **kwargs):
        with pa.output_stream(file_path, **kwargs) as stream:
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    unbuffered_stream = pa.output_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        assert check_data(file_path, data, buffer_size=-128) == data

    assert check_data(file_path, data, buffer_size=32) == data
    assert check_data(file_path, data, buffer_size=1024) == data
    assert check_data(str(file_path), data, buffer_size=32) == data

    result = check_data(pathlib.Path(str(file_path)), data, buffer_size=32)
    assert result == data
Example #21
 def check_data(file_path, data):
     with pa.output_stream(file_path) as stream:
         stream.write(data)
     with open(str(file_path), 'rb') as f:
         assert f.read() == data
Example #22
 def check_data(file_path, data, **kwargs):
     with pa.output_stream(file_path, **kwargs) as stream:
         stream.write(data)
     with open(str(file_path), 'rb') as f:
         return f.read()
Example #24
 def run(self):
     stream = pa.output_stream(self.filename)
     for item in self.parent:
         buf = pa.serialize(item).to_buffer()
         stream.write(buf)
     stream.close()
Example #25
def memory_and_io_interfaces_example():
	# pyarrow.Buffer.

	data = b"abcdefghijklmnopqrstuvwxyz"

	# Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the data bytes object.
	buf = pa.py_buffer(data)
	# External memory, under the form of a raw pointer and size, can also be referenced using the foreign_buffer() function.
	#buf = pa.foreign_buffer(data)

	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	print("memoryview(buf) = {}.".format(memoryview(buf)))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# Memory pools.

	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = pa.allocate_buffer(1024, resizable=True)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf.resize(2048)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = None
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

	#--------------------
	# Input and output streams.

	buf = memoryview(b"some data")
	stream = pa.input_stream(buf)

	print("stream.read(4) = {}.".format(stream.read(4)))

	import gzip
	with gzip.open("./example.gz", "wb") as f:
		f.write(b"some data\n" * 3)

	stream = pa.input_stream("./example.gz")
	print("stream.read() = {}.".format(stream.read()))

	with pa.output_stream("./example1.dat") as stream:
		stream.write(b"some data")

	f = open("./example1.dat", "rb")
	print("f.read() = {}.".format(f.read()))

	#--------------------
	# On-disk and memory mapped files.

	# Using regular Python.
	with open("./example2.dat", "wb") as f:
		f.write(b"some example data")

	file_obj = pa.OSFile("./example2.dat")
	print("file_obj.read(4) = {}.".format(file_obj.read(4)))

	# Using pyarrow's OSFile class.
	with pa.OSFile("./example3.dat", "wb") as f:
		f.write(b"some example data")

	mmap = pa.memory_map("./example3.dat")
	print("mmap.read(4) = {}.".format(mmap.read(4)))

	mmap.seek(0)
	buf = mmap.read_buffer(4)
	print("buf = {}.".format(buf))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# In-memory reading and writing.

	writer = pa.BufferOutputStream()
	writer.write(b"hello, friends")
	buf = writer.getvalue()
	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	reader = pa.BufferReader(buf)
	reader.seek(7)
	print("reader.read(7) = {}.".format(reader.read(7)))
Example #26
import pyarrow as pa

# Input schema and batch
in_schema = pa.schema([pa.field('number', pa.int64(), nullable=False)]) \
    .with_metadata({b'fletcher_mode': b'read', b'fletcher_name': b'in'})
in_data = [pa.array([1, -3, 3, -7])]
in_batch = pa.RecordBatch.from_arrays(in_data, schema=in_schema)
# Create an Arrow RecordBatchFileWriter.
writer = pa.RecordBatchFileWriter('in.rb', in_schema)
writer.write(in_batch)
writer.close()

# Output schema and batch
out_schema = pa.schema([pa.field('number', pa.int64(), nullable=False)]) \
    .with_metadata({b'fletcher_mode': b'write',
                    b'fletcher_name': b'out'})
pa.output_stream('out.as').write(out_schema.serialize())
Example #27
import pyarrow as pa

input_schema = pa.schema([
    pa.field("input", pa.uint8(), False).with_metadata({b'fletcher_epc': b'8'})
]).with_metadata({
    b'fletcher_mode': b'read',
    b'fletcher_name': b'input'
})

pa.output_stream("in.as").write(input_schema.serialize())

with pa.RecordBatchFileWriter('in.rb', input_schema) as writer:
    writer.write(
        pa.RecordBatch.from_arrays(
            [pa.array(
                [byte for byte in '{"voltage":[1128,1213,1850,429,1770,1683,1483,478,545,1555,867,1495,1398,1380,1753,438]}\n'.encode()], pa.uint8())],
            schema=input_schema)
    )

output_schema = pa.schema([
    pa.field("voltage", pa.list_(
        pa.field("item", pa.uint64(), False).with_metadata(
            {"illex_MIN": "0", "illex_MAX": "2047"})
    ), False).with_metadata(
        {"illex_MIN_LENGTH": "1", "illex_MAX_LENGTH": "16"}
    )
]).with_metadata({
    b'fletcher_mode': b'write',
    b'fletcher_name': b'output'
})
Example #28
import pyarrow as pa
import subprocess
import sys

# Pages schema
pages_title = pa.field('title', pa.utf8(), nullable=False)
pages_text = pa.field('text', pa.binary(),
                      nullable=False).add_metadata({b'fletcher_epc': b'8'})
pages_meta = {b'fletcher_mode': b'read', b'fletcher_name': b'Pages'}
pages_schema = pa.schema([pages_title, pages_text]).add_metadata(pages_meta)

pa.output_stream('pages.as').write(pages_schema.serialize())

# Result schema
result_title = pa.field('title', pa.utf8(), nullable=False)
result_count = pa.field('count', pa.uint32(), nullable=False)
result_meta = {b'fletcher_mode': b'write', b'fletcher_name': b'Result'}
result_schema = pa.schema([result_title,
                           result_count]).add_metadata(result_meta)

pa.output_stream('result.as').write(result_schema.serialize())

# Stats schema
stats_schema = pa.schema([pa.field('stats', pa.uint64(),
                                   nullable=False)]).add_metadata({
                                       b'fletcher_mode':
                                       b'write',
                                       b'fletcher_name':
                                       b'Stats'
                                   })
Example #29
                "illex_MAX":
                "4192"
            }), 13), False),
    pa.field(
        "large_speed_var",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
                "illex_MIN":
                "0",
                "illex_MAX":
                "4192"
            }), 13), False),
    pa.field("accel_decel", pa.uint64(), False).with_metadata({
        "illex_MIN":
        "0",
        "illex_MAX":
        "4192"
    }),
    pa.field("speed_changes", pa.uint64(), False).with_metadata({
        "illex_MIN":
        "0",
        "illex_MAX":
        "4192"
    })
]

schema = pa.schema(schema_fields)
serialized_schema = schema.serialize()
pa.output_stream('tripreport.as').write(serialized_schema)

print("Trip Report schema generated.")
Example #30
data = b'some data'
buf = pa.py_buffer(data)  # a Buffer: this does not allocate any memory
#buf = memoryview(b"some data") # a memoryview object
stream = pa.input_stream(buf)
print(stream.read(4))  # b'some'
# b) if passed a string or file path, it will open the given file on disk for reading, creating an OSFile
import gzip
with gzip.open('example.gz', 'wb') as fout:
    fout.write(b'some data\n' * 3)
stream = pa.input_stream('example.gz')
print(stream.read())  # b'some data\nsome data\nsome data\n'

# 2) Output Streams: output_stream([]):
#   this is the equivalent function for output streams and allows creating a writable NativeFile
#   just like input_stream(), it is able to write to buffers or do on-the-fly compression
with pa.output_stream('example1.dat') as stream:
    stream.write(b'some data')
fin = open('example1.dat', 'rb')
print(fin.read())  # b'some data'
fin.close()

# What is Memory-mapping?
#   it is a way for a process to access the file
#   a process can map a file's contents (or its subset) into its address space
#   this makes it easier to read from and write to the file, by reading and writing in memory
#   the file itself, on disk, is just the same as any other file

# 3) On-Disk and Memory Mapped Files: OSFile([filepath], wb):
#      this allows interacting with data on disk
# a) using standard operating system-level file APIs:
with open('example2.dat', 'wb') as fout:
    fout.write(b'some example data')
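The snippet above mentions on-the-fly compression but never exercises it; a small sketch, mirroring the compression= argument used in Example #12 (the example3.gz filename is made up here):

with pa.output_stream('example3.gz', compression='gzip') as stream:
    stream.write(b'some data\n' * 3)

with pa.input_stream('example3.gz', compression='gzip') as stream:
    print(stream.read())  # b'some data\nsome data\nsome data\n'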
Example #31
 def dump(self, f, buffer_like):
     with pyarrow.output_stream(f) as out_f:
         out_f.write(buffer_like)
Example #32
 def dump(self, f, array):
     with pyarrow.output_stream(f) as out_f:
         # repr() yields str; encode to bytes before writing to the binary stream
         out_f.writelines(repr(item).encode() for item in array)
Example #33
#!/usr/bin/python
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather

data = pd.read_csv("stocks.csv", index_col='date', parse_dates=True)
arrow_data = pa.Table.from_pandas(data)
feather.write_feather(data, "stocks.pyarrow.feather")

with pa.output_stream("stocks.pyarrow.stream") as f:
    batch = pa.record_batch(data)
    writer = pa.ipc.new_stream(f, batch.schema)
    writer.write_batch(batch)

data = pd.read_csv("../../data/ames-house-prices/train.csv")
arrow_data = pa.Table.from_pandas(data)
with pa.output_stream("ames.pyarrow.stream") as f:
    batch = pa.record_batch(data)
    writer = pa.ipc.new_stream(f, batch.schema)
    writer.write_batch(batch)
Example #35
import pyarrow as pa

output_schema = pa.schema([
    pa.field(
        "voltage",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
                "illex_MIN":
                "0",
                "illex_MAX":
                "2047"
            })), False).with_metadata({
                "illex_MIN_LENGTH": "1",
                "illex_MAX_LENGTH": "16"
            })
])

pa.output_stream("battery.as").write(output_schema.serialize())

print("Battery Status schema generated.")
Example #36
import pyarrow as pa

# Create a field that can be interpreted as a "listprim" ArrayReader/Writer
vec_field = pa.field('vec',
                     pa.list_(pa.field('elem', pa.int16(), nullable=False)),
                     nullable=False)

schema_src = pa.schema([vec_field]).with_metadata({
    b'fletcher_mode': b'read',
    b'fletcher_name': b'src'
})
schema_dst = pa.schema([vec_field]).with_metadata({
    b'fletcher_mode': b'write',
    b'fletcher_name': b'dst'
})

data = [pa.array([[1, -3, 3, -7], [4, 5, 10]])]

recordbatch = pa.RecordBatch.from_arrays(data, schema=schema_src)
writer_in = pa.RecordBatchFileWriter('src.rb', schema_src)
writer_in.write(recordbatch)
writer_in.close()

serialized_out_schema = schema_dst.serialize()
pa.output_stream('dst.as').write(serialized_out_schema)
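Many of the examples above persist schemas with schema.serialize(); for completeness, a sketch of reading one back, here the dst.as written just above, using pa.ipc.read_schema():

# Read the serialized schema message back from disk.
with pa.input_stream('dst.as') as source:
    roundtripped = pa.ipc.read_schema(source.read_buffer())

assert roundtripped.equals(schema_dst)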