import os
import urllib.parse

import pyarrow


def create_writer(url, schema, compression=None):
    """Yield a RecordBatchStreamWriter for the destination named by ``url``.

    Cleanup runs when the consumer closes the generator (GeneratorExit).
    ``Driver`` is an external helper providing the S3 client.
    """
    parts = urllib.parse.urlparse(url)
    # S3
    if parts.scheme == 's3':
        bucket = parts.netloc
        key = parts.path[1:]
        buf = pyarrow.BufferOutputStream()
        stream = pyarrow.output_stream(buf, compression=compression)
        writer = pyarrow.ipc.RecordBatchStreamWriter(stream, schema)
        try:
            yield writer
        except GeneratorExit:
            writer.close()
            stream.close()
            Driver.s3_client().put_object(Body=buf.getvalue().to_pybytes(),
                                          Bucket=bucket, Key=key)
    # File system
    elif parts.scheme == 'file':
        path = os.path.join(parts.netloc, parts.path)
        stream = pyarrow.output_stream(path, compression=compression)
        writer = pyarrow.ipc.RecordBatchStreamWriter(stream, schema)
        try:
            yield writer
        except GeneratorExit:
            writer.close()
            stream.close()
    else:
        raise ValueError('URL {} not supported'.format(url))
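A minimal usage sketch for the generator above (the path, schema, and batch here are illustrative, not from the original source):

schema = pyarrow.schema([('x', pyarrow.int64())])
batch = pyarrow.record_batch([pyarrow.array([1, 2, 3])], schema=schema)

gen = create_writer('file:///tmp/records.arrow', schema)
writer = next(gen)         # advance to the yield to obtain the writer
writer.write_batch(batch)
gen.close()                # raises GeneratorExit inside create_writer, which closes writer and stream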
def test_output_stream_buffer():
    data = b"some test data\n" * 10 + b"eof\n"

    buf = bytearray(len(data))
    stream = pa.output_stream(pa.py_buffer(buf))
    stream.write(data)
    assert buf == data

    buf = bytearray(len(data))
    stream = pa.output_stream(memoryview(buf))
    stream.write(data)
    assert buf == data
def generate_schema_files(num_parsers):
    files = []
    for i in range(0, num_parsers):
        file_in = "schemas/in_{:02}.as".format(i)
        file_out = "schemas/out_{:02}.as".format(i)
        schema_in = input_schema(i)
        schema_out = output_schema(i)
        pa.output_stream(file_in).write(schema_in.serialize())
        pa.output_stream(file_out).write(schema_out.serialize())
        files.append(file_in)
        files.append(file_out)
    return files
def __iter__(self):
    stream = pa.output_stream(self.filename)
    for item in self.parent:
        tensor = pa.Tensor.from_numpy(item)
        pa.write_tensor(tensor, stream)
        yield item
    stream.close()
def test_output_stream_file_path_buffered(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream.buffered'

    def check_data(file_path, data, **kwargs):
        with pa.output_stream(file_path, **kwargs) as stream:
            if kwargs.get('buffer_size', 0) > 0:
                assert isinstance(stream, pa.BufferedOutputStream)
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    unbuffered_stream = pa.output_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        assert check_data(file_path, data, buffer_size=-128) == data

    assert check_data(file_path, data, buffer_size=32) == data
    assert check_data(file_path, data, buffer_size=1024) == data
    assert check_data(str(file_path), data, buffer_size=32) == data

    result = check_data(pathlib.Path(str(file_path)), data, buffer_size=32)
    assert result == data
def check_data(file_path, data, **kwargs):
    with pa.output_stream(file_path, **kwargs) as stream:
        if kwargs.get('buffer_size', 0) > 0:
            assert isinstance(stream, pa.BufferedOutputStream)
        stream.write(data)
    with open(str(file_path), 'rb') as f:
        return f.read()
def check_data(file_path, data, **kwargs):
    stream = pa.output_stream(file_path, **kwargs)
    stream.write(data)
    del stream
    gc.collect()
    with open(str(file_path), 'rb') as f:
        return f.read()
def check_data(data, **kwargs):
    # XXX cannot use BytesIO because stream.close() is necessary
    # to finish writing compressed data, but it will also close the
    # underlying BytesIO
    fn = str(tmpdir / 'output_stream_file')
    with open(fn, 'wb') as f:
        with pa.output_stream(f, **kwargs) as stream:
            stream.write(data)
    with open(fn, 'rb') as f:
        return f.read()
def time_buffered_writes(self, latency):
    test_data = b'x' * self.increment
    bytes_written = 0
    out = pa.BufferOutputStream()
    slow_out = HighLatencyWriter(out, latency)
    buffered_out = pa.output_stream(slow_out, buffer_size=self.buffer_size)
    while bytes_written < self.total_size:
        buffered_out.write(test_data)
        bytes_written += self.increment
    buffered_out.flush()
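HighLatencyWriter is not defined in this snippet; a minimal stand-in, assuming it only delays each write to simulate a slow sink, might look like:

import time

class HighLatencyWriter:
    # Hypothetical wrapper: sleeps before delegating each write to the raw sink.
    def __init__(self, raw, latency):
        self.raw = raw
        self.latency = latency

    @property
    def closed(self):
        return self.raw.closed

    def write(self, data):
        time.sleep(self.latency)
        self.raw.write(data)

    def flush(self):
        self.raw.flush()

    def close(self):
        self.raw.close()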
def test_tensor_ipc_read_from_compressed(tempdir):
    # ARROW-5910
    data = np.random.randn(10, 4)
    tensor = pa.Tensor.from_numpy(data)

    path = tempdir / 'tensor-compressed-file'

    out_stream = pa.output_stream(path, compression='gzip')
    pa.ipc.write_tensor(tensor, out_stream)
    out_stream.close()

    result = pa.ipc.read_tensor(pa.input_stream(path, compression='gzip'))
    assert result.equals(tensor)
def test_message_read_from_compressed(example_messages):
    # Part of ARROW-5910
    _, messages = example_messages
    for message in messages:
        raw_out = pa.BufferOutputStream()
        with pa.output_stream(raw_out, compression='gzip') as compressed_out:
            message.serialize_to(compressed_out)

        compressed_buf = raw_out.getvalue()

        result = pa.ipc.read_message(
            pa.input_stream(compressed_buf, compression='gzip'))
        assert result.equals(message)
def write_columns(self, name, type, columns):
    path = self.datadir + '/' + self.outputs[name]
    self.write_type(path, type)
    schema = pa.schema(
        [pa.field(name, a.type) for (name, a) in columns.items()])
    t = pa.Table.from_arrays(list(columns.values()), schema=schema)
    with pa.output_stream(path + '/data.arrow') as sink:
        writer = pa.RecordBatchFileWriter(sink, t.schema)
        batches = t.to_batches(max_chunksize=len(t))
        if batches:
            assert len(batches) == 1
            writer.write_batch(batches[0])
        writer.close()
    with open(path + '/_SUCCESS', 'w'):
        pass
def test_ipc_format(tempdir):
    table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
                      'b': pa.array([.1, .2, .3], type="float64")})

    path = str(tempdir / 'test.arrow')
    with pa.output_stream(path) as sink:
        writer = pa.RecordBatchFileWriter(sink, table.schema)
        writer.write_batch(table.to_batches()[0])
        writer.close()

    dataset = ds.dataset(path, format=ds.IpcFileFormat())
    result = dataset.to_table()
    assert result.equals(table)

    dataset = ds.dataset(path, format="ipc")
    result = dataset.to_table()
    assert result.equals(table)
def generate_schema_files(num_parsers):
    files = []

    file_out_hw = "schemas/out_.as"
    pa.output_stream(file_out_hw).write(hw_output_schema.serialize())
    files.append(file_out_hw)

    file_out_sw = "schemas/out_sw.as"
    pa.output_stream(file_out_sw).write(sw_output_schema.serialize())
    files.append(file_out_sw)

    for i in range(0, num_parsers):
        file_in = "schemas/in_{:02}.as".format(i)
        schema_in = input_schema(i)
        pa.output_stream(file_in).write(schema_in.serialize())
        files.append(file_in)
    return files
def test_output_stream_duck_typing():
    # Accept objects having the right file-like methods...
    class DuckWriter:
        def __init__(self):
            self.buf = pa.BufferOutputStream()

        def close(self):
            pass

        @property
        def closed(self):
            return False

        def write(self, data):
            self.buf.write(data)

    duck_writer = DuckWriter()
    stream = pa.output_stream(duck_writer)
    assert stream.write(b'hello')
    assert duck_writer.buf.getvalue().to_pybytes() == b'hello'
def test_output_stream_errors(tmpdir):
    buf = memoryview(bytearray())
    with pytest.raises(ValueError):
        pa.output_stream(buf, compression="foo")

    for arg in [bytearray(), StringIO()]:
        with pytest.raises(TypeError):
            pa.output_stream(arg)

    fn = str(tmpdir / 'new_file')
    with open(fn, 'wb') as f:
        pass
    with open(fn, 'rb') as f:
        with pytest.raises(TypeError, match="writable file expected"):
            pa.output_stream(f)
def test_output_stream_file_path_buffered(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream.buffered'

    def check_data(file_path, data, **kwargs):
        with pa.output_stream(file_path, **kwargs) as stream:
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    unbuffered_stream = pa.output_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        assert check_data(file_path, data, buffer_size=-128) == data

    assert check_data(file_path, data, buffer_size=32) == data
    assert check_data(file_path, data, buffer_size=1024) == data
    assert check_data(str(file_path), data, buffer_size=32) == data

    result = check_data(pathlib.Path(str(file_path)), data, buffer_size=32)
    assert result == data
def check_data(file_path, data):
    with pa.output_stream(file_path) as stream:
        stream.write(data)
    with open(str(file_path), 'rb') as f:
        assert f.read() == data
def check_data(file_path, data, **kwargs):
    with pa.output_stream(file_path, **kwargs) as stream:
        stream.write(data)
    with open(str(file_path), 'rb') as f:
        return f.read()
def run(self):
    stream = pa.output_stream(self.filename)
    for item in self.parent:
        buf = pa.serialize(item).to_buffer()
        stream.write(buf)
    stream.close()
def memory_and_io_interfaces_example():
    # pyarrow.Buffer.
    data = b"abcdefghijklmnopqrstuvwxyz"
    # Creating a Buffer in this way does not allocate any memory; it is a
    # zero-copy view on the memory exported from the data bytes object.
    buf = pa.py_buffer(data)
    # External memory, in the form of a raw pointer and size, can also be
    # referenced using the foreign_buffer() function.
    #buf = pa.foreign_buffer(data)

    print("buf = {}.".format(buf))
    print("buf.size = {}.".format(buf.size))
    print("memoryview(buf) = {}.".format(memoryview(buf)))
    print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

    #--------------------
    # Memory pools.
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))
    buf = pa.allocate_buffer(1024, resizable=True)
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))
    buf.resize(2048)
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))
    buf = None
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))
    print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

    #--------------------
    # Input and output streams.
    buf = memoryview(b"some data")
    stream = pa.input_stream(buf)
    print("stream.read(4) = {}.".format(stream.read(4)))

    import gzip
    with gzip.open("./example.gz", "wb") as f:
        f.write(b"some data\n" * 3)
    stream = pa.input_stream("./example.gz")
    print("stream.read() = {}.".format(stream.read()))

    with pa.output_stream("./example1.dat") as stream:
        stream.write(b"some data")
    f = open("./example1.dat", "rb")
    print("f.read() = {}.".format(f.read()))

    #--------------------
    # On-disk and memory-mapped files.
    # Using regular Python.
    with open("./example2.dat", "wb") as f:
        f.write(b"some example data")
    file_obj = pa.OSFile("./example2.dat")
    print("file_obj.read(4) = {}.".format(file_obj.read(4)))

    # Using pyarrow's OSFile class.
    with pa.OSFile("./example3.dat", "wb") as f:
        f.write(b"some example data")
    mmap = pa.memory_map("./example3.dat")
    print("mmap.read(4) = {}.".format(mmap.read(4)))
    mmap.seek(0)
    buf = mmap.read_buffer(4)
    print("buf = {}.".format(buf))
    print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

    #--------------------
    # In-memory reading and writing.
    writer = pa.BufferOutputStream()
    writer.write(b"hello, friends")
    buf = writer.getvalue()
    print("buf = {}.".format(buf))
    print("buf.size = {}.".format(buf.size))
    reader = pa.BufferReader(buf)
    reader.seek(7)
    print("reader.read(7) = {}.".format(reader.read(7)))
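The same output_stream()/input_stream() pair also handles on-the-fly compression, as the tensor and message tests above rely on; a small round trip, with './roundtrip.gz' as an illustrative path:

import pyarrow as pa

with pa.output_stream('./roundtrip.gz', compression='gzip') as out:
    out.write(b'compress me\n' * 3)
with pa.input_stream('./roundtrip.gz', compression='gzip') as inp:
    assert inp.read() == b'compress me\n' * 3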
import pyarrow as pa

# Input schema and batch
in_schema = pa.schema([pa.field('number', pa.int64(), nullable=False)]) \
    .with_metadata({b'fletcher_mode': b'read', b'fletcher_name': b'in'})
in_data = [pa.array([1, -3, 3, -7])]
in_batch = pa.RecordBatch.from_arrays(in_data, schema=in_schema)

# Create an Arrow RecordBatchFileWriter.
writer = pa.RecordBatchFileWriter('in.rb', in_schema)
writer.write(in_batch)
writer.close()

# Output schema and batch
out_schema = pa.schema([pa.field('number', pa.int64(), nullable=False)]) \
    .with_metadata({b'fletcher_mode': b'write', b'fletcher_name': b'out'})
pa.output_stream('out.as').write(out_schema.serialize())
import pyarrow as pa

input_schema = pa.schema([
    pa.field("input", pa.uint8(), False).with_metadata({b'fletcher_epc': b'8'})
]).with_metadata({
    b'fletcher_mode': b'read',
    b'fletcher_name': b'input'
})

pa.output_stream("in.as").write(input_schema.serialize())

with pa.RecordBatchFileWriter('in.rb', input_schema) as writer:
    writer.write(
        pa.RecordBatch.from_arrays(
            [pa.array(
                [byte for byte in '{"voltage":[1128,1213,1850,429,1770,1683,1483,478,545,1555,867,1495,1398,1380,1753,438]}\n'.encode()],
                pa.uint8())],
            schema=input_schema)
    )

output_schema = pa.schema([
    pa.field("voltage", pa.list_(
        pa.field("item", pa.uint64(), False).with_metadata(
            {"illex_MIN": "0", "illex_MAX": "2047"})
    ), False).with_metadata(
        {"illex_MIN_LENGTH": "1", "illex_MAX_LENGTH": "16"}
    )
]).with_metadata({
    b'fletcher_mode': b'write',
    b'fletcher_name': b'output'
})
import pyarrow as pa
import subprocess
import sys

# Pages schema
pages_title = pa.field('title', pa.utf8(), nullable=False)
pages_text = pa.field('text', pa.binary(), nullable=False).add_metadata({b'fletcher_epc': b'8'})
pages_meta = {b'fletcher_mode': b'read', b'fletcher_name': b'Pages'}
pages_schema = pa.schema([pages_title, pages_text]).add_metadata(pages_meta)
pa.output_stream('pages.as').write(pages_schema.serialize())

# Result schema
result_title = pa.field('title', pa.utf8(), nullable=False)
result_count = pa.field('count', pa.uint32(), nullable=False)
result_meta = {b'fletcher_mode': b'write', b'fletcher_name': b'Result'}
result_schema = pa.schema([result_title, result_count]).add_metadata(result_meta)
pa.output_stream('result.as').write(result_schema.serialize())

# Stats schema
stats_schema = pa.schema([pa.field('stats', pa.uint64(), nullable=False)]).add_metadata({
    b'fletcher_mode': b'write',
    b'fletcher_name': b'Stats'
})
"illex_MAX": "4192" }), 13), False), pa.field( "large_speed_var", pa.list_( pa.field("item", pa.uint64(), False).with_metadata({ "illex_MIN": "0", "illex_MAX": "4192" }), 13), False), pa.field("accel_decel", pa.uint64(), False).with_metadata({ "illex_MIN": "0", "illex_MAX": "4192" }), pa.field("speed_changes", pa.uint64(), False).with_metadata({ "illex_MIN": "0", "illex_MAX": "4192" }) ] schema = pa.schema(schema_fields) serialized_schema = schema.serialize() pa.output_stream('tripreport.as').write(serialized_schema) print("Trip Report schema generated.")
data = b'some data'
buf = pa.py_buffer(data)  # a Buffer: this does not allocate any memory
#buf = memoryview(b"some data")  # a memoryview object
stream = pa.input_stream(buf)
print(stream.read(4))  # b'some'

# b) if passed a string or file path, it will open the given file on disk
#    for reading, creating an OSFile
import gzip
with gzip.open('example.gz', 'wb') as fout:
    fout.write(b'some data\n' * 3)
stream = pa.input_stream('example.gz')
print(stream.read())  # b'some data\nsome data\nsome data\n'

# 2) Output streams: output_stream() is the equivalent function for output
#    streams and allows creating a writable NativeFile; just like
#    input_stream(), it is able to write to buffers or do on-the-fly compression
with pa.output_stream('example1.dat') as stream:
    stream.write(b'some data')
fin = open('example1.dat', 'rb')
print(fin.read())  # b'some data'
fin.close()

# What is memory mapping?
# It is a way for a process to access a file: the process maps the file's
# contents (or a subset of them) into its address space, which makes it
# easier to read from and write to the file by reading and writing memory.
# The file itself, on disk, is just the same as any other file.

# 3) On-disk and memory-mapped files: OSFile(filepath, 'wb') allows
#    interacting with data on disk
# a) using standard operating-system-level file APIs:
with open('example2.dat', 'wb') as fout:
    fout.write(b'some example data')  # body truncated in the source; reconstructed from the walkthrough above
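The snippet breaks off at this point; based on the fuller walkthrough earlier in this section, the memory-mapped counterpart would plausibly continue along these lines ('example3.dat' is an illustrative path):

# b) using pyarrow's OSFile class and memory mapping:
with pa.OSFile('example3.dat', 'wb') as fout:
    fout.write(b'some example data')

mmap = pa.memory_map('example3.dat')
print(mmap.read(4))  # b'some'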
def dump(self, f, buffer_like):
    with pyarrow.output_stream(f) as out_f:
        out_f.write(buffer_like)
def dump(self, f, array):
    # Avoid shadowing the `f` argument; repr() yields str, so encode
    # before handing the lines to the binary stream.
    with pyarrow.output_stream(f) as out_f:
        out_f.writelines(repr(item).encode() for item in array)
#!/usr/bin/python

import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather

data = pd.read_csv("stocks.csv", index_col='date', parse_dates=True)
arrow_data = pa.Table.from_pandas(data)

feather.write_feather(data, "stocks.pyarrow.feather")

with pa.output_stream("stocks.pyarrow.stream") as f:
    batch = pa.record_batch(data)
    writer = pa.ipc.new_stream(f, batch.schema)
    writer.write_batch(batch)

data = pd.read_csv("../../data/ames-house-prices/train.csv")
arrow_data = pa.Table.from_pandas(data)

with pa.output_stream("ames.pyarrow.stream") as f:
    batch = pa.record_batch(data)
    writer = pa.ipc.new_stream(f, batch.schema)
    writer.write_batch(batch)
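Continuing the script above (pa is already imported), one of the stream files can be read back with the standard IPC reader; a sketch, not part of the original script:

with pa.input_stream("stocks.pyarrow.stream") as f:
    reader = pa.ipc.open_stream(f)
    table = reader.read_all()  # an Arrow Table with the same schema as the batch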
import pyarrow as pa

output_schema = pa.schema([
    pa.field(
        "voltage",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
                "illex_MIN": "0",
                "illex_MAX": "2047"
            })),
        False).with_metadata({
            "illex_MIN_LENGTH": "1",
            "illex_MAX_LENGTH": "16"
        })
])

pa.output_stream("battery.as").write(output_schema.serialize())

print("Battery Status schema generated.")
import pyarrow as pa

# Create a field that can be interpreted as a "listprim" ArrayReader/Writer
vec_field = pa.field('vec',
                     pa.list_(pa.field('elem', pa.int16(), nullable=False)),
                     nullable=False)

schema_src = pa.schema([vec_field]).with_metadata({
    b'fletcher_mode': b'read',
    b'fletcher_name': b'src'
})
schema_dst = pa.schema([vec_field]).with_metadata({
    b'fletcher_mode': b'write',
    b'fletcher_name': b'dst'
})

data = [pa.array([[1, -3, 3, -7], [4, 5, 10]])]
recordbatch = pa.RecordBatch.from_arrays(data, schema=schema_src)

writer_in = pa.RecordBatchFileWriter('src.rb', schema_src)
writer_in.write(recordbatch)
writer_in.close()

serialized_out_schema = schema_dst.serialize()
pa.output_stream('dst.as').write(serialized_out_schema)
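A quick round-trip check for the serialized schema, continuing the script above (a sketch, not part of the original):

with open('dst.as', 'rb') as f:
    schema_back = pa.ipc.read_schema(pa.py_buffer(f.read()))
assert schema_back.equals(schema_dst)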