def test_output_stream_constructor(tmpdir):
    if not Codec.is_available("gzip"):
        pytest.skip("gzip support is not built")
    with pa.CompressedOutputStream(tmpdir / "ctor.gz", "gzip") as stream:
        stream.write(b"test")
    with (tmpdir / "ctor2.gz").open("wb") as f:
        with pa.CompressedOutputStream(f, "gzip") as stream:
            stream.write(b"test")
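
The constructor test only exercises the write side. A minimal read-back sketch (hypothetical, not part of the original suite), relying on pa.CompressedInputStream also accepting a path or an open file object:

def test_input_stream_constructor(tmpdir):
    # Hypothetical companion sketch: write a small gzip file, then read it
    # back both from a path and from an open file object.
    if not Codec.is_available("gzip"):
        pytest.skip("gzip support is not built")
    path = tmpdir / "ctor.gz"
    with pa.CompressedOutputStream(path, "gzip") as stream:
        stream.write(b"test")
    with pa.CompressedInputStream(path, "gzip") as stream:
        assert stream.read() == b"test"
    with path.open("rb") as f:
        with pa.CompressedInputStream(f, "gzip") as stream:
            assert stream.read() == b"test"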

def test_compress_stream(batch):
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, "lz4") as compressed:
        pa.serialize(batch).write_to(compressed)
    cdata = raw.getvalue()
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, "lz4") as compressed:
        tmp = pa.deserialize(compressed.read())
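
pa.serialize and pa.deserialize are legacy APIs; a hedged sketch of the same lz4 round trip written against the Arrow IPC stream API instead (the name roundtrip_batch_lz4 and the assumption that batch is a pa.RecordBatch are mine):

def roundtrip_batch_lz4(batch):
    # Assumes `batch` is a pa.RecordBatch; writes it through an lz4-compressed
    # IPC stream and reads it back as a table.
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, "lz4") as compressed:
        with pa.ipc.new_stream(compressed, batch.schema) as writer:
            writer.write_batch(batch)
    cdata = raw.getvalue()
    with pa.CompressedInputStream(pa.BufferReader(cdata), "lz4") as compressed:
        return pa.ipc.open_stream(compressed).read_all()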

def make_compressed_output(data, fn, compression):
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert not compressed.readable()
        assert compressed.writable()
        assert not compressed.seekable()
        compressed.write(data)
    assert compressed.closed
    assert raw.closed
    with open(fn, "wb") as f:
        f.write(raw.getvalue())
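
A plausible read-side counterpart (a sketch; check_compressed_input is an assumed name), reading back the file written by make_compressed_output and mirroring its property checks:

def check_compressed_input(data, fn, compression):
    # Assumed read-side counterpart: the property checks mirror the write
    # side, and closing the wrapper is expected to close the raw file too.
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert compressed.readable()
        assert not compressed.writable()
        assert not compressed.seekable()
        got = compressed.read()
        assert got == data
    assert compressed.closed
    assert raw.closed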

def serialize_pyarrow(data: bytes, codec: str):
    """
    Serialize an object and compress it with a specific codec.

    Returns the serialized, compressed bytes in a pyarrow.Buffer. The caller
    is responsible for writing the returned bytes to a file, if necessary.
    Should be used in conjunction with `deserialize_pyarrow`.
    """
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression=codec) as compressed:
        pa.serialize_to(data, compressed, context=serialization_context)
    return raw.getvalue()
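
The docstring points at deserialize_pyarrow; a minimal sketch of what that inverse could look like, assuming the same codec and the module-level serialization_context used on the write side:

def deserialize_pyarrow(cdata, codec: str):
    """
    Sketch of the inverse of `serialize_pyarrow` (assumed implementation):
    decompress with the same codec, then deserialize with the same context.
    """
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression=codec) as compressed:
        return pa.deserialize(compressed.read(), context=serialization_context)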

def test_compressed_roundtrip(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))
    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        compressed.write(data)
    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
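
For comparison, a non-streaming sketch of the same round trip with the one-shot Codec API; the helper name and the exact keyword usage are assumptions to verify against the installed pyarrow version:

def one_shot_roundtrip(data, compression):
    # Non-streaming sketch using the Codec API directly; decompressed_size is
    # supplied because some codecs do not record the original length.
    codec = Codec(compression)
    cdata = codec.compress(data)
    return codec.decompress(cdata, decompressed_size=len(data), asbytes=True)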

def test_compressed_recordbatch_stream(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))
    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    stream = pa.CompressedOutputStream(raw, compression)
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
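
An illustrative variant (hypothetical helper name) that leans on context managers so the writer and the compressed stream are flushed and closed automatically, replacing the explicit close() calls above:

def roundtrip_table_compressed(table, compression):
    # Context managers flush and close the writer and the compressed stream
    # automatically, so no explicit close() calls are needed.
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as stream:
        with pa.RecordBatchStreamWriter(stream, table.schema) as writer:
            writer.write_table(table, max_chunksize=3)
    buf = raw.getvalue()
    with pa.CompressedInputStream(pa.BufferReader(buf), compression) as stream:
        return pa.RecordBatchStreamReader(stream).read_all()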

def test_compressed_roundtrip(compression):
    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    try:
        with pa.CompressedOutputStream(raw, compression) as compressed:
            compressed.write(data)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data

def test_compressed_recordbatch_stream(compression):
    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    try:
        stream = pa.CompressedOutputStream(raw, compression)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table

def write_data_as_arrow(data, schema, max_size):
    if isinstance(data, pyarrow.Table):
        data = data.to_batches(OUT_BATCH_SIZE)
    truncated = False
    buf = pyarrow.BufferOutputStream()
    with pyarrow.CompressedOutputStream(buf, "gzip") as sink:
        with pyarrow.ipc.new_file(sink, schema) as writer:
            for batch in data:
                batch_size = pyarrow.ipc.get_record_batch_size(batch)
                if ((max_size is not None and
                     sink.tell() + batch_size > max_size) or
                        # See a similar comment in GzipOutputBuffer.write().
                        buf.tell() + batch_size > MAX_OUT):
                    truncated = True
                    break
                writer.write(batch)
    return memoryview(buf.getvalue()), truncated
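
A hypothetical reader for the payload produced above: because the Arrow file format needs a seekable source, the gzip stream is decompressed into a buffer before pyarrow.ipc.open_file is called:

def read_data_as_arrow(buf):
    # Hypothetical counterpart to write_data_as_arrow: the Arrow file format
    # needs a seekable source, so the gzip stream is fully decompressed into
    # a buffer before the IPC file is opened.
    raw = pyarrow.BufferReader(buf)
    with pyarrow.CompressedInputStream(raw, "gzip") as source:
        decompressed = source.read_buffer()
    return pyarrow.ipc.open_file(decompressed).read_all()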