Example 1
def test_output_stream_constructor(tmpdir):
    if not Codec.is_available("gzip"):
        pytest.skip("gzip support is not built")
    with pa.CompressedOutputStream(tmpdir / "ctor.gz", "gzip") as stream:
        stream.write(b"test")
    with (tmpdir / "ctor2.gz").open("wb") as f:
        with pa.CompressedOutputStream(f, "gzip") as stream:
            stream.write(b"test")
Example 2
def test_compress_stream(batch):
    # Round-trip a serialized record batch through an LZ4-compressed stream.
    # (pa.serialize/pa.deserialize are PyArrow's legacy serialization API,
    # deprecated since PyArrow 0.17.)
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, "lz4") as compressed:
        pa.serialize(batch).write_to(compressed)
    cdata = raw.getvalue()
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, "lz4") as compressed:
        tmp = pa.deserialize(compressed.read())
Example 3
def make_compressed_output(data, fn, compression):
    # Compress `data` into an in-memory buffer, verifying the stream is
    # write-only while open and closed afterwards, then write the
    # compressed bytes to the file `fn`.
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert not compressed.readable()
        assert compressed.writable()
        assert not compressed.seekable()
        compressed.write(data)
    assert compressed.closed
    assert raw.closed
    with open(fn, "wb") as f:
        f.write(raw.getvalue())
Example 4
def serialize_pyarrow(data: bytes, codec: str):
    """
    Serialize an object and compress with a specific codec. Returns the
    serialized, compressed bytes in a pyarrow.Buffer. The caller is
    responsible for reading the returned bytes into a file, if necessary.

    Should be used in conjunction with `deserialize_pyarrow`.
    """
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression=codec) as compressed:
        pa.serialize_to(data, compressed, context=serialization_context)
    return raw.getvalue()
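The docstring above references `deserialize_pyarrow`, which is not shown in these examples. A minimal counterpart sketch, assuming the same module-level `serialization_context` and the same legacy pa.deserialize API, could look like this:

def deserialize_pyarrow(buf, codec: str):
    """
    Decompress `buf` with the given codec and deserialize the result.

    Counterpart sketch for `serialize_pyarrow`; not the original code.
    """
    raw = pa.BufferReader(buf)
    with pa.CompressedInputStream(raw, compression=codec) as compressed:
        return pa.deserialize(compressed.read(), context=serialization_context)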
Example 5
def test_compressed_roundtrip(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        compressed.write(data)

    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
Example 6
def test_compressed_recordbatch_stream(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    stream = pa.CompressedOutputStream(raw, compression)
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
Example 7
def test_compressed_roundtrip(compression):
    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    try:
        with pa.CompressedOutputStream(raw, compression) as compressed:
            compressed.write(data)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
Example 8
def test_compressed_recordbatch_stream(compression):
    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    try:
        stream = pa.CompressedOutputStream(raw, compression)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
Example 9
def write_data_as_arrow(data, schema, max_size):
    if isinstance(data, pyarrow.Table):
        data = data.to_batches(OUT_BATCH_SIZE)

    truncated = False
    buf = pyarrow.BufferOutputStream()
    with pyarrow.CompressedOutputStream(buf, "gzip") as sink:
        with pyarrow.ipc.new_file(sink, schema) as writer:
            for batch in data:
                batch_size = pyarrow.ipc.get_record_batch_size(batch)
                if ((max_size is not None
                     and sink.tell() + batch_size > max_size) or
                        # See a similar comment in GzipOutputBuffer.write().
                        buf.tell() + batch_size > MAX_OUT):
                    truncated = True
                    break
                writer.write(batch)

    return memoryview(buf.getvalue()), truncated
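Reading back a payload produced by `write_data_as_arrow` takes one extra step: the Arrow IPC file format needs a seekable source, and CompressedInputStream is not seekable, so the payload must be fully decompressed first. A minimal sketch (the helper name `read_data_as_arrow` is hypothetical; the "gzip" codec matches the writer above):

def read_data_as_arrow(payload):
    # Decompress the gzip payload into an in-memory Arrow buffer.
    raw = pyarrow.BufferReader(payload)
    with pyarrow.CompressedInputStream(raw, "gzip") as stream:
        buf = stream.read_buffer()
    # The decompressed buffer is seekable, so the IPC file reader works.
    reader = pyarrow.ipc.open_file(buf)
    return reader.read_all()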