Example #1
def test_compressed_input_invalid():
    data = b"foo" * 10
    raw = pa.BufferReader(data)
    with pytest.raises(ValueError):
        pa.CompressedInputStream(raw, "unknown_compression")
    with pytest.raises(TypeError):
        pa.CompressedInputStream(raw, None)

    with pa.CompressedInputStream(raw, "gzip") as compressed:
        with pytest.raises(IOError, match="zlib inflate failed"):
            compressed.read()
Example #2
def test_compress_stream(batch):
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, "lz4") as compressed:
        pa.serialize(batch).write_to(compressed)
    cdata = raw.getvalue()
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, "lz4") as compressed:
        tmp = pa.deserialize(compressed.read())
Example #3
def check_compressed_input(data, fn, compression):
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert compressed.readable()
        assert not compressed.writable()
        assert not compressed.seekable()
        got = compressed.read()
        assert got == data
    assert compressed.closed
    assert raw.closed

    # Same with read_buffer()
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        buf = compressed.read_buffer()
        assert isinstance(buf, pa.Buffer)
        assert buf.to_pybytes() == data
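
check_compressed_input() above expects fn to name a file that is already compressed with the given codec. A minimal sketch of a caller, assuming gzip and a temporary file (both are illustrative choices, not part of the original test suite):

import gzip
import os
import tempfile


def example_check_gzip_input():
    data = b"some test data\n" * 10 + b"eof\n"
    fd, fn = tempfile.mkstemp(suffix=".gz")
    os.close(fd)
    try:
        # Write a gzip-compressed copy of the raw payload to disk.
        with gzip.open(fn, "wb") as f:
            f.write(data)
        # The helper decompresses it through pa.CompressedInputStream
        # and verifies the roundtrip against the original payload.
        check_compressed_input(data, fn, "gzip")
    finally:
        os.remove(fn)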
Example #4
def urlopen(url: str, *, compression: str, seekable: bool = False):
    if compression == "gz":
        compression = "gzip"
    # urllib's urlopen() works faster than fsspec, but is not seekable.
    if seekable:
        return fsspec.open(url, compression=compression).open()
    fileobj = urllib.request.urlopen(url)  # pylint: disable=consider-using-with
    if compression is not None:
        fileobj = pyarrow.CompressedInputStream(fileobj, compression)
    return fileobj
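
A hypothetical use of urlopen() above; the URL is illustrative only. With compression="gz" the response body is read sequentially through a gzip CompressedInputStream:

stream = urlopen("https://example.com/data.csv.gz", compression="gz")
try:
    header_bytes = stream.read(1024)  # decompressed bytes, sequential access only
finally:
    stream.close()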
Example #5
def deserialize_pyarrow(data: bytes, codec: str):
    """
    Deserialize and decompress an object with a specific codec. The caller is
    responsible for unmarshalling the results, if necessary.

    Should be used in conjunction with `serialize_pyarrow`.
    """
    reader = pa.BufferReader(data)
    with pa.CompressedInputStream(reader, compression=codec) as compressed:
        deserialized = pa.deserialize(compressed.read(),
                                      context=serialization_context)
    return deserialized
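
The docstring above refers to a serialize_pyarrow counterpart that is not shown in this listing. A hedged sketch of what it might look like, assuming the same serialization_context and the (since deprecated) pa.serialize API used elsewhere in these examples:

import pyarrow as pa


def serialize_pyarrow(obj, codec: str) -> bytes:
    """Serialize and compress an object with a specific codec (sketch only)."""
    sink = pa.BufferOutputStream()
    with pa.CompressedOutputStream(sink, compression=codec) as compressed:
        # serialization_context is assumed to be defined alongside
        # deserialize_pyarrow(); it is not part of this sketch.
        pa.serialize(obj, context=serialization_context).write_to(compressed)
    return sink.getvalue().to_pybytes()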
Example #6
def test_compressed_roundtrip(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        compressed.write(data)

    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
Example #7
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(get_arrow_fields(table))
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [
            name for name, dtype in arrow_fields if "timestamp" in dtype
        ]

        # Using both Arrow and Pandas allows each library to cover the other's current shortcomings.
        # Pandas's read_csv can handle chunked/complex reads, while Arrow's ParquetWriter can handle chunked writes.
        # Arrow's input streams are capable of handling zstd files, which Pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use snappy codec for Parquet because Drill doesn't read zstd
        parquet_writer = pq.ParquetWriter(parquet_file,
                                          schema=arrow_schema,
                                          compression='snappy',
                                          version="2.0",
                                          use_dictionary=True)
        df_iterator: TextFileReader = pd.read_csv(
            reader,
            header=None,
            names=column_names,
            dtype=dict(pandas_fields),
            true_values=map_to_bytes('T'),
            false_values=map_to_bytes('F'),
            chunksize=BUFFER_SIZE_ROWS,
            parse_dates=date_cols)

        chunked_write(df_iterator, parquet_writer, date_cols)
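
write_files() above depends on several project-specific helpers (get_pandas_fields, chunked_write, map_to_bytes, BUFFER_SIZE_ROWS) that are not included in this listing. A minimal, self-contained sketch of the same zstd-CSV-to-Parquet pattern, with illustrative file names and default settings:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def zstd_csv_to_parquet(csv_path: str, parquet_path: str,
                        schema: pa.Schema, chunksize: int = 100_000) -> None:
    # Arrow decompresses the zstd stream; pandas handles chunked CSV parsing;
    # ParquetWriter appends one row group per chunk. The dtype/parse_dates
    # handling from write_files() is omitted for brevity.
    with pa.OSFile(csv_path, mode="rb") as raw, \
            pa.CompressedInputStream(raw, compression="zstd") as reader, \
            pq.ParquetWriter(parquet_path, schema=schema) as writer:
        for chunk in pd.read_csv(reader, header=None, names=schema.names,
                                 chunksize=chunksize):
            table = pa.Table.from_pandas(chunk, schema=schema,
                                         preserve_index=False)
            writer.write_table(table)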
Example #8
def test_compressed_recordbatch_stream(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    stream = pa.CompressedOutputStream(raw, compression)
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
Example #9
def test_compressed_roundtrip(compression):
    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    try:
        with pa.CompressedOutputStream(raw, compression) as compressed:
            compressed.write(data)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
Example #10
def test_compressed_recordbatch_stream(compression):
    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    try:
        stream = pa.CompressedOutputStream(raw, compression)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
Example #11
def check_compressed_concatenated(data, fn, compression):
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
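
The helper above presumably reads a file containing several independently compressed streams written back-to-back. A hypothetical caller that builds such a file from two gzip members and then checks the concatenation (payloads and file handling are assumptions):

import gzip
import os
import tempfile


def example_check_concatenated_gzip():
    part1 = b"some test data\n" * 10
    part2 = b"eof\n"
    fd, fn = tempfile.mkstemp(suffix=".gz")
    os.close(fd)
    try:
        # Two complete gzip members, one after the other, in a single file.
        with open(fn, "wb") as f:
            f.write(gzip.compress(part1))
            f.write(gzip.compress(part2))
        # CompressedInputStream is expected to read across member boundaries.
        check_compressed_concatenated(part1 + part2, fn, "gzip")
    finally:
        os.remove(fn)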