def test_compressed_input_invalid():
    data = b"foo" * 10
    raw = pa.BufferReader(data)
    with pytest.raises(ValueError):
        pa.CompressedInputStream(raw, "unknown_compression")
    with pytest.raises(TypeError):
        pa.CompressedInputStream(raw, None)
    with pa.CompressedInputStream(raw, "gzip") as compressed:
        with pytest.raises(IOError, match="zlib inflate failed"):
            compressed.read()
def test_compress_stream(batch):
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, "lz4") as compressed:
        pa.serialize(batch).write_to(compressed)
    cdata = raw.getvalue()

    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, "lz4") as compressed:
        tmp = pa.deserialize(compressed.read())
def check_compressed_input(data, fn, compression):
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert compressed.readable()
        assert not compressed.writable()
        assert not compressed.seekable()
        got = compressed.read()
        assert got == data
    assert compressed.closed
    assert raw.closed

    # Same with read_buffer()
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        buf = compressed.read_buffer()
        assert isinstance(buf, pa.Buffer)
        assert buf.to_pybytes() == data
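A minimal sketch of how check_compressed_input might be driven, assuming the standard-library gzip module produces the fixture file (the file name and data below are hypothetical):

import gzip

def test_compressed_input_gzip(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "compressed_input_test.gz")
    # Write a gzip file for CompressedInputStream to decode.
    with gzip.open(fn, "wb") as f:
        f.write(data)
    check_compressed_input(data, fn, "gzip")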
def urlopen(url: str, *, compression: str, seekable: bool = False):
    if compression == "gz":
        compression = "gzip"
    # urllib's urlopen() works faster than fsspec, but is not seekable.
    if seekable:
        return fsspec.open(url, compression=compression).open()
    fileobj = urllib.request.urlopen(url)  # pylint: disable=consider-using-with
    if compression is not None:
        fileobj = pyarrow.CompressedInputStream(fileobj, compression)
    return fileobj
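A hedged usage sketch of the urlopen helper above; the URL is hypothetical, and fsspec plus urllib.request are assumed to be importable:

# Stream a gzip-compressed file over HTTP without seeking;
# reads return already-decompressed bytes.
stream = urlopen("https://example.com/data.csv.gz", compression="gz")
first_chunk = stream.read(1 << 16)

# Ask for a seekable handle instead; this path goes through fsspec.
seekable_stream = urlopen("https://example.com/data.csv.gz",
                          compression="gz", seekable=True)
seekable_stream.seek(0)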
def deserialize_pyarrow(data: bytes, codec: str):
    """
    Deserialize and decompress an object with a specific codec.

    The caller is responsible for unmarshalling the results, if necessary.
    Should be used in conjunction with `serialize_pyarrow`.
    """
    reader = pa.BufferReader(data)
    with pa.CompressedInputStream(reader, compression=codec) as compressed:
        deserialized = pa.deserialize(compressed.read(),
                                      context=serialization_context)
    return deserialized
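The docstring references a companion serialize_pyarrow; a minimal sketch of what that counterpart could look like, assuming the same serialization_context is in scope (this is inferred, not code from the source):

def serialize_pyarrow(obj, codec: str) -> bytes:
    """
    Serialize and compress an object with a specific codec.

    Should be used in conjunction with `deserialize_pyarrow`.
    """
    buf = pa.BufferOutputStream()
    with pa.CompressedOutputStream(buf, compression=codec) as compressed:
        # Serialize with the shared context, then compress the payload.
        compressed.write(
            pa.serialize(obj, context=serialization_context).to_buffer())
    return buf.getvalue().to_pybytes()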
def test_compressed_roundtrip(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))
    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        compressed.write(data)
    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(arrow_fields)
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [name for name, dtype in arrow_fields
                     if "timestamp" in dtype]

        # Using both Arrow and Pandas lets each library cover the other's
        # current shortcomings: Pandas's read_csv handles chunked/complex
        # reads, while Arrow's ParquetWriter handles chunked writes. Arrow's
        # input streams can also decode zstd files, which Pandas hasn't
        # implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use the snappy codec for Parquet because Drill doesn't
        # read zstd Parquet files.
        parquet_writer = pq.ParquetWriter(parquet_file,
                                          schema=arrow_schema,
                                          compression="snappy",
                                          version="2.0",
                                          use_dictionary=True)
        df_iterator: TextFileReader = pd.read_csv(
            reader,
            header=None,
            names=column_names,
            dtype=dict(pandas_fields),
            true_values=map_to_bytes('T'),
            false_values=map_to_bytes('F'),
            chunksize=BUFFER_SIZE_ROWS,
            parse_dates=date_cols)
        chunked_write(df_iterator, parquet_writer, date_cols)
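chunked_write is called above but not shown; a plausible sketch under the assumption that each pandas chunk is converted to an Arrow table against the writer's schema (this helper is reconstructed for illustration, not taken from the source):

def chunked_write(df_iterator, parquet_writer, date_cols):
    # Append each pandas chunk to the open ParquetWriter so the whole
    # CSV never has to fit in memory. date_cols is accepted for signature
    # parity; the dates were already parsed by read_csv above.
    for df in df_iterator:
        arrow_table = pa.Table.from_pandas(
            df, schema=parquet_writer.schema, preserve_index=False)
        parquet_writer.write_table(arrow_table)
    parquet_writer.close()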
def test_compressed_recordbatch_stream(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))
    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    stream = pa.CompressedOutputStream(raw, compression)
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
def test_compressed_roundtrip(compression):
    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    try:
        with pa.CompressedOutputStream(raw, compression) as compressed:
            compressed.write(data)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
def test_compressed_recordbatch_stream(compression):
    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    try:
        stream = pa.CompressedOutputStream(raw, compression)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
def check_compressed_concatenated(data, fn, compression):
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
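A sketch of the fixture this check expects: two independently gzip-compressed members appended into one file, which CompressedInputStream should decode as a single continuous stream (file name, split point, and data are hypothetical):

import gzip

def test_compressed_concatenated_gzip(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "compressed_concatenated.gz")
    # Concatenate two gzip members into a single file.
    with open(fn, "wb") as f:
        f.write(gzip.compress(data[:50]))
        f.write(gzip.compress(data[50:]))
    check_compressed_concatenated(data, fn, "gzip")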