def test_orcfile_readwrite():
    """Round-trip a table through an in-memory ORC file with default options.

    The table is written once with the ``(table, where)`` argument order and
    once with the deprecated ``(where, table)`` order, which is expected to
    emit a ``FutureWarning``. After each write the data and the file-level
    metadata are compared against the expected default values.
    """
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    def check_roundtrip(sink):
        # Re-open the bytes that were just written and compare the contents.
        orc_file = orc.ORCFile(pa.BufferReader(sink.getvalue()))
        assert table.equals(orc_file.read())

        # Check for default WriteOptions
        assert orc_file.compression == 'UNCOMPRESSED'
        assert orc_file.file_version == '0.12'
        assert orc_file.row_index_stride == 10000
        assert orc_file.compression_size == 65536

    sink = pa.BufferOutputStream()
    orc.write_table(table, sink)
    check_roundtrip(sink)

    # deprecated keyword order
    legacy_sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(legacy_sink, table)
    check_roundtrip(legacy_sink)
def save_orc_file(dataframe, filepath):
    """Write ``dataframe`` to ``filepath`` as an ORC file.

    The conversion works on a copy of the input. Columns whose dtype name is
    ``"category"`` are cast to ``"string[pyarrow]"`` before the frame is
    converted to an Arrow table (without the index) and written.
    """
    from pyarrow import Table, orc

    frame = dataframe.copy()
    for column in frame:
        if frame[column].dtype.name != "category":
            continue
        frame[column] = frame[column].astype("string[pyarrow]")

    arrow_table = Table.from_pandas(frame, preserve_index=False)
    orc.write_table(arrow_table, filepath)
def test_orc_writer_with_null_arrays(tempdir):
    """Expect ``ArrowNotImplementedError`` when a column holds only ``None``.
    """
    import pyarrow as pa
    from pyarrow import orc

    target = str(tempdir / 'test.orc')
    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        # Every entry of this column is None.
        "utf8": pa.array([None, None, None, None]),
    })

    with pytest.raises(pa.ArrowNotImplementedError):
        orc.write_table(table, target)
def test_orcfile_readwrite():
    """Round-trip a small nullable table through an in-memory ORC file."""
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})

    # Pass the table first: the ``(where, table)`` argument order of
    # ``orc.write_table`` is deprecated and triggers a FutureWarning.
    orc.write_table(table, buffer_output_stream)

    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)
def test_bytesio_readwrite():
    """Round-trip a table through ORC using a plain ``io.BytesIO`` object."""
    from io import BytesIO
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    stream = BytesIO()
    orc.write_table(table, stream)
    # Rewind so reading starts at the beginning of the written bytes.
    stream.seek(0)

    roundtripped = orc.ORCFile(stream).read()
    assert table.equals(roundtripped)
def test_filesystem_uri(tmpdir):
    """Read an ORC file via a filesystem object and via a filesystem URI."""
    from pyarrow import orc

    expected = pa.table({"a": [1, 2, 3]})

    data_dir = tmpdir / "data_dir"
    data_dir.mkdir()
    orc_path = data_dir / "data.orc"
    orc.write_table(expected, str(orc_path))

    # filesystem object
    via_object = orc.read_table(orc_path, filesystem=fs.LocalFileSystem())
    assert via_object.equals(expected)

    # filesystem URI
    via_uri = orc.read_table(
        "data_dir/data.orc",
        filesystem=util._filesystem_uri(tmpdir),
    )
    assert via_uri.equals(expected)
def test_readwrite(tmpdir):
    """Write a table to disk and read it back fully, empty, and by column."""
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })
    target = tmpdir.join("test.orc")
    orc.write_table(table, target)

    # Reading without a column selection returns the full table.
    everything = orc.read_table(target)
    assert table.equals(everything)

    # An empty selection is expected to keep the rows but drop every column.
    no_columns = orc.read_table(target, [])
    assert no_columns.num_rows == 4
    assert no_columns.num_columns == 0

    # Selecting a single column by name.
    only_int64 = orc.read_table(target, columns=["int64"])
    assert only_int64.num_rows == 4
    assert only_int64.num_columns == 1
def test_orcfile_readwrite_with_writeoptions():
    """Round-trip a table with explicit write options and verify them.

    The options are passed once with the ``(table, where)`` argument order
    and once, with different values, using the deprecated ``(where, table)``
    order that is expected to emit a ``FutureWarning``. After each write the
    reopened file must report the options that were requested.
    """
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    def check_roundtrip(sink, compression, file_version, stride, block_size):
        # Re-open the written bytes, then compare the data and the metadata.
        orc_file = orc.ORCFile(pa.BufferReader(sink.getvalue()))
        assert table.equals(orc_file.read())
        assert orc_file.compression == compression
        assert orc_file.file_version == file_version
        assert orc_file.row_index_stride == stride
        assert orc_file.compression_size == block_size

    sink = pa.BufferOutputStream()
    orc.write_table(
        table,
        sink,
        compression='snappy',
        file_version='0.11',
        row_index_stride=5000,
        compression_block_size=32768,
    )
    # Check for modified WriteOptions
    check_roundtrip(sink, 'SNAPPY', '0.11', 5000, 32768)

    # deprecated keyword order
    legacy_sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(
            legacy_sink,
            table,
            compression='uncompressed',
            file_version='0.11',
            row_index_stride=20000,
            compression_block_size=16384,
        )
    # Check the options requested through the deprecated call
    check_roundtrip(legacy_sink, 'UNCOMPRESSED', '0.11', 20000, 16384)
def test_orcfile_readwrite():
    """Round-trip a table through an in-memory ORC file in both arg orders.

    The second write uses the deprecated ``(where, table)`` order and is
    expected to emit a ``FutureWarning``.
    """
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    def read_back(sink):
        # Re-open the bytes held by the output stream and load the table.
        return orc.ORCFile(pa.BufferReader(sink.getvalue())).read()

    sink = pa.BufferOutputStream()
    orc.write_table(table, sink)
    assert table.equals(read_back(sink))

    # deprecated keyword order
    legacy_sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(legacy_sink, table)
    assert table.equals(read_back(legacy_sink))
def write_partition(cls, df, path, fs, filename, **kwargs):
    """Write ``df`` as an ORC file named ``filename`` under ``path``.

    The frame is converted to an Arrow table and written to a file opened in
    binary write mode through ``fs``; the target location is ``path`` and
    ``filename`` joined with ``fs.sep``. ``cls`` and ``**kwargs`` are
    accepted but not used by this body.
    """
    arrow_table = pa.Table.from_pandas(df)
    target = fs.sep.join([path, filename])
    with fs.open(target, "wb") as sink:
        orc.write_table(arrow_table, sink)
def test_column_selection(tempdir):
    """Exercise ``ORCFile.read(columns=...)`` on a table with nested types.

    Covers selection by top-level name, by dotted path into structs (also
    inside a list), by field index, and the errors expected for an unknown
    name or an out-of-range index.
    """
    from pyarrow import orc

    # create a table with nested types
    struct_type = pa.struct([
        pa.field('middle', pa.struct([pa.field('inner', pa.int64())])),
        pa.field('inner2', pa.int64()),
    ])
    list_struct_type = pa.list_(
        pa.field(
            'item',
            pa.struct([
                pa.field('inner1', pa.int64()),
                pa.field('inner2', pa.int64()),
            ]),
        )
    )
    schema = pa.schema([
        pa.field('basic', pa.int32()),
        pa.field('list', pa.list_(pa.field('item', pa.int32()))),
        pa.field('struct', struct_type),
        pa.field('list-struct', list_struct_type),
        pa.field('basic2', pa.int64()),
    ])
    columns = [
        [0],
        [[1, 2]],
        [{"middle": {"inner": 3}, "inner2": 4}],
        [[{"inner1": 5, "inner2": 6}, {"inner1": 7, "inner2": 8}]],
        [9],
    ]
    table = pa.table(columns, schema=schema)

    path = str(tempdir / 'test.orc')
    orc.write_table(table, path)
    orc_file = orc.ORCFile(path)

    # default selecting all columns
    assert orc_file.read().equals(table)

    # selecting with columns names
    for names in (["basic", "basic2"], ["list", "struct", "basic2"]):
        by_name = orc_file.read(columns=names)
        assert by_name.equals(table.select(names))

    # using dotted paths
    inner_only = orc_file.read(columns=["struct.middle.inner"])
    assert inner_only.equals(pa.table({"struct": [{"middle": {"inner": 3}}]}))

    inner2_only = orc_file.read(columns=["struct.inner2"])
    assert inner2_only.equals(pa.table({"struct": [{"inner2": 4}]}))

    mixed = orc_file.read(
        columns=["list", "struct.middle.inner", "struct.inner2"])
    assert mixed.equals(table.select(["list", "struct"]))

    list_inner1 = orc_file.read(columns=["list-struct.inner1"])
    assert list_inner1.equals(
        pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]}))

    # selecting with (Arrow-based) field indices
    index_cases = (
        ([0, 4], ["basic", "basic2"]),
        ([1, 2, 3], ["list", "struct", "list-struct"]),
    )
    for indices, names in index_cases:
        by_index = orc_file.read(columns=indices)
        assert by_index.equals(table.select(names))

    # error on non-existing name or index
    with pytest.raises(IOError):
        # liborc returns ParseError, which gets translated into IOError
        # instead of ValueError
        orc_file.read(columns=["wrong"])

    with pytest.raises(ValueError):
        orc_file.read(columns=[5])
def test_orcfile_readwrite_with_bad_writeoptions():
    """Check that ``orc.write_table`` rejects invalid write options.

    Each case passes exactly one bad keyword option and expects the listed
    exception type: ``TypeError`` for the integer ``compression`` and
    ``compression_strategy`` values, ``ValueError`` for everything else.
    """
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    table = pa.table({"int64": a})

    # (option name, invalid value, expected exception type)
    bad_options = [
        # batch_size must be a positive integer
        ("batch_size", 0, ValueError),
        ("batch_size", -100, ValueError),
        ("batch_size", 1024.23, ValueError),
        # file_version must be 0.11 or 0.12
        ("file_version", 0.13, ValueError),
        ("file_version", '1.1', ValueError),
        # stripe_size must be a positive integer
        ("stripe_size", 0, ValueError),
        ("stripe_size", -400, ValueError),
        ("stripe_size", 4096.73, ValueError),
        # compression must be among the given options
        ("compression", 0, TypeError),
        ("compression", 'none', ValueError),
        ("compression", 'zlid', ValueError),
        # compression_block_size must be a positive integer
        ("compression_block_size", 0, ValueError),
        ("compression_block_size", -200, ValueError),
        ("compression_block_size", 1096.73, ValueError),
        # compression_strategy must be among the given options
        ("compression_strategy", 0, TypeError),
        ("compression_strategy", 'no', ValueError),
        ("compression_strategy", 'large', ValueError),
        # row_index_stride must be a positive integer
        ("row_index_stride", 0, ValueError),
        ("row_index_stride", -800, ValueError),
        ("row_index_stride", 3096.29, ValueError),
        # padding_tolerance must be possible to cast to float
        ("padding_tolerance", 'cat', ValueError),
        # dictionary_key_size_threshold must be possible to cast to
        # float between 0.0 and 1.0
        ("dictionary_key_size_threshold", 'arrow', ValueError),
        ("dictionary_key_size_threshold", 1.2, ValueError),
        ("dictionary_key_size_threshold", -3.2, ValueError),
        # bloom_filter_columns must be convertible to a list containing
        # nonnegative integers
        ("bloom_filter_columns", "string", ValueError),
        ("bloom_filter_columns", [0, 1.4], ValueError),
        ("bloom_filter_columns", {0, 2, -1}, ValueError),
        # bloom_filter_fpp must be convertible to a float between 0.0 and 1.0
        ("bloom_filter_fpp", 'arrow', ValueError),
        ("bloom_filter_fpp", 1.1, ValueError),
        ("bloom_filter_fpp", -0.1, ValueError),
    ]

    for option, bad_value, expected_error in bad_options:
        with pytest.raises(expected_error):
            # Table first: the ``(where, table)`` argument order is
            # deprecated and would add a FutureWarning to every case.
            orc.write_table(
                table,
                buffer_output_stream,
                **{option: bad_value},
            )
            # Only reached when nothing was raised. The failure raised here
            # is neither ValueError nor TypeError, so ``pytest.raises`` lets
            # it through and the report names the offending case.
            pytest.fail(
                f"write_table accepted invalid {option}={bad_value!r}"
            )
import sys
import os
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.orc as orc

# Disabled schema dump, kept as a no-op string literal.
'''
parquet_file = pq.ParquetFile(sys.argv[1])
print(parquet_file.schema)
'''

# Convert the Parquet file named by the first command-line argument into an
# ORC file with the same path but a ".orc" extension.
source_path = sys.argv[1]
stem, _extension = os.path.splitext(source_path)
orc_name = stem + ".orc"

table = pq.read_table(source_path)
print("Writing ", orc_name)
orc.write_table(table, orc_name)