def test_feather_format(tempdir):
    """Read Feather (v2) files through the dataset API; Feather v1 must raise."""
    from pyarrow.feather import write_feather

    expected = pa.table({'a': pa.array([1, 2, 3], type="int8"),
                         'b': pa.array([.1, .2, .3], type="float64")})

    root = tempdir / "feather_dataset"
    root.mkdir()
    write_feather(expected, str(root / "data.feather"))

    # The format can be given as an object ...
    dataset = ds.dataset(root, format=ds.IpcFileFormat())
    assert dataset.to_table().equals(expected)

    # ... or as its shorthand string.
    dataset = ds.dataset(root, format="feather")
    assert dataset.to_table().equals(expected)

    # ARROW-8641 - column selection order
    reordered = dataset.to_table(columns=["b", "a"])
    assert reordered.column_names == ["b", "a"]
    duplicated = dataset.to_table(columns=["a", "a"])
    assert duplicated.column_names == ["a", "a"]

    # error with Feather v1 files
    write_feather(expected, str(root / "data1.feather"), version=1)
    with pytest.raises(ValueError):
        ds.dataset(root, format="feather").to_table()
def test_file_format_pickling():
    """Each FileFormat must survive a pickle round-trip and compare equal."""
    # NOTE(review): a later definition in this file reuses this test's name,
    # so pytest collects only the last one — consider renaming one of them.
    candidates = [
        ds.IpcFileFormat(),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(
            read_options=ds.ParquetReadOptions(use_buffered_stream=True)),
        ds.ParquetFileFormat(read_options={
            'use_buffered_stream': True,
            'buffer_size': 4096,
        }),
    ]
    for fmt in candidates:
        assert pickle.loads(pickle.dumps(fmt)) == fmt
def test_ipc_format(tempdir):
    """Write a single-batch Arrow IPC file and read it back as a dataset."""
    expected = pa.table({'a': pa.array([1, 2, 3], type="int8"),
                         'b': pa.array([.1, .2, .3], type="float64")})

    path = str(tempdir / 'test.arrow')
    with pa.output_stream(path) as sink:
        writer = pa.RecordBatchFileWriter(sink, expected.schema)
        writer.write_batch(expected.to_batches()[0])
        writer.close()

    # Both the format object and its shorthand string should read the file.
    for fmt in (ds.IpcFileFormat(), "ipc"):
        loaded = ds.dataset(path, format=fmt).to_table()
        assert loaded.equals(expected)
def test_file_format_pickling():
    """Pickle round-trip for IPC, CSV (with parse options) and Parquet formats."""
    # NOTE(review): this redefines an earlier test of the same name in this
    # file; the earlier one is shadowed and never collected by pytest.
    candidates = [
        ds.IpcFileFormat(),
        ds.CsvFileFormat(),
        ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t',
                                             ignore_empty_lines=True)),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(
            read_options=ds.ParquetReadOptions(use_buffered_stream=True)
        ),
        ds.ParquetFileFormat(
            read_options={
                'use_buffered_stream': True,
                'buffer_size': 4096,
            }
        ),
    ]
    for fmt in candidates:
        assert pickle.loads(pickle.dumps(fmt)) == fmt
import time

from .. import util
from .. import error as e
from .. import marshalling as m
from .. import storage as s
from . import Folder

# Import only for static analysis / type references; never executed at runtime.
# noinspection PyUnresolvedReferences
if False:
    from . import store

# Alternative storage configurations kept for reference:
# _PARTITIONING = "hive"
# _FILE_FORMAT = pds.ParquetFileFormat()
# _FILE_FORMAT = pds.CsvFileFormat()
# Active on-disk file format for this module: Arrow IPC (Feather v2).
_FILE_FORMAT = pds.IpcFileFormat()

# Maps textual operator tokens to callables used when building filter
# expressions. NOTE(review): this dict continues past the visible chunk.
# todo: check pds.Expression for more operations that are supported
_OP_MAPPER = {
    '=': operator.eq,
    '==': operator.eq,
    '<': operator.lt,
    '>': operator.gt,
    '<=': operator.le,
    '>=': operator.ge,
    '!=': operator.ne,
    'and': operator.and_,
    'or': operator.or_,
    # Membership test takes (value, collection), unlike the binary operators above.
    'in': lambda _x, _l: _x in _l,