Example #1
import pytest

import pyarrow as pa
import pyarrow.dataset as ds


def test_feather_format(tempdir):
    from pyarrow.feather import write_feather

    table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
                      'b': pa.array([.1, .2, .3], type="float64")})

    basedir = tempdir / "feather_dataset"
    basedir.mkdir()
    write_feather(table, str(basedir / "data.feather"))

    dataset = ds.dataset(basedir, format=ds.IpcFileFormat())
    result = dataset.to_table()
    assert result.equals(table)

    dataset = ds.dataset(basedir, format="feather")
    result = dataset.to_table()
    assert result.equals(table)

    # ARROW-8641 - column selection order
    result = dataset.to_table(columns=["b", "a"])
    assert result.column_names == ["b", "a"]
    result = dataset.to_table(columns=["a", "a"])
    assert result.column_names == ["a", "a"]

    # error with Feather v1 files
    write_feather(table, str(basedir / "data1.feather"), version=1)
    with pytest.raises(ValueError):
        ds.dataset(basedir, format="feather").to_table()
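Feather v2 files use the Arrow IPC file format on disk, which is why format="feather" and ds.IpcFileFormat() behave identically above. A minimal sketch of reading such a directory with a pushed-down row filter (the directory name is reused from the test and assumed to contain only Feather v2 files):

import pyarrow.dataset as ds

dataset = ds.dataset("feather_dataset", format="feather")
# ds.field builds a filter expression that is applied while scanning
result = dataset.to_table(filter=ds.field("a") > 1)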
Example #2
import pickle

import pyarrow.dataset as ds


def test_file_format_pickling():
    formats = [
        ds.IpcFileFormat(),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(read_options=ds.ParquetReadOptions(
            use_buffered_stream=True)),
        ds.ParquetFileFormat(read_options={
            'use_buffered_stream': True,
            'buffer_size': 4096,
        })
    ]
    for file_format in formats:
        assert pickle.loads(pickle.dumps(file_format)) == file_format
Example #3
import pyarrow as pa
import pyarrow.dataset as ds


def test_ipc_format(tempdir):
    table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
                      'b': pa.array([.1, .2, .3], type="float64")})

    path = str(tempdir / 'test.arrow')
    with pa.output_stream(path) as sink:
        writer = pa.RecordBatchFileWriter(sink, table.schema)
        writer.write_batch(table.to_batches()[0])
        writer.close()

    dataset = ds.dataset(path, format=ds.IpcFileFormat())
    result = dataset.to_table()
    assert result.equals(table)

    dataset = ds.dataset(path, format="ipc")
    result = dataset.to_table()
    assert result.equals(table)
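The .arrow file written above is an ordinary Arrow IPC file, so it can also be read back without the dataset layer. A minimal sketch using pyarrow's IPC reader (the path is reused from the test):

import pyarrow as pa

reader = pa.ipc.open_file("test.arrow")
table = reader.read_all()  # yields the same table the dataset API returned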
Example #4
import pickle

import pyarrow as pa
import pyarrow.csv
import pyarrow.dataset as ds


def test_file_format_pickling():
    formats = [
        ds.IpcFileFormat(),
        ds.CsvFileFormat(),
        ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t',
                                             ignore_empty_lines=True)),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(
            read_options=ds.ParquetReadOptions(use_buffered_stream=True)
        ),
        ds.ParquetFileFormat(
            read_options={
                'use_buffered_stream': True,
                'buffer_size': 4096,
            }
        )
    ]
    for file_format in formats:
        assert pickle.loads(pickle.dumps(file_format)) == file_format
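Pickling matters because a dataset's file format, including its read and parse options, may be shipped to worker processes during a scan. A minimal sketch, assuming a standard multiprocessing pool and a hypothetical data directory:

import multiprocessing

import pyarrow.dataset as ds


def count_rows(file_format):
    # file_format arrives here via pickle, with its options intact
    return ds.dataset("data_dir", format=file_format).count_rows()


if __name__ == "__main__":
    fmt = ds.CsvFileFormat()
    with multiprocessing.Pool(processes=1) as pool:
        print(pool.apply(count_rows, (fmt,)))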
Example #5
import operator
import time

import pyarrow.dataset as pds

from .. import util
from .. import error as e
from .. import marshalling as m
from .. import storage as s
from . import Folder

# noinspection PyUnresolvedReferences
if False:
    # import kept for static analysis only; the guard means it never runs
    from . import store

# _PARTITIONING = "hive"
# _FILE_FORMAT = pds.ParquetFileFormat()
# _FILE_FORMAT = pds.CsvFileFormat()
_FILE_FORMAT = pds.IpcFileFormat()


# todo: check pds.Expression for more operations that are supported

_OP_MAPPER = {
    '=': operator.eq,
    '==': operator.eq,
    '<': operator.lt,
    '>': operator.gt,
    '<=': operator.le,
    '>=': operator.ge,
    '!=': operator.ne,
    'and': operator.and_,
    'or': operator.or_,
    'in': lambda _x, _l: _x in _l,
}
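A sketch of how an operator table like this is typically applied to build pyarrow dataset filter expressions; the build_filter helper and the (column, op, value) triple are assumptions for illustration, reusing _OP_MAPPER and the pds import from the module above:

def build_filter(column, op, value):
    # e.g. build_filter("a", ">", 1) evaluates to pds.field("a") > 1
    return _OP_MAPPER[op](pds.field(column), value)

Note that the comparison entries work unchanged on pds.field expressions, while the 'in' lambda performs a plain Python membership test on concrete values rather than building an expression.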