import pickle

import pyarrow as pa
import pyarrow.csv
import pyarrow.dataset as ds


def test_filesystem_factory(mockfs, paths_or_selector):
    # `mockfs` and `paths_or_selector` are pytest fixtures defined
    # elsewhere in this test module.
    format = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={"str"})
    )

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()

    assert factory.inspect().equals(pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]), check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema),
                      ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string())
    )
    # Each scan task corresponds to one partition directory:
    # group=1/key=xxx and group=2/key=yyy, five rows apiece.
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        for batch in task.execute():
            assert batch.num_columns == 6
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_str)
            assert batch[3].equals(expected_const)
            assert batch[4].equals(expected_group)
            assert batch[5].equals(expected_key)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6
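

# A minimal sketch (added for illustration, not from the original suite)
# of how the DirectoryPartitioning used above turns a single path into a
# partition expression. `DirectoryPartitioning.parse()` is assumed to be
# available in this version; the concrete Expression type varies across
# releases, so the check is deliberately loose.
def test_directory_partitioning_parse_sketch():
    partitioning = ds.DirectoryPartitioning(
        pa.schema([pa.field('group', pa.int32()),
                   pa.field('key', pa.string())])
    )
    # '/1/xxx' encodes group=1, key='xxx' under the schema above;
    # a malformed path would raise instead of returning an expression.
    expr = partitioning.parse('/1/xxx')
    assert expr is not None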

def test_parquet_read_options():
    opts1 = ds.ParquetReadOptions()
    opts2 = ds.ParquetReadOptions(buffer_size=4096,
                                  dictionary_columns=['a', 'b'])
    opts3 = ds.ParquetReadOptions(buffer_size=2**13, use_buffered_stream=True,
                                  dictionary_columns={'a', 'b'})

    assert opts1.use_buffered_stream is False
    assert opts1.buffer_size == 2**13
    assert opts1.dictionary_columns == set()

    assert opts2.use_buffered_stream is False
    assert opts2.buffer_size == 2**12
    assert opts2.dictionary_columns == {'a', 'b'}

    assert opts3.use_buffered_stream is True
    assert opts3.buffer_size == 2**13
    assert opts3.dictionary_columns == {'a', 'b'}

    assert opts1 == opts1
    assert opts1 != opts2
    assert opts2 != opts3
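

# A small sketch (not part of the original suite) underlining the equality
# semantics asserted above: dictionary_columns is assumed to be normalized
# to a set regardless of the iterable passed in, so list and set inputs
# yield equal options when everything else is left at its default.
def test_parquet_read_options_normalization_sketch():
    a = ds.ParquetReadOptions(dictionary_columns=['a', 'b'])
    b = ds.ParquetReadOptions(dictionary_columns={'b', 'a'})
    assert a.dictionary_columns == b.dictionary_columns == {'a', 'b'}
    assert a == b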

def test_file_format_pickling():
    formats = [
        ds.IpcFileFormat(),
        ds.CsvFileFormat(),
        ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t',
                                             ignore_empty_lines=True)),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(
            read_options=ds.ParquetReadOptions(use_buffered_stream=True)
        ),
        ds.ParquetFileFormat(
            read_options={
                'use_buffered_stream': True,
                'buffer_size': 4096,
            }
        )
    ]
    for file_format in formats:
        assert pickle.loads(pickle.dumps(file_format)) == file_format
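

# A hedged extension of the pickling test above (illustration only):
# the round-trip assertion relies on format equality, which is assumed
# here to compare the underlying options, so an equal restored format
# implies its ParquetReadOptions survived pickling as well.
def test_parquet_format_pickling_keeps_options_sketch():
    fmt = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={'str'})
    )
    restored = pickle.loads(pickle.dumps(fmt))
    assert restored == fmt
    # Inequality with a differently-configured format shows the
    # comparison is not trivially True.
    assert restored != ds.ParquetFileFormat()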