import itertools

# Test helper from petastorm's own test suite (import path assumed from the
# petastorm repository layout).
from petastorm.tests.test_common import create_test_scalar_dataset


def test_string_partition(reader_factory, tmpdir, partition_by):
    """Try datasets partitioned by a string field, an integer field, and a
    string+integer field combination."""
    url = 'file://' + tmpdir.strpath

    data = create_test_scalar_dataset(url, 10, partition_by=partition_by)
    with reader_factory(url) as reader:
        row_ids_batched = [row.id for row in reader]
    actual_row_ids = list(itertools.chain(*row_ids_batched))
    assert len(data) == len(actual_row_ids)
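
# A minimal, self-contained sketch (separate from the test above; the path is
# hypothetical) of the on-disk layout that partitioning by a column produces:
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'id': [0, 1, 2, 3], 'string': ['a', 'b', 'c', 'd']})
pq.write_to_dataset(table, root_path='/tmp/partition_demo', partition_cols=['id'])
# Results in a Hive-style layout: /tmp/partition_demo/id=0/..., id=1/..., etc.
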
def test_partitioned_field_is_not_queried(reader_factory, tmpdir):
    """Try datasets partitioned by a string, integer and string+integer fields"""
    url = 'file://' + tmpdir.strpath

    data = create_test_scalar_dataset(url, 10, partition_by=['id'])
    with reader_factory(url, schema_fields=['string']) as reader:
        all_rows = list(reader)
    assert len(data) == len(all_rows)
    assert all_rows[0]._fields == ('string',)
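
# A minimal sketch (assumed usage; the URL is hypothetical) of selecting a
# column subset with petastorm's make_batch_reader:
from petastorm import make_batch_reader

with make_batch_reader('file:///tmp/partition_demo', schema_fields=['string']) as reader:
    for batch in reader:
        # Only the requested column is materialized in each namedtuple batch.
        print(batch._fields)
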
Example #3
import itertools

import pyarrow.parquet as pq

# petastorm-internal import paths below are assumed from the repository layout.
from petastorm.compat import compat_get_metadata
from petastorm.tests.test_common import create_test_scalar_dataset


def test_asymetric_parquet_pieces(reader_factory, tmpdir):
    """Check that a dataset whose parquet pieces contain different numbers of
    row-groups can be fully read."""
    url = 'file://' + tmpdir.strpath

    ROWS_COUNT = 1000
    # Partitioning by id_div_700 forces an asymmetric split between partitions
    # and should produce files with different numbers of row-groups.
    create_test_scalar_dataset(url, ROWS_COUNT, partition_by=['id_div_700'])

    # Verify that the pieces have different numbers of row-groups.
    dataset = pq.ParquetDataset(tmpdir.strpath)
    row_group_counts = set(compat_get_metadata(piece, dataset.fs.open).num_row_groups for piece in dataset.pieces)
    assert len(row_group_counts) > 1

    # Make sure we are not missing any rows.
    with reader_factory(url, schema_fields=['id']) as reader:
        row_ids_batched = [row.id for row in reader]
        actual_row_ids = list(itertools.chain(*row_ids_batched))

    assert ROWS_COUNT == len(actual_row_ids)
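
# A sketch (path hypothetical) of counting row-groups per file with the stable
# pyarrow ParquetFile API, instead of the compat shim used by the test above:
import os
import pyarrow.parquet as pq

root = '/tmp/partition_demo'
for dirpath, _, filenames in os.walk(root):
    for name in filenames:
        if name.endswith('.parquet'):
            full_path = os.path.join(dirpath, name)
            print(full_path, pq.ParquetFile(full_path).metadata.num_row_groups)
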
Example #4
from petastorm.tests.test_common import SyntheticDataset, create_test_scalar_dataset  # path assumed


def _pure_parquet_dataset_no_cache():
    # Nested inside a pytest fixture in the original source, so the standard
    # `tmpdir_factory` fixture is in scope as a closure variable.
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    data = create_test_scalar_dataset(url, 100)
    return SyntheticDataset(url=url, path=path, data=data)
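
# A minimal sketch (an assumption, not petastorm's actual conftest code) of the
# kind of session-scoped fixture such a helper is nested in:
import pytest


@pytest.fixture(scope='session')
def synthetic_dataset(tmpdir_factory):
    def _pure_parquet_dataset_no_cache():
        # ... body as above, closing over tmpdir_factory ...
        pass

    return _pure_parquet_dataset_no_cache()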