# The parametrization values below are an assumption inferred from the docstring
# (string, integer and string+integer partition fields); 'partition_by' must be
# supplied somehow for the test to run.
@pytest.mark.parametrize('partition_by', [['string'], ['id'], ['string', 'id']])
def test_string_partition(reader_factory, tmpdir, partition_by):
    """Try datasets partitioned by string, integer and string+integer fields"""
    url = 'file://' + tmpdir.strpath
    data = create_test_scalar_dataset(url, 10, partition_by=partition_by)
    with reader_factory(url) as reader:
        row_ids_batched = [row.id for row in reader]
    # Each element of row_ids_batched is a batch of ids; flatten before comparing counts
    actual_row_ids = list(itertools.chain(*row_ids_batched))
    assert len(data) == len(actual_row_ids)

def test_partitioned_field_is_not_queried(reader_factory, tmpdir):
    """Check that a dataset partitioned by a field can be read when that field is excluded
    from the requested schema_fields"""
    url = 'file://' + tmpdir.strpath
    data = create_test_scalar_dataset(url, 10, partition_by=['id'])
    with reader_factory(url, schema_fields=['string']) as reader:
        all_rows = list(reader)
    assert len(data) == len(all_rows)
    # Only the requested field should appear in the resulting rows
    assert all_rows[0]._fields == ('string',)
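
def _read_projected_columns_with_plain_pyarrow(dataset_path):
    """Illustration only, not part of the original tests: the behavior checked above is
    ordinary parquet column pruning. This hypothetical helper sketches the equivalent
    projection using the standard pyarrow.parquet API."""
    # pq.read_table drops all columns except those listed in 'columns'
    return pq.read_table(dataset_path, columns=['string'])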

def test_asymmetric_parquet_pieces(reader_factory, tmpdir):
    """Check that datasets whose parquet pieces have different numbers of row-groups can be
    fully read"""
    url = 'file://' + tmpdir.strpath
    ROWS_COUNT = 1000

    # Partitioning by id_div_700 forces an asymmetric split between partitions and hopefully
    # gets us files with different numbers of row-groups
    create_test_scalar_dataset(url, ROWS_COUNT, partition_by=['id_div_700'])

    # Verify that we have pieces with different numbers of row-groups
    dataset = pq.ParquetDataset(tmpdir.strpath)
    row_group_counts = set(
        compat_get_metadata(piece, dataset.fs.open).num_row_groups for piece in dataset.pieces)
    assert len(row_group_counts) > 1

    # Make sure we are not missing any rows
    with reader_factory(url, schema_fields=['id']) as reader:
        row_ids_batched = [row.id for row in reader]
    actual_row_ids = list(itertools.chain(*row_ids_batched))
    assert ROWS_COUNT == len(actual_row_ids)
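
def _row_group_counts_with_plain_pyarrow(parquet_file_paths):
    """Illustration only, not part of the original tests: a sketch of the row-group count
    check above without the petastorm compat shim. 'parquet_file_paths' is a hypothetical
    argument holding paths to the individual parquet files of the dataset."""
    # ParquetFile.metadata.num_row_groups reports how many row-groups a single file contains
    return {pq.ParquetFile(path).metadata.num_row_groups for path in parquet_file_paths}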

def _pure_parquet_dataset_no_cache(tmpdir_factory):
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    data = create_test_scalar_dataset(url, 100)
    dataset = SyntheticDataset(url=url, path=path, data=data)
    return dataset
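
# A minimal sketch of how the helper above could be exposed as a session-scoped pytest
# fixture; the fixture name and scope are assumptions and not part of the original code.
@pytest.fixture(scope='session')
def scalar_dataset_no_cache(tmpdir_factory):
    # Build the dataset once per test session and reuse it across tests
    return _pure_parquet_dataset_no_cache(tmpdir_factory)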