def test_construct_from_invalid_sources_raise(multisourcefs):
    child1 = ds.FileSystemDatasetFactory(
        multisourcefs,
        fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    child2 = ds.FileSystemDatasetFactory(
        multisourcefs,
        fs.FileSelector('/schema'),
        format=ds.ParquetFileFormat()
    )

    with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'):
        ds.dataset([child1, child2])

    expected = (
        "Expected a list of path-like or dataset objects. The given list "
        "contains the following types: int"
    )
    with pytest.raises(TypeError, match=expected):
        ds.dataset([1, 2, 3])

    expected = (
        "Expected a path-like, list of path-likes or a list of Datasets "
        "instead of the given type: NoneType"
    )
    with pytest.raises(TypeError, match=expected):
        ds.dataset(None)

def test_file_format_pickling():
    formats = [
        ds.IpcFileFormat(),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(read_options=ds.ParquetReadOptions(
            use_buffered_stream=True)),
        ds.ParquetFileFormat(read_options={
            'use_buffered_stream': True,
            'buffer_size': 4096,
        })
    ]
    for file_format in formats:
        assert pickle.loads(pickle.dumps(file_format)) == file_format

def test_filesystem_data_source(mockfs):
    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    source = ds.FileSystemDataSource(mockfs, paths, partitions,
                                     source_partition=None,
                                     file_format=file_format)

    source_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('source'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    source = ds.FileSystemDataSource(mockfs, paths, partitions,
                                     source_partition=source_partition,
                                     file_format=file_format)
    assert source.partition_expression.equals(source_partition)

def test_partitioning_factory(mockfs):
    paths_or_selector = fs.FileSelector('subdir', recursive=True)
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key'])
    assert isinstance(partitioning_factory, ds.PartitioningFactory)
    options.partitioning_factory = partitioning_factory

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()
    # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths
    expected_schema = pa.schema([
        ("i64", pa.int64()),
        ("f64", pa.float64()),
        ("str", pa.string()),
        ("group", pa.int32()),
        ("key", pa.string()),
    ])
    assert inspected_schema.equals(expected_schema)

    hive_partitioning_factory = ds.HivePartitioning.discover()
    assert isinstance(hive_partitioning_factory, ds.PartitioningFactory)

def test_open_dataset_from_source_additional_kwargs(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(child, format="parquet")

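# A minimal sketch (not part of the original tests), assuming a single
# factory is accepted by ds.dataset() when no extra keyword arguments are
# passed, as the error in the test above implies. The function name is
# hypothetical.
def example_open_dataset_from_factory(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    # without additional kwargs the factory itself is materialized
    dataset = ds.dataset(child)
    assert isinstance(dataset, ds.Dataset)
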
def test_filesystem_source(mockfs):
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    source = ds.FileSystemSource(schema,
                                 source_partition=None,
                                 file_format=file_format,
                                 filesystem=mockfs,
                                 paths_or_selector=paths,
                                 partitions=partitions)

    source_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('source'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    source = ds.FileSystemSource(paths_or_selector=paths,
                                 schema=schema,
                                 source_partition=source_partition,
                                 filesystem=mockfs,
                                 partitions=partitions,
                                 file_format=file_format)
    assert source.partition_expression.equals(source_partition)

def test_filesystem_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={"str"})
    )

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()

    assert factory.inspect().equals(pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]), check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema),
                      ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string())
    )
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        for batch in task.execute():
            assert batch.num_columns == 6
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_str)
            assert batch[3].equals(expected_const)
            assert batch[4].equals(expected_group)
            assert batch[5].equals(expected_key)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6

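# A minimal sketch (not part of the test above), mirroring the
# to_table(columns=..., filter=...) call used in the reader code later in
# this section. It assumes the filter keyword is available on
# FileSystemDataset.to_table in this version; the expected row count follows
# from the 5-rows-per-group layout asserted above. The function name is
# hypothetical.
def example_filtered_read(mockfs, paths_or_selector):
    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, ds.ParquetFileFormat(),
        ds.FileSystemFactoryOptions('subdir')
    )
    dataset = factory.finish()
    # only the file with const == 0 (group 1) should contribute rows
    filtered = dataset.to_table(filter=ds.field('const') == 0)
    assert len(filtered) == 5
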
def dataset(mockfs):
    format = ds.ParquetFileFormat()
    selector = fs.FileSelector('subdir', recursive=True)
    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
    return factory.finish()

def test_file_format_pickling():
    formats = [
        ds.IpcFileFormat(),
        ds.CsvFileFormat(),
        ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t',
                                             ignore_empty_lines=True)),
        ds.ParquetFileFormat(),
        ds.ParquetFileFormat(
            read_options=ds.ParquetReadOptions(use_buffered_stream=True)
        ),
        ds.ParquetFileFormat(
            read_options={
                'use_buffered_stream': True,
                'buffer_size': 4096,
            }
        )
    ]
    for file_format in formats:
        assert pickle.loads(pickle.dumps(file_format)) == file_format

def test_filesystem_dataset(mockfs):
    schema = pa.schema([pa.field('const', pa.int64())])
    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(schema,
                                   root_partition=None,
                                   file_format=file_format,
                                   filesystem=mockfs,
                                   paths_or_selector=paths,
                                   partitions=partitions)
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(paths_or_selector=paths,
                                   schema=schema,
                                   root_partition=root_partition,
                                   filesystem=mockfs,
                                   partitions=partitions,
                                   file_format=file_format)
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(
            ds.AndExpression(root_partition, partition))
        assert fragment.path == path
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # test predicate pushdown using row group metadata
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
    assert len(list(fragments[0].get_row_group_fragments())) == 1
    assert len(list(fragments[1].get_row_group_fragments())) == 0

def test_filesystem_dataset(mockfs):
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(
        schema,
        root_partition=None,
        file_format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        file_format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    assert fragments[0].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[0]))
    assert fragments[1].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[1]))
    assert fragments[0].path == paths[0]
    assert fragments[1].path == paths[1]

def dataset(mockfs):
    format = ds.ParquetFileFormat()
    selector = fs.FileSelector('subdir', recursive=True)
    options = ds.FileSystemDiscoveryOptions('subdir')
    options.partition_scheme = ds.SchemaPartitionScheme(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    discovery = ds.FileSystemDataSourceDiscovery(mockfs, selector, format,
                                                 options)
    schema = discovery.inspect()
    source = discovery.finish()
    return ds.Dataset([source], schema)

def test_dataset_union(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    factory = ds.UnionDatasetFactory([child])

    # TODO(bkietz) reintroduce factory.children property
    assert len(factory.inspect_schemas()) == 1
    assert all(isinstance(s, pa.Schema) for s in factory.inspect_schemas())
    assert factory.inspect_schemas()[0].equals(child.inspect())
    assert factory.inspect().equals(child.inspect())
    assert isinstance(factory.finish(), ds.Dataset)

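# A minimal sketch (not part of the test above), assuming the '/schema'
# directory used elsewhere in this section yields a schema that can be
# unified with '/plain': a union factory may wrap several children before
# a single dataset is materialized. The function name is hypothetical.
def example_union_of_multiple_children(multisourcefs):
    children = [
        ds.FileSystemDatasetFactory(
            multisourcefs, fs.FileSelector(path, recursive=True),
            format=ds.ParquetFileFormat()
        )
        for path in ['/plain', '/schema']
    ]
    union = ds.UnionDatasetFactory(children)
    assert len(union.inspect_schemas()) == 2
    assert isinstance(union.finish(), ds.Dataset)
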
def test_file_system_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is True

    factory = ds.FileSystemSourceFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()

    assert isinstance(factory.inspect(), pa.Schema)
    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema), ds.FileSystemSource)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    source = factory.finish()
    assert isinstance(source, ds.Source)

    dataset = ds.Dataset([source], inspected_schema)

    scanner = dataset.new_scan().finish()
    assert len(list(scanner.scan())) == 2

    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group_column = pa.array([group] * 5, type=pa.int32())
        expected_key_column = pa.array([key] * 5, type=pa.string())
        for batch in task.execute():
            assert batch.num_columns == 4
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_group_column)
            assert batch[3].equals(expected_key_column)

    table = scanner.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 4

def test_make_fragment(multisourcefs):
    parquet_format = ds.ParquetFileFormat()
    dataset = ds.dataset('/plain', filesystem=multisourcefs,
                         format=parquet_format)

    for path in dataset.files:
        fragment = parquet_format.make_fragment(path, multisourcefs)
        row_group_fragment = parquet_format.make_fragment(path, multisourcefs,
                                                          row_groups=[0])
        for f in [fragment, row_group_fragment]:
            assert isinstance(f, ds.ParquetFileFragment)
            assert f.path == path
            assert isinstance(f.filesystem, type(multisourcefs))
        assert fragment.row_groups is None
        assert row_group_fragment.row_groups == {0}

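# A minimal sketch (not part of the test above), assuming fragments in this
# version expose to_table(): a fragment created by make_fragment can be
# materialized on its own, independently of the dataset it came from. The
# function name is hypothetical.
def example_read_single_fragment(multisourcefs):
    parquet_format = ds.ParquetFileFormat()
    dataset = ds.dataset('/plain', filesystem=multisourcefs,
                         format=parquet_format)
    fragment = parquet_format.make_fragment(dataset.files[0], multisourcefs)
    table = fragment.to_table()
    assert isinstance(table, pa.Table)
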
def _read_data(self) -> None:
    _logger.debug("Starting data read")
    # only scan the columns projected and in our file
    cols_to_read = prune_columns(self._file_schema, self._expected_schema)

    with profile("read data", self._stats):
        try:
            read_fs = FS_MAP[type(self._input.fs)]
        except KeyError:
            raise FileSystemNotFound(
                f"No mapped filesystem found for {type(self._input.fs)}")

        arrow_dataset = ds.FileSystemDataset.from_paths(
            [self._input.location()],
            schema=self._arrow_file.schema_arrow,
            format=ds.ParquetFileFormat(),
            filesystem=read_fs())
        arrow_table = arrow_dataset.to_table(columns=cols_to_read,
                                             filter=self._filter)

    # process schema evolution if needed
    with profile("schema_evol_proc", self._stats):
        processed_tbl = self.migrate_schema(arrow_table)
        for i, field in self.get_missing_fields():
            dtype_func = DTYPE_MAP.get(field.type.type_id)
            if dtype_func is None:
                raise RuntimeError(
                    "Unable to create null column for type %s"
                    % field.type.type_id)
            dtype = dtype_func(field)
            processed_tbl = (processed_tbl.add_column(
                i,
                pa.field(field.name, dtype[0], True, None),
                ParquetReader.create_null_column(processed_tbl[0], dtype)))

    self._table = processed_tbl
    self.materialized_table = True

def test_filesystem_dataset(mockfs):
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.scalar(True), ds.scalar(True)]

    dataset = ds.FileSystemDataset(
        schema=schema,
        format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema,
        format=file_format,
        filesystem=mockfs,
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # validation of required arguments
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format)

    # validation of root_partition
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format,
                             filesystem=mockfs, root_partition=1)

    root_partition = ds.field('level') == ds.scalar(1337)
    partitions = [ds.field('part') == x for x in range(1, 3)]

    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(partition)
        assert fragment.path == path
        assert isinstance(fragment.format, ds.ParquetFileFormat)
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2