def test_open_dataset_list_of_files(tempdir):
    """A list of exact file paths must round-trip the concatenated table.

    NOTE(review): a second definition of this test appears later in the
    file and shadows this one at pytest collection time — confirm which
    copy should be kept.
    """
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)
    # list of exact files needs to be passed to source() function
    # (dataset() will interpret it as separate sources)
    for dataset in [
            ds.dataset(ds.source([path1, path2])),
            ds.dataset(ds.source([str(path1), str(path2)]))]:
        assert dataset.schema.equals(table.schema, check_metadata=False)
        # use the current Dataset.to_table API instead of the removed
        # new_scan().finish() chain; single-threaded keeps row order
        # deterministic so the equality check is stable
        result = dataset.to_table(use_threads=False)
        assert result.equals(table, check_metadata=False)
def test_open_dataset_list_of_files(tempdir):
    """Exact file lists must go through source(); passing the list to
    dataset() directly would be interpreted as separate sources."""
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    expected = pa.concat_tables(tables)

    path_variants = (
        [path1, path2],                  # pathlib objects
        [str(path1), str(path2)],        # plain string paths
    )
    for paths in path_variants:
        dataset = ds.dataset(ds.source(paths))
        assert dataset.schema.equals(expected.schema, check_metadata=False)
        # single-threaded scan keeps row order deterministic
        result = dataset.to_table(use_threads=False)
        assert result.equals(expected, check_metadata=False)
def test_dataset_factory(multisourcefs):
    """DatasetFactory wraps its sources and exposes their schemas."""
    src = ds.source('/plain', filesystem=multisourcefs, format='parquet')
    factory = ds.DatasetFactory([src])

    # hoisted: the original called inspect_schemas() three times, each of
    # which re-inspects the underlying files
    schemas = factory.inspect_schemas()

    assert len(factory.sources) == 1
    assert len(schemas) == 1
    assert all(isinstance(s, ds.SourceFactory) for s in factory.sources)
    assert all(isinstance(s, pa.Schema) for s in schemas)
    assert schemas[0].equals(src.inspect())
    # factory-level inspect() must agree with the single source's schema
    assert factory.inspect().equals(src.inspect())
    assert isinstance(factory.finish(), ds.Dataset)
def _check_dataset_from_path(path, table, **kwargs):
    """Assert a dataset built from *path* (in every supported form)
    round-trips *table*.

    NOTE(review): this helper is redefined later in the file and the later
    definition shadows this one — confirm which copy should be kept.
    """
    import pathlib

    assert isinstance(path, pathlib.Path)

    # pathlib object
    dataset = ds.dataset(ds.source(path, **kwargs))
    assert dataset.schema.equals(table.schema, check_metadata=False)
    # current API: to_table() replaces the removed new_scan().finish()
    # chain; single-threaded keeps row order deterministic
    result = dataset.to_table(use_threads=False)
    assert result.equals(table, check_metadata=False)

    # string path
    dataset = ds.dataset(ds.source(str(path), **kwargs))
    assert dataset.schema.equals(table.schema, check_metadata=False)
    result = dataset.to_table(use_threads=False)
    assert result.equals(table, check_metadata=False)

    # passing directly to dataset
    dataset = ds.dataset(str(path), **kwargs)
    assert dataset.schema.equals(table.schema, check_metadata=False)
    result = dataset.to_table(use_threads=False)
    assert result.equals(table, check_metadata=False)
def _check_dataset_from_path(path, table, **kwargs):
    """Assert a dataset built from *path* (in every supported form)
    round-trips *table*."""
    import pathlib

    assert isinstance(path, pathlib.Path)

    # the original repeated the same three-assert stanza for each variant;
    # collapse into one loop over the equivalent construction forms
    datasets = [
        ds.dataset(ds.source(path, **kwargs)),        # pathlib object
        ds.dataset(ds.source(str(path), **kwargs)),   # string path
        ds.dataset(str(path), **kwargs),              # directly to dataset
    ]
    for dataset in datasets:
        assert dataset.schema.equals(table.schema, check_metadata=False)
        # single-threaded scan keeps row order deterministic
        result = dataset.to_table(use_threads=False)
        assert result.equals(table, check_metadata=False)
def test_multiple_sources(multisourcefs):
    """A dataset assembled from several sources exposes the unified schema."""
    sources = [
        ds.source('/plain', filesystem=multisourcefs, format='parquet'),
        ds.source('/schema', filesystem=multisourcefs, format='parquet',
                  partitioning=['week', 'color']),
        ds.source('/hive', filesystem=multisourcefs, format='parquet',
                  partitioning='hive'),
    ]

    assembled = ds.dataset(sources)
    assert isinstance(assembled, ds.Dataset)

    # union of the per-source schemas, including partition columns
    expected_schema = pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('week', pa.int32()),
        ('month', pa.int32()),
        ('year', pa.int32()),
    ])
    assert assembled.schema.equals(expected_schema, check_metadata=False)
def test_open_dataset_from_source_additional_kwargs(tempdir):
    """dataset() must reject extra keyword arguments for an already-built
    source."""
    path = _create_single_file(tempdir)[1]
    source = ds.source(path)
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(source, format="parquet")