def _check_dataset_from_path(path, table, **kwargs):
    """Check that a dataset created from *path* matches *table*.

    The path is exercised in several equivalent forms — pathlib object,
    absolute string, relative string (via a changed cwd), and passed
    directly to ``ds.dataset`` — all of which must yield the same schema
    and the same rows.
    """
    import pathlib

    # This helper is only meant to be called with pathlib.Path inputs.
    assert isinstance(path, pathlib.Path)

    def _assert_matches(dataset):
        # Schema and (deterministically ordered) contents must equal *table*.
        assert dataset.schema.equals(table.schema)
        result = dataset.to_table(use_threads=False)  # deterministic row order
        assert result.equals(table)

    # pathlib object
    _assert_matches(ds.dataset(ds.factory(path, **kwargs)))

    # string path
    _assert_matches(ds.dataset(ds.factory(str(path), **kwargs)))

    # relative string path
    with change_cwd(path.parent):
        _assert_matches(ds.dataset(ds.factory(path.name, **kwargs)))

    # passing directly to dataset
    _assert_matches(ds.dataset(str(path), **kwargs))
def test_open_dataset_list_of_files(tempdir):
    """Opening a dataset from an explicit list of files."""
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    expected = pa.concat_tables(tables)

    # A list of exact files needs to go through the factory() function;
    # dataset() would interpret each list entry as a separate source.
    path_lists = [
        [path1, path2],
        [str(path1), str(path2)],
    ]
    for paths in path_lists:
        dataset = ds.dataset(ds.factory(paths))
        assert dataset.schema.equals(expected.schema)
        result = dataset.to_table(use_threads=False)  # deterministic row order
        assert result.equals(expected)
def test_dataset_factory(multisourcefs):
    """UnionDatasetFactory wrapping a single child factory.

    The union's inspected schemas and finished dataset must agree with
    the lone child factory.
    """
    child = ds.factory('/plain', filesystem=multisourcefs, format='parquet')
    factory = ds.UnionDatasetFactory([child])

    # TODO(bkietz) reintroduce factory.children property

    # Hoist the repeated inspect_schemas() call instead of recomputing
    # it for every assertion.
    schemas = factory.inspect_schemas()
    assert len(schemas) == 1
    assert all(isinstance(s, pa.Schema) for s in schemas)
    assert schemas[0].equals(child.inspect())

    assert factory.inspect().equals(child.inspect())
    assert isinstance(factory.finish(), ds.Dataset)
def test_multiple_factories(multisourcefs):
    """Assembling a single dataset from several child factories."""
    factories = [
        ds.factory('/plain', filesystem=multisourcefs, format='parquet'),
        ds.factory('/schema', filesystem=multisourcefs, format='parquet',
                   partitioning=['week', 'color']),
        ds.factory('/hive', filesystem=multisourcefs, format='parquet',
                   partitioning='hive'),
    ]
    assembled = ds.dataset(factories)
    assert isinstance(assembled, ds.Dataset)

    # The unified schema combines the columns of all three sources,
    # including the partition fields.
    expected_schema = pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('week', pa.int32()),
        ('year', pa.int32()),
        ('month', pa.int32()),
    ])
    assert assembled.schema.equals(expected_schema)
def _check_dataset_from_path(path, table, **kwargs):
    """Check that a dataset created from *path* matches *table*.

    The path is exercised as a pathlib object, as a string through the
    factory, and passed directly to ``ds.dataset``. Schema/table
    metadata is ignored in all comparisons (``check_metadata=False``).
    """
    import pathlib

    # This helper is only meant to be called with pathlib.Path inputs.
    assert isinstance(path, pathlib.Path)

    def _assert_matches(dataset):
        # Schema and (deterministically ordered) contents must equal
        # *table*, ignoring metadata.
        assert dataset.schema.equals(table.schema, check_metadata=False)
        result = dataset.to_table(use_threads=False)  # deterministic row order
        assert result.equals(table, check_metadata=False)

    # pathlib object
    _assert_matches(ds.dataset(ds.factory(path, **kwargs)))

    # string path
    _assert_matches(ds.dataset(ds.factory(str(path), **kwargs)))

    # passing directly to dataset
    _assert_matches(ds.dataset(str(path), **kwargs))
def _check_dataset_from_path(path, table, **kwargs):
    """Verify datasets built from *path* in every supported input form.

    Single-path inputs (pathlib or string, via the factory or directly)
    must produce a FileSystemDataset; list-of-paths inputs — even of
    length one — must produce a UnionDataset.
    """
    import pathlib

    # This helper is only meant to be called with pathlib.Path inputs.
    assert isinstance(path, pathlib.Path)

    # Factory-based construction: pathlib object, then absolute string.
    for source in (path, str(path)):
        dataset = ds.dataset(ds.factory(source, **kwargs))
        assert isinstance(dataset, ds.FileSystemDataset)
        _check_dataset(dataset, table)

    # Factory-based construction from a relative string path.
    with change_cwd(path.parent):
        dataset = ds.dataset(ds.factory(path.name, **kwargs))
        assert isinstance(dataset, ds.FileSystemDataset)
        _check_dataset(dataset, table)

    # Passing the path (pathlib or string) directly to ds.dataset().
    for source in (path, str(path)):
        dataset = ds.dataset(source, **kwargs)
        assert isinstance(dataset, ds.FileSystemDataset)
        _check_dataset(dataset, table)

    # A list of files (even of length 1) gives a UnionDataset.
    for source in ([path], [str(path)]):
        dataset = ds.dataset(source, **kwargs)
        assert isinstance(dataset, ds.UnionDataset)
        _check_dataset(dataset, table)