def dataset(paths_or_factories, filesystem=None, partitioning=None,
            format=None, schema=None):
    """
    Open a dataset.

    Parameters
    ----------
    paths_or_factories : path or list of paths or factory or list of factories
        Path to a file or to a directory containing the data files, or a
        list of paths for a multi-directory dataset. For more control, a
        list of factories created with the ``factory()`` function can be
        passed instead (in that case the additional keywords are ignored).
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    format : str
        Currently only "parquet" is supported.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it
        will not be inferred from the source.

    Returns
    -------
    Dataset

    Notes
    -----
    Only a ``list`` is treated as a collection of sources; any other
    argument is handled as one single source. A list always goes through
    a ``UnionDatasetFactory``, even when it holds a single element.

    Examples
    --------
    Opening a dataset for a single directory:

    >>> dataset("path/to/nyc-taxi/", format="parquet")

    Construction from multiple factories:

    >>> dataset([
    ...     factory("s3://old-taxi-data", format="parquet"),
    ...     factory("local/path/to/new/data", format="parquet")
    ... ])
    """
    # Anything that is not a list describes exactly one source.
    is_single = not isinstance(paths_or_factories, list)
    sources = [paths_or_factories] if is_single else paths_or_factories

    # Turn every source into a factory, forwarding the shared keywords.
    built = []
    for source in sources:
        built.append(
            _ensure_factory(
                source,
                filesystem=filesystem,
                partitioning=partitioning,
                format=format,
            )
        )

    if is_single:
        combined = built[0]
    else:
        combined = UnionDatasetFactory(built)
    return combined.finish(schema=schema)
def factory(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Create a factory which can be used to build a Dataset.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list (or tuple) of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    format : str, default None
        Currently only "parquet" is supported. ``None`` (or any other
        falsy value) falls back to "parquet".

    Returns
    -------
    FileSystemDatasetFactory or UnionDatasetFactory
        A ``FileSystemDatasetFactory`` when exactly one path is given,
        otherwise a ``UnionDatasetFactory`` wrapping one
        ``FileSystemDatasetFactory`` per path.

    Raises
    ------
    ValueError
        If ``path_or_paths`` is an empty list or tuple.
    """
    if not isinstance(path_or_paths, (list, tuple)):
        path_or_paths = [path_or_paths]

    # Fail fast: reject empty input before resolving partitioning/format
    # or building any options, none of which could be used anyway.
    if not path_or_paths:
        raise ValueError("Need at least one path")

    partitioning = _ensure_partitioning(partitioning)
    format = _ensure_format(format or "parquet")

    # TODO pass through options
    options = FileSystemFactoryOptions()
    # Only one of the two option slots is set, depending on whether the
    # resolved partitioning is a factory or a concrete Partitioning; any
    # other value leaves the options untouched.
    if isinstance(partitioning, PartitioningFactory):
        options.partitioning_factory = partitioning
    elif isinstance(partitioning, Partitioning):
        options.partitioning = partitioning

    # One factory per path; all of them share the same format and options.
    factories = []
    for path in path_or_paths:
        fs, paths_or_selector = _ensure_fs_and_paths(path, filesystem)
        factories.append(
            FileSystemDatasetFactory(fs, paths_or_selector, format, options)
        )

    if len(factories) == 1:
        return factories[0]
    return UnionDatasetFactory(factories)