def _filesystem_dataset(source, schema=None, filesystem=None,
                        partitioning=None, format=None,
                        partition_base_dir=None, exclude_invalid_files=None,
                        selector_ignore_prefixes=None):
    """
    Build a FileSystemDataset from one source or a list/tuple of sources.

    The format (falling back to 'parquet' when not given) and the
    partitioning are normalized first, then the source(s) are resolved
    into a filesystem plus paths/selector, and finally a
    FileSystemDatasetFactory is created and finished with ``schema``.

    Parameters are documented in the dataset function.

    Returns
    -------
    FileSystemDataset
    """
    file_format = _ensure_format(format or 'parquet')
    scheme = _ensure_partitioning(partitioning)

    # A list or tuple is resolved as several sources at once; anything
    # else is handled as a single source.
    resolve = (
        _ensure_multiple_sources
        if isinstance(source, (list, tuple))
        else _ensure_single_source
    )
    resolved_fs, paths_or_selector = resolve(source, filesystem)

    factory_options = FileSystemFactoryOptions(
        partitioning=scheme,
        partition_base_dir=partition_base_dir,
        exclude_invalid_files=exclude_invalid_files,
        selector_ignore_prefixes=selector_ignore_prefixes,
    )
    return FileSystemDatasetFactory(
        resolved_fs, paths_or_selector, file_format, factory_options
    ).finish(schema)
def factory(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Create a factory which can be used to build a Dataset.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    format : str, default None
        Currently only "parquet" is supported; "parquet" is also what is
        used when no format is given.

    Returns
    -------
    FileSystemDatasetFactory
    """
    resolved_fs, paths_or_selector = _ensure_fs_and_paths(
        path_or_paths, filesystem
    )
    scheme = _ensure_partitioning(partitioning)
    file_format = _ensure_format(format or "parquet")

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()
    # The normalized partitioning goes into a different option slot
    # depending on whether it is a factory or a concrete scheme; any other
    # value leaves the default options untouched.
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    return FileSystemDatasetFactory(
        resolved_fs, paths_or_selector, file_format, factory_options
    )
def factory(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Create a factory which can be used to build a Dataset.

    Every given path gets its own FileSystemDatasetFactory; when more than
    one path is given, the per-path factories are combined into a
    UnionDatasetFactory.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    format : str, default None
        Currently only "parquet" is supported; "parquet" is also what is
        used when no format is given.

    Returns
    -------
    FileSystemDatasetFactory or UnionDatasetFactory
        The single factory when exactly one path was given, otherwise a
        union over the per-path factories.

    Raises
    ------
    ValueError
        If an empty list or tuple of paths is given.
    """
    if isinstance(path_or_paths, (list, tuple)):
        sources = path_or_paths
    else:
        sources = [path_or_paths]

    scheme = _ensure_partitioning(partitioning)
    file_format = _ensure_format(format or "parquet")

    # TODO pass through options
    shared_options = FileSystemFactoryOptions()
    if isinstance(scheme, PartitioningFactory):
        shared_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        shared_options.partitioning = scheme

    def _factory_for(path):
        # Resolve the filesystem per path; the format and the options
        # object are shared between all per-path factories.
        path_fs, paths_or_selector = _ensure_fs_and_paths(path, filesystem)
        return FileSystemDatasetFactory(
            path_fs, paths_or_selector, file_format, shared_options
        )

    per_path_factories = [_factory_for(path) for path in sources]

    if not per_path_factories:
        raise ValueError("Need at least one path")
    if len(per_path_factories) == 1:
        return per_path_factories[0]
    return UnionDatasetFactory(per_path_factories)