def source(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Open a (multi-file) data source.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning(Factory), str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    format : str, default None
        Currently only "parquet" is supported. A falsy value (such as the
        default ``None``) is treated as "parquet".

    Returns
    -------
    FileSystemSourceFactory
        Factory built from the resolved filesystem, paths (or selector),
        file format and partitioning options.
    """
    resolved_fs, resolved_paths = _ensure_fs_and_paths(
        path_or_paths, filesystem)
    scheme = _ensure_partitioning(partitioning)
    file_format = _ensure_format(format or "parquet")

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()
    # A partitioning factory and a concrete partitioning are stored on
    # different option attributes; anything else leaves the options as-is.
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    return FileSystemSourceFactory(
        resolved_fs, resolved_paths, file_format, factory_options)
def source(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Open a (multi-file) data source.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning(Factory), str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    format : str or FileFormat, default None
        Currently only "parquet" is supported as a string. A falsy value
        (such as the default ``None``) is treated as "parquet". An existing
        FileFormat instance is passed through unchanged.

    Returns
    -------
    FileSystemSourceFactory
        A factory rather than a finished source is returned, since
        ``dataset`` might need to finish the factory with a unified schema.

    Raises
    ------
    ValueError
        If ``format`` is neither "parquet" nor a FileFormat instance.
    """
    fs, paths_or_selector = _ensure_fs_and_paths(path_or_paths, filesystem)
    scheme = _ensure_partitioning(partitioning)

    # Fall back to parquet when no format was given, then turn the
    # shortcut string into a concrete FileFormat object.
    if not format:
        format = "parquet"
    if format == "parquet":
        format = ParquetFileFormat()
    elif not isinstance(format, FileFormat):
        raise ValueError("format '{0}' is not supported".format(format))

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    # TODO return Source if a specific schema was passed?
    return FileSystemSourceFactory(
        fs, paths_or_selector, format, factory_options)
def factory(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Create a factory which can be used to build a Dataset.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths. Anything that is not a list or tuple is treated
        as a single path.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    format : str, default None
        Currently only "parquet" is supported. A falsy value (such as the
        default ``None``) is treated as "parquet".

    Returns
    -------
    FileSystemDatasetFactory or UnionDatasetFactory
        A single FileSystemDatasetFactory when exactly one path is given,
        otherwise a UnionDatasetFactory wrapping one factory per path.

    Raises
    ------
    ValueError
        If an empty list or tuple of paths is given.
    """
    if isinstance(path_or_paths, (list, tuple)):
        all_paths = path_or_paths
    else:
        all_paths = [path_or_paths]

    scheme = _ensure_partitioning(partitioning)
    file_format = _ensure_format(format or "parquet")

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    def _build_single(path):
        # Each path is resolved on its own; the format and options objects
        # are shared by every factory.
        fs, paths_or_selector = _ensure_fs_and_paths(path, filesystem)
        return FileSystemDatasetFactory(
            fs, paths_or_selector, file_format, factory_options)

    built = [_build_single(path) for path in all_paths]

    if not built:
        raise ValueError("Need at least one path")
    if len(built) == 1:
        return built[0]
    return UnionDatasetFactory(built)