コード例 #1
0
ファイル: dataset.py プロジェクト: astrojams1/cleanstreets
def _filesystem_dataset(source,
                        schema=None,
                        filesystem=None,
                        partitioning=None,
                        format=None,
                        partition_base_dir=None,
                        exclude_invalid_files=None,
                        selector_ignore_prefixes=None):
    """
    Build a FileSystemDataset from a single source or a list of sources.

    Parameters are documented in the dataset function.

    Notes
    -----
    A falsy ``format`` falls back to ``'parquet'``. A list or tuple
    ``source`` is resolved with ``_ensure_multiple_sources``; anything else
    is resolved with ``_ensure_single_source``.

    Returns
    -------
    FileSystemDataset
    """
    file_format = _ensure_format(format or 'parquet')
    scheme = _ensure_partitioning(partitioning)

    # Pick the resolver matching the shape of ``source``; both resolvers
    # return a (filesystem, paths-or-selector) pair.
    resolve_source = (
        _ensure_multiple_sources
        if isinstance(source, (list, tuple))
        else _ensure_single_source
    )
    resolved_fs, paths_or_selector = resolve_source(source, filesystem)

    factory_options = FileSystemFactoryOptions(
        partitioning=scheme,
        partition_base_dir=partition_base_dir,
        exclude_invalid_files=exclude_invalid_files,
        selector_ignore_prefixes=selector_ignore_prefixes,
    )
    dataset_factory = FileSystemDatasetFactory(
        resolved_fs, paths_or_selector, file_format, factory_options)

    # ``schema`` may be None; it is handed to ``finish`` unchanged.
    return dataset_factory.finish(schema)
コード例 #2
0
ファイル: dataset.py プロジェクト: marc9595/arrow
def factory(path_or_paths, filesystem=None, partitioning=None,
            format=None):
    """
    Build a dataset factory for the given path(s).

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    format : str, default None
        Currently only "parquet" is supported. A falsy value falls back to
        "parquet".

    Returns
    -------
    FileSystemDatasetFactory
    """
    resolved_fs, paths_or_selector = _ensure_fs_and_paths(
        path_or_paths, filesystem)
    scheme = _ensure_partitioning(partitioning)
    file_format = _ensure_format(format or "parquet")

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()
    # A partitioning factory and a concrete partitioning are stored on
    # different option attributes; any other value leaves the options as
    # constructed.
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    return FileSystemDatasetFactory(
        resolved_fs, paths_or_selector, file_format, factory_options)
コード例 #3
0
ファイル: dataset.py プロジェクト: xuechendi/arrow-1
def factory(path_or_paths, filesystem=None, partitioning=None,
            format=None):
    """
    Build a dataset factory for one path or a union factory for several.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    format : str, default None
        Currently only "parquet" is supported. A falsy value falls back to
        "parquet".

    Returns
    -------
    FileSystemDatasetFactory
        When exactly one path is given. With several paths, a
        UnionDatasetFactory wrapping one factory per path is returned.

    Raises
    ------
    ValueError
        If an empty list or tuple of paths is given.
    """
    # Normalise a lone path into a one-element list so a single loop
    # handles both call styles.
    if isinstance(path_or_paths, (list, tuple)):
        sources = path_or_paths
    else:
        sources = [path_or_paths]

    scheme = _ensure_partitioning(partitioning)
    file_format = _ensure_format(format or "parquet")

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    # One factory per source; all of them share the same format and options
    # objects.
    factories = []
    for source in sources:
        resolved_fs, paths_or_selector = _ensure_fs_and_paths(
            source, filesystem)
        factories.append(
            FileSystemDatasetFactory(resolved_fs, paths_or_selector,
                                     file_format, factory_options))

    if not factories:
        raise ValueError("Need at least one path")
    if len(factories) == 1:
        # No need for a union wrapper around a single factory.
        return factories[0]
    return UnionDatasetFactory(factories)