コード例 #1
0
ファイル: dataset.py プロジェクト: techfoxy/arrow
def dataset(paths_or_factories,
            filesystem=None,
            partitioning=None,
            format=None,
            schema=None):
    """
    Open a dataset from one or several sources.

    Every source is first converted into a dataset factory via
    ``_ensure_factory`` (which receives ``filesystem``, ``partitioning`` and
    ``format`` as keyword arguments). A single, non-list source is finished
    directly; a list of sources is combined in a ``UnionDatasetFactory``
    before being finished.

    Parameters
    ----------
    paths_or_factories : path or list of paths or factory or list of factories
        Path to a file or to a directory containing the data files, or a list
        of paths for a multi-directory dataset. For more control, a list of
        factories created with the ``factory()`` function can be passed
        instead (the additional keywords are documented as being ignored in
        that case; the handling lives in ``_ensure_factory``).
        Only a ``list`` is treated as a collection of sources; any other
        value is treated as one single source.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    format : str
        Currently only "parquet" is supported.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source. It is forwarded to the factory's
        ``finish()`` call.

    Returns
    -------
    Dataset

    Examples
    --------
    Opening a dataset for a single directory:

    >>> dataset("path/to/nyc-taxi/", format="parquet")

    Construction from multiple factories:

    >>> dataset([
    ...     factory("s3://old-taxi-data", format="parquet"),
    ...     factory("local/path/to/new/data", format="csv")
    ... ])

    """
    # Anything that is not a list counts as exactly one source.
    is_single = not isinstance(paths_or_factories, list)
    sources = [paths_or_factories] if is_single else paths_or_factories

    built = [
        _ensure_factory(source,
                        filesystem=filesystem,
                        partitioning=partitioning,
                        format=format)
        for source in sources
    ]

    # A lone source is finished as-is; a list (even of length one) always
    # goes through a union factory.
    combined = built[0] if is_single else UnionDatasetFactory(built)
    return combined.finish(schema=schema)
コード例 #2
0
ファイル: dataset.py プロジェクト: xuechendi/arrow-1
def factory(path_or_paths, filesystem=None, partitioning=None,
            format=None):
    """
    Build a factory from which a Dataset can later be created.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list (or tuple) of paths. Any value that is not a list or tuple
        is treated as one single path.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning or PartitioningFactory or str or list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    format : str, default None
        Currently only "parquet" is supported. When omitted (or otherwise
        falsy), "parquet" is used.

    Returns
    -------
    FileSystemDatasetFactory
        When exactly one path is given. For several paths, a
        ``UnionDatasetFactory`` wrapping one ``FileSystemDatasetFactory``
        per path is returned instead.

    Raises
    ------
    ValueError
        If an empty list or tuple of paths is passed.
    """
    # Normalise a lone path into a one-element collection.
    paths = (path_or_paths
             if isinstance(path_or_paths, (list, tuple))
             else [path_or_paths])

    partitioning = _ensure_partitioning(partitioning)
    format = _ensure_format(format or "parquet")

    # TODO pass through options
    options = FileSystemFactoryOptions()
    if isinstance(partitioning, PartitioningFactory):
        options.partitioning_factory = partitioning
    elif isinstance(partitioning, Partitioning):
        options.partitioning = partitioning

    def _factory_for(path):
        # Resolve the filesystem per path; all paths share format/options.
        fs, paths_or_selector = _ensure_fs_and_paths(path, filesystem)
        return FileSystemDatasetFactory(fs, paths_or_selector,
                                        format, options)

    built = [_factory_for(path) for path in paths]

    if not built:
        raise ValueError("Need at least one path")
    if len(built) == 1:
        return built[0]
    return UnionDatasetFactory(built)