Example 1
def source(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Open a (multi-file) data source.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning(Factory) or str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as a shortcut.
    format : str
        Currently only "parquet" is supported.

    Returns
    -------
    DataSource or DataSourceDiscovery

    """
    filesystem, paths_or_selector = _ensure_fs_and_paths(
        path_or_paths, filesystem)

    partitioning = _ensure_partitioning(partitioning)

    format = format or "parquet"
    if format == "parquet":
        format = ParquetFileFormat()
    elif not isinstance(format, FileFormat):
        raise ValueError("format '{0}' is not supported".format(format))

    # TODO pass through options
    options = FileSystemFactoryOptions()

    if isinstance(partitioning, PartitioningFactory):
        options.partitioning_factory = partitioning
    elif isinstance(partitioning, Partitioning):
        options.partitioning = partitioning

    discovery = FileSystemSourceFactory(filesystem, paths_or_selector, format,
                                        options)

    # TODO return Source if a specific schema was passed?

    # need to return SourceFactory since `dataset` might need to
    # finish the factory with a unified schema
    return discovery
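A rough usage sketch of the helper above, assuming a hypothetical directory of parquet files; the "data/" path and the "hive" partitioning flavor are illustrative assumptions, not part of the original snippet:

# Hypothetical call to the source() helper defined above; the path and the
# "hive" partitioning flavor are assumed for illustration only.
discovery = source("data/", partitioning="hive", format="parquet")
# The returned factory is later finished (e.g. by a higher-level dataset()
# helper) into a concrete source with a unified schema.
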
Example 2
def _ensure_format(obj):
    if isinstance(obj, FileFormat):
        return obj
    elif obj == "parquet":
        return ParquetFileFormat()
    elif obj in {"ipc", "arrow", "feather"}:
        return IpcFileFormat()
    elif obj == "csv":
        return CsvFileFormat()
    elif obj == "orc":
        if not _orc_available:
            raise ValueError(_orc_msg)
        return OrcFileFormat()
    else:
        raise ValueError("format '{}' is not supported".format(obj))
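A few illustrative calls against the normalizer above, assuming the format classes are importable from the same module:

# Strings are normalized to FileFormat instances, existing instances pass
# through unchanged, and unknown names raise ValueError.
assert isinstance(_ensure_format("parquet"), ParquetFileFormat)
assert isinstance(_ensure_format("feather"), IpcFileFormat)
fmt = CsvFileFormat()
assert _ensure_format(fmt) is fmt
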
Example 3
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path
        Path pointing to a single-file parquet metadata file.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If a URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See the
        examples below.
        Note that URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of ParquetFileFormat, if special options need to be
        passed.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import LocalFileSystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        filesystem, _ = _ensure_filesystem(filesystem)

    metadata_path = _normalize_path(filesystem, _stringify_path(metadata_path))

    factory = ParquetDatasetFactory(metadata_path, filesystem, format)
    return factory.finish(schema)
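A hedged usage sketch for the function above, assuming a `_metadata` summary file was written beforehand with `pyarrow.parquet.write_metadata`; the directory name is a placeholder:

# Placeholder path; assumes "dataset_dir/_metadata" was produced earlier,
# e.g. via pyarrow.parquet.write_metadata(schema, "dataset_dir/_metadata",
# metadata_collector=collected_metadata).
dataset = parquet_dataset("dataset_dir/_metadata")
table = dataset.to_table()  # materialize the whole dataset as a Table
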
Example 4
def parquet_dataset(metadata_path,
                    schema=None,
                    filesystem=None,
                    format=None,
                    partitioning=None,
                    partition_base_dir=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path
        Path pointing to a single-file parquet metadata file.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If a URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See the
        examples below.
        Note that URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of ParquetFileFormat, if special options need to be
        passed.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as a shortcut, and with a list
        of field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import LocalFileSystem, _ensure_filesystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        filesystem = _ensure_filesystem(filesystem)

    metadata_path = filesystem.normalize_path(_stringify_path(metadata_path))
    options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning))

    factory = ParquetDatasetFactory(metadata_path,
                                    filesystem,
                                    format,
                                    options=options)
    return factory.finish(schema)
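For this extended variant, a sketch of how the new partitioning arguments might be passed; the field names and the base directory are assumptions for illustration, not values from the original snippet:

# Hypothetical call using the extended signature above. "year"/"month" and
# "dataset_dir" are illustrative placeholders.
dataset = parquet_dataset(
    "dataset_dir/_metadata",
    partitioning=["year", "month"],      # a DirectoryPartitioning is inferred
    partition_base_dir="dataset_dir")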