Exemple #1
0
def _ensure_filesystem(fs_or_uri):
    """
    Resolve a FileSystem instance or a filesystem URI into a FileSystem.

    Parameters
    ----------
    fs_or_uri : FileSystem or str
        Either an already constructed filesystem, or an URI string. If the
        URI has a non-empty path component, that path must point to a
        directory and is used as a prefix (the result is wrapped in a
        SubTreeFileSystem).

    Returns
    -------
    (FileSystem, bool)
        The filesystem object and a flag telling whether it is considered
        local (a LocalFileSystem or a _MockFileSystem instance, or an URI
        that resolved to a LocalFileSystem).

    Raises
    ------
    ValueError
        If the path component of the URI does not point to a directory.
    TypeError
        If the argument is neither a string nor a FileSystem instance.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, SubTreeFileSystem, FileType
    )

    # Already-instantiated filesystems are returned untouched; only the
    # "is local" flag has to be derived from their type.
    if not isinstance(fs_or_uri, str):
        if isinstance(fs_or_uri, (LocalFileSystem, _MockFileSystem)):
            return fs_or_uri, True
        if isinstance(fs_or_uri, FileSystem):
            return fs_or_uri, False
        raise TypeError(
            '`filesystem` argument must be a FileSystem instance or a valid '
            'file system URI'
        )

    # From here on we are dealing with an URI string.
    base_fs, prefix = FileSystem.from_uri(fs_or_uri)
    is_local = isinstance(base_fs, LocalFileSystem)
    prefix = _normalize_path(base_fs, prefix)

    # No path component: the bare filesystem is all that is needed.
    if not prefix:
        return base_fs, is_local

    # The path component acts as a directory prefix, so it has to be an
    # existing directory on the resolved filesystem.
    info = base_fs.get_file_info([prefix])[0]
    if info.type != FileType.Directory:
        raise ValueError(
            "The path component of the filesystem URI must point to a "
            "directory but it has a type: `{}`. The path component "
            "is `{}` and the given filesystem URI is `{}`".format(
                info.type.name, info.path, fs_or_uri
            )
        )

    return SubTreeFileSystem(prefix, base_fs), is_local
Exemple #2
0
def _ensure_fs(filesystem, path):
    """
    Validate the given filesystem or infer one from the path.

    Parameters
    ----------
    filesystem : FileSystem or None
        If None, the filesystem is inferred from ``path``.
    path : str
        A local path or an URI.

    Returns
    -------
    (FileSystem, str)
        The filesystem to use and the path within that filesystem.
    """
    from pyarrow.fs import (FileSystem, LocalFileSystem, FileType,
                            _normalize_path)

    # An explicit filesystem only needs its path normalized
    # (eg no backslashes on Windows).
    if filesystem is not None:
        return filesystem, _normalize_path(filesystem, path)

    # Otherwise probe the local filesystem first: an existing local
    # (possibly relative) path takes precedence over URI parsing.
    local_fs = LocalFileSystem()
    try:
        info = local_fs.get_file_info([path])[0]
    except OSError:
        info = None

    if info is None or info.type == FileType.NotFound:
        # Not found locally, so it may be an URI.
        try:
            return FileSystem.from_uri(path)
        except ValueError as exc:
            # ARROW-8213: an "empty scheme" failure means this is not an
            # URI at all; treat it as a local path so that the caller gets
            # a nicer error message later on. Anything else is re-raised.
            if "empty scheme" not in str(exc):
                raise

    return local_fs, _normalize_path(local_fs, path)
Exemple #3
0
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path-like
        Path pointing to a single parquet metadata file.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If None, a LocalFileSystem is used.
        If an URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat, optional
        An instance of a ParquetFileFormat if special options need to be
        passed. A default-constructed ParquetFileFormat is used when omitted.

    Returns
    -------
    FileSystemDataset

    Raises
    ------
    ValueError
        If ``format`` is given but is not a ParquetFileFormat.
    """
    from pyarrow.fs import LocalFileSystem

    # Reject foreign format objects up front, then fall back to the default.
    if format is not None and not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")
    file_format = ParquetFileFormat() if format is None else format

    if filesystem is None:
        fs = LocalFileSystem()
    else:
        # the "is local" flag returned alongside is not needed here
        fs, _ = _ensure_filesystem(filesystem)

    normalized_path = _normalize_path(fs, _stringify_path(metadata_path))

    return ParquetDatasetFactory(normalized_path, fs, file_format).finish(
        schema)
Exemple #4
0
def _ensure_fs(filesystem, path):
    """
    Validate the given filesystem or infer one from the path.

    Parameters
    ----------
    filesystem : FileSystem or None
        If None, the filesystem is inferred from ``path``: an existing local
        path yields a LocalFileSystem, otherwise ``path`` is parsed as an
        URI.
    path : str
        A local path or an URI.

    Returns
    -------
    (FileSystem, str)
        The filesystem to use and the path within that filesystem.

    Raises
    ------
    ValueError
        If ``path`` does not exist locally and URI parsing fails for a
        reason other than a missing scheme.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = (infos.type != FileType.NotFound)

        if not local_path_exists:
            # perhaps it's a URI?
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: a plain (non-existing) local path is not an
                # URI; instead of surfacing the confusing "empty scheme"
                # parsing error, assume a local path so that the caller can
                # report a proper "file not found" error later on.
                if "empty scheme" not in str(e):
                    raise

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
Exemple #5
0
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths. If omitted, the filesystem is determined from
        ``path``: local paths that exist win, otherwise ``path`` is parsed
        as an URI.

    Returns
    -------
    (FileSystem, list of str or FileSelector)
        File system object and either a single item list pointing to a file or
        a recursive FileSelector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # ``info`` caches the local lookup so that it is not repeated below; it
    # stays None whenever it has to be (re)queried from the final filesystem.
    info = None
    if filesystem is None:
        # No filesystem given: probe the local filesystem first.
        filesystem = LocalFileSystem()
        try:
            info = filesystem.get_file_info([path])[0]
        except OSError:
            info = None
        found_locally = info is not None and info.type != FileType.NotFound

        if not found_locally:
            # Nothing there locally, so assume the path is an URI that
            # describes the filesystem as well.
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as exc:
                # ARROW-8213: neither an URI nor an existing local path.
                # Keep treating it as a local path so that a clear "file
                # not found" error is raised below rather than a confusing
                # scheme parsing error.
                if "empty scheme" not in str(exc):
                    raise
            else:
                # The filesystem changed: the cached local info is stale.
                info = None

    # Turn an URI string into a filesystem object if needed.
    filesystem, _ = _ensure_filesystem(filesystem)

    # Dataset discovery expects a normalized path.
    path = _normalize_path(filesystem, path)

    if info is None:
        info = filesystem.get_file_info([path])[0]

    # A directory is traversed recursively, a file becomes a one-item list.
    if info.type == FileType.Directory:
        return filesystem, FileSelector(path, recursive=True)
    if info.type == FileType.File:
        return filesystem, [path]
    raise FileNotFoundError(path)
Exemple #6
0
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Treat a list of paths as files belonging to a single file system.

    If the file system is local, every path is additionally validated to
    reference an existing *file*. For non-local file systems no such
    validation is performed here.

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths. Defaults to the local file system.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the file system is local and a referenced path does not exist.
    IsADirectoryError
        If the file system is local and a referenced path is a directory.
    IOError
        If the file system is local and a referenced path exists but is
        neither a file nor a directory.
    """
    from pyarrow.fs import LocalFileSystem, FileType

    # The local file system is the default; URI strings are resolved into
    # filesystem objects by the helper.
    fs_arg = LocalFileSystem() if filesystem is None else filesystem
    filesystem, is_local = _ensure_filesystem(fs_arg)

    # Normalization takes care of irregular paths such as Windows local
    # paths.
    normalized = []
    for raw_path in paths:
        normalized.append(_normalize_path(filesystem, _stringify_path(raw_path)))
    paths = normalized

    # Validation only happens for local file systems.
    if not is_local:
        return filesystem, paths

    # Fail on the first offending path. A possible improvement would be to
    # group the infos by type and report all paths per error category.
    for info in filesystem.get_file_info(paths):
        kind = info.type
        if kind == FileType.File:
            continue
        if kind == FileType.NotFound:
            raise FileNotFoundError(info.path)
        if kind == FileType.Directory:
            raise IsADirectoryError(
                'Path {} points to a directory, but only file paths are '
                'supported. To construct a nested or union dataset pass '
                'a list of dataset objects instead.'.format(info.path)
            )
        raise IOError(
            'Path {} exists but its type is unknown (could be a '
            'special file such as a Unix socket or character device, '
            'or Windows NUL / CON / ...)'.format(info.path)
        )

    return filesystem, paths
Exemple #7
0
def parquet_dataset(metadata_path,
                    schema=None,
                    filesystem=None,
                    format=None,
                    partitioning=None,
                    partition_base_dir=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path-like
        Path pointing to a single parquet metadata file.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If None, a LocalFileSystem is used.
        If an URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat, optional
        An instance of a ParquetFileFormat if special options need to be
        passed. A default-constructed ParquetFileFormat is used when omitted.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.

    Returns
    -------
    FileSystemDataset

    Raises
    ------
    ValueError
        If ``format`` is given but is not a ParquetFileFormat.
    """
    from pyarrow.fs import LocalFileSystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        # `_ensure_fs` takes (filesystem, path) and cannot be called with a
        # single argument; `_ensure_filesystem` is the helper that resolves a
        # FileSystem instance or URI string into (filesystem, is_local).
        filesystem, _ = _ensure_filesystem(filesystem)

    metadata_path = _normalize_path(filesystem, _stringify_path(metadata_path))
    options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning))

    factory = ParquetDatasetFactory(metadata_path,
                                    filesystem,
                                    format,
                                    options=options)
    return factory.finish(schema)