Exemple #1
def _ensure_filesystem(fs_or_uri):
    """
    Coerce a FileSystem instance or a filesystem URI into a FileSystem.

    Parameters
    ----------
    fs_or_uri : FileSystem or str
        If a string is passed it is parsed with ``FileSystem.from_uri``. A
        non-empty path component of the URI must point to a directory and is
        used as the root of a ``SubTreeFileSystem`` wrapping the parsed
        filesystem.

    Returns
    -------
    (FileSystem, bool)
        The filesystem object and a flag telling whether it is treated as a
        local filesystem (``LocalFileSystem`` or ``_MockFileSystem``).

    Raises
    ------
    ValueError
        If the path component of the URI does not point to a directory.
    TypeError
        If the argument is neither a string nor a FileSystem instance.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, SubTreeFileSystem, FileType
    )

    if isinstance(fs_or_uri, str):
        # instantiate the file system from an uri, if the uri has a path
        # component then it will be treated as a path prefix
        filesystem, prefix = FileSystem.from_uri(fs_or_uri)
        # record locality before the filesystem is possibly wrapped below
        is_local = isinstance(filesystem, LocalFileSystem)
        prefix = _normalize_path(filesystem, prefix)
        if prefix:
            # validate that the prefix is pointing to a directory
            prefix_info = filesystem.get_file_info([prefix])[0]
            if prefix_info.type != FileType.Directory:
                raise ValueError(
                    "The path component of the filesystem URI must point to a "
                    "directory but it has a type: `{}`. The path component "
                    "is `{}` and the given filesystem URI is `{}`".format(
                        prefix_info.type.name, prefix_info.path, fs_or_uri
                    )
                )
            filesystem = SubTreeFileSystem(prefix, filesystem)
        return filesystem, is_local
    elif isinstance(fs_or_uri, (LocalFileSystem, _MockFileSystem)):
        return fs_or_uri, True
    elif isinstance(fs_or_uri, FileSystem):
        return fs_or_uri, False
    else:
        raise TypeError(
            '`filesystem` argument must be a FileSystem instance or a valid '
            'file system URI'
        )
Exemple #2
def _ensure_fs(filesystem, path):
    """
    Validate or infer the filesystem from the path.

    Parameters
    ----------
    filesystem : FileSystem or None
        If None, the path is first looked up on the local filesystem and,
        when it does not exist there, parsed as a URI.
    path : str
        Local path or URI.

    Returns
    -------
    (FileSystem, str)
        The filesystem and the path. When the path was successfully parsed
        as a URI, the result of ``FileSystem.from_uri`` is returned as is;
        otherwise the path is passed through ``_normalize_path``.

    Raises
    ------
    ValueError
        If parsing the path as a URI fails for a reason other than an
        "empty scheme".
    """
    from pyarrow.fs import (FileSystem, LocalFileSystem, FileType,
                            _normalize_path)

    if filesystem is None:
        # First check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = (infos.type != FileType.NotFound)

        if not local_path_exists:
            # Perhaps it's a URI?
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                if "empty scheme" not in str(e):
                    raise
                # ARROW-8213: not a URI, assume local path
                # to get a nice error message.

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
Exemple #3
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None):
    """
    Create a FileSystemDataset from a Parquet `_metadata` file.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If None, the local filesystem is used.
        If an URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed. Defaults to ``ParquetFileFormat()``.

    Returns
    -------
    FileSystemDataset

    Raises
    ------
    ValueError
        If `format` is given but is not a ParquetFileFormat.
    """
    from pyarrow.fs import LocalFileSystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        # accept both FileSystem instances and URI strings
        filesystem, _ = _ensure_filesystem(filesystem)

    metadata_path = _normalize_path(filesystem, _stringify_path(metadata_path))

    factory = ParquetDatasetFactory(metadata_path, filesystem, format)
    return factory.finish(schema)
Exemple #4
def _ensure_fs(filesystem, path):
    """
    Validate or infer the filesystem from the path.

    Parameters
    ----------
    filesystem : FileSystem or None
        If None, the path is first looked up on the local filesystem and,
        when the lookup fails or the path is not found, parsed as a URI.
    path : str
        Local path or URI.

    Returns
    -------
    (FileSystem, str)
        The filesystem and the path. When the path is parsed as a URI the
        result of ``FileSystem.from_uri`` is returned as is; otherwise the
        path is passed through ``_normalize_path``.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            # the local lookup itself failed, fall back to URI parsing
            return FileSystem.from_uri(path)

        if infos.type == FileType.NotFound:
            return FileSystem.from_uri(path)

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
Exemple #5
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file or
        an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as an URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exists locally, then assume that
        # the path is an URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither an URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_filesystem(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = _normalize_path(filesystem, path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
Exemple #6
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Treat a list of paths as files belonging to a single file system

    If the file system is local then also validates that all paths
    are referencing existing *files* otherwise any non-file paths will be
    silently skipped (for example on a remote filesystem).

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the file system is local and a referenced path does not exist.
    IsADirectoryError
        If the file system is local and a referenced path is a directory.
    IOError
        If the file system is local and a referenced path exists but is
        neither a file nor a directory.
    """
    from pyarrow.fs import LocalFileSystem, FileType

    if filesystem is None:
        # fall back to local file system as the default
        filesystem = LocalFileSystem()

    # construct a filesystem if it is a valid URI
    filesystem, is_local = _ensure_filesystem(filesystem)

    # allow normalizing irregular paths such as Windows local paths
    paths = [_normalize_path(filesystem, _stringify_path(p)) for p in paths]

    # validate that all of the paths are pointing to existing *files*
    # possible improvement is to group the file_infos by type and raise for
    # multiple paths per error category
    if is_local:
        for info in filesystem.get_file_info(paths):
            file_type = info.type
            if file_type == FileType.File:
                continue
            elif file_type == FileType.NotFound:
                raise FileNotFoundError(info.path)
            elif file_type == FileType.Directory:
                raise IsADirectoryError(
                    'Path {} points to a directory, but only file paths are '
                    'supported. To construct a nested or union dataset pass '
                    'a list of dataset objects instead.'.format(info.path)
                )
            else:
                raise IOError(
                    'Path {} exists but its type is unknown (could be a '
                    'special file such as a Unix socket or character device, '
                    'or Windows NUL / CON / ...)'.format(info.path)
                )

    return filesystem, paths
Exemple #7
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
                    partitioning=None, partition_base_dir=None):
    """
    Create a FileSystemDataset from a Parquet `_metadata` file.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If None, the local filesystem is used.
        If an URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed. Defaults to ``ParquetFileFormat()``.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.

    Returns
    -------
    FileSystemDataset

    Raises
    ------
    ValueError
        If `format` is given but is not a ParquetFileFormat.
    """
    from pyarrow.fs import LocalFileSystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        # _ensure_fs requires a path argument; the single-argument coercion
        # of a FileSystem instance or URI string is _ensure_filesystem
        filesystem, _ = _ensure_filesystem(filesystem)

    metadata_path = _normalize_path(filesystem, _stringify_path(metadata_path))
    # NOTE(review): the option arguments were truncated in the original
    # listing; this assumes a module-level `_ensure_partitioning` helper that
    # converts the accepted partitioning shortcuts -- TODO confirm
    options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning)
    )

    factory = ParquetDatasetFactory(
        metadata_path, filesystem, format, options=options)
    return factory.finish(schema)