Exemple #1
def _ensure_fs_and_paths(path_or_paths, filesystem=None):
    """Validate and convert the path-likes and filesystem.

    Parameters
    ----------
    path_or_paths : path-like or list of path-likes
        A single path, or a list of paths. Each entry is passed through
        ``_stringify_path``.
    filesystem : FileSystem, optional
        When None, the filesystem is inferred with ``FileSystem.from_uri``
        from the single path, or from the first path of the list.

    Returns
    -------
    (filesystem, paths_or_selector)
        ``paths_or_selector`` is a list of string paths (list input or a
        single file) or a recursive ``FileSelector`` (single directory).

    Raises
    ------
    FileNotFoundError
        If a single path is neither a file nor a directory.
    """
    from pyarrow.fs import FileSystem, FileType, FileSelector

    if isinstance(path_or_paths, list):
        paths_or_selector = [_stringify_path(path) for path in path_or_paths]
        if filesystem is None:
            # infer from first path
            filesystem, _ = FileSystem.from_uri(paths_or_selector[0])
    else:
        # Single path: without this branch the list itself was stringified
        # and non-list input never defined `paths_or_selector`.
        path = _stringify_path(path_or_paths)
        if filesystem is None:
            filesystem, path = FileSystem.from_uri(path)

        stats = filesystem.get_target_stats([path])[0]
        if stats.type == FileType.Directory:
            # for directory, pass a selector
            paths_or_selector = FileSelector(path, recursive=True)
        elif stats.type == FileType.File:
            # for a single file path, pass it as a list
            paths_or_selector = [path]
        else:
            # neither a file nor a directory
            raise FileNotFoundError(path)

    return filesystem, paths_or_selector
Exemple #2
def _get_filesystem_path(path, filesystem=None, storage_options=None):
    """
    Get the filesystem and path for a given filesystem and path.

    If the filesystem is not None then it's just returned as is.

    Parameters
    ----------
    path : object
        Location to resolve. Only ``str`` values are tried against the
        native pyarrow filesystem.
    filesystem : object, optional
        Already constructed filesystem; returned unchanged when given.
    storage_options : dict, optional
        Keyword arguments forwarded to ``fsspec.core.url_to_fs``.

    Returns
    -------
    (filesystem, path)

    Raises
    ------
    ValueError
        If ``storage_options`` is given but no filesystem could be
        resolved for ``path``.
    """
    import pyarrow

    if (isinstance(path, str) and storage_options is None
            and filesystem is None
            and Version(pyarrow.__version__) >= Version("5.0.0")):
        # Use the native pyarrow filesystem if possible.
        try:
            from pyarrow.fs import FileSystem

            filesystem, path = FileSystem.from_uri(path)
        except Exception:
            # fallback to use get_handle / fsspec for filesystems
            # that pyarrow doesn't support
            pass

    if _is_fsspec_url(path) and filesystem is None:
        fsspec = import_optional_dependency(
            "fsspec", extra="fsspec is requred for 'storage_options'.")
        filesystem, path = fsspec.core.url_to_fs(path, **(storage_options
                                                          or {}))

    if filesystem is None and storage_options:
        raise ValueError(
            "Cannot provide 'storage_options' with non-fsspec path '{}'".
            format(path))

    return filesystem, path
Exemple #3
 def __iter__(self):
     """Iterate over rows of the parquet files listed in ``self.files``.

     Each file is opened through ``FileSystem.from_uri`` and read one row
     group at a time; rows pass through a ``RandomShuffler`` whose capacity
     is ``self.shuffler_capacity`` when ``self.shuffle`` is set and 1
     otherwise, and are yielded via ``self._convert``.

     NOTE(review): this method does not parse as written. Several lines
     and call arguments appear to be missing (flagged inline below) and
     must be restored from the upstream source.
     """
     shuffler = RandomShuffler(
         self.shuffler_capacity if self.shuffle else 1, self.seed)
     # Running count of row groups seen across all files.
     group_count = 0
     for filepath in self.files:
         fs, path = FileSystem.from_uri(filepath)
         with fs.open_input_file(path) as fobj:
             parquet = pg.ParquetFile(fobj)
             for group_idx in range(parquet.num_row_groups):
                 # A simple form of row-group level bucketing without memory overhead.
                 # Pros:
                 #  - It requires zero communication to initialize the distributed policy
                 #  - It uses little memory and no startup overhead, i.e. collecting row groups.
                 # Cons:
                 #   The drawback would be if the world size is much larger than
                 #   the average number of row groups. As a result, many of the
                 #   file open operations would be wasted.
                 group_count += 1
                 # NOTE(review): the body of this `if` is missing; presumably
                 # it skips row groups not assigned to this rank - confirm.
                 if group_count % self.world_size != self.rank:
                 # NOTE(review): the remaining arguments and closing paren of
                 # the `read_row_group` call below are missing.
                 row_group = parquet.read_row_group(group_idx,
                 for batch in row_group.to_batches():  # type: RecordBatch
                     # TODO: read batches not using pandas
                     for _, row in batch.to_pandas().iterrows():
                         # Maintain the shuffler buffer around its capacity.
                         # NOTE(review): no statement adds `row` to the
                         # shuffler, and both `self._convert(` calls below
                         # are unterminated - lines appear to be missing.
                         while shuffler.full():
                             yield self._convert(shuffler.pop().to_dict(),
     while shuffler:
         yield self._convert(shuffler.pop().to_dict(),
Exemple #4
def _ensure_fs(fs_or_uri):
    """Resolve a filesystem object or filesystem URI.

    Parameters
    ----------
    fs_or_uri : FileSystem or str
        A filesystem instance, or a URI string. When the URI has a path
        component it is used as a prefix: the returned filesystem is a
        ``SubTreeFileSystem`` rooted at that path.

    Returns
    -------
    (filesystem, is_local) : tuple
        ``is_local`` is True when the underlying filesystem is a
        ``LocalFileSystem`` (or a ``_MockFileSystem`` for non-URI input).

    Raises
    ------
    ValueError
        If the path component of the URI does not point to a directory.
    TypeError
        If ``fs_or_uri`` is neither a URI string nor convertible by
        ``_ensure_filesystem``.
    """
    from pyarrow.fs import (FileSystem, LocalFileSystem, SubTreeFileSystem,
                            FileType, _ensure_filesystem)

    if isinstance(fs_or_uri, str):
        # instantiate the file system from an uri, if the uri has a path
        # component then it will be treated as a path prefix
        filesystem, prefix = FileSystem.from_uri(fs_or_uri)
        # Record locality before the filesystem is possibly wrapped below.
        is_local = isinstance(filesystem, LocalFileSystem)
        prefix = filesystem.normalize_path(prefix)
        if prefix:
            # validate that the prefix is pointing to a directory
            prefix_info = filesystem.get_file_info([prefix])[0]
            if prefix_info.type != FileType.Directory:
                raise ValueError(
                    "The path component of the filesystem URI must point to a "
                    "directory but it has a type: `{}`. The path component "
                    "is `{}` and the given filesystem URI is `{}`".format(
                        prefix_info.type.name, prefix_info.path, fs_or_uri))
            filesystem = SubTreeFileSystem(prefix, filesystem)
        return filesystem, is_local

    try:
        filesystem = _ensure_filesystem(fs_or_uri)
    except TypeError:
        raise TypeError(
            '`filesystem` argument must be a FileSystem instance or a valid '
            'file system URI')
    if isinstance(filesystem, (LocalFileSystem, _MockFileSystem)):
        return filesystem, True
    else:
        return filesystem, False
Exemple #5
def _ensure_fs(filesystem, path):
    """Validate or infer the filesystem from the path.

    When ``filesystem`` is None the path is first looked up on the local
    filesystem; if it is not found there it is parsed as a URI. A path
    that is neither found locally nor a URI with a scheme is kept as a
    local path so that a later access produces a clearer error.

    Returns
    -------
    (filesystem, path)
        For a given or local filesystem, ``path`` is normalized with
        ``_normalize_path``; for a URI the result of
        ``FileSystem.from_uri`` is returned directly.
    """
    from pyarrow.fs import (FileSystem, LocalFileSystem, FileType,
                            _normalize_path)

    if filesystem is None:
        # First check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = (infos.type != FileType.NotFound)

        if not local_path_exists:
            # Perhaps it's a URI?
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                if "empty scheme" not in str(e):
                    raise
                # ARROW-8213: not a URI, assume local path
                # to get a nice error message.

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
Exemple #6
def test_open_dataset_from_uri_s3(s3_connection, s3_server):
    """Open a parquet dataset stored on S3 via a URI and via a filesystem.

    Writes a small table to ``mybucket/data.parquet`` and checks that the
    dataset reads back identically both from the full URI string and from
    the (filesystem, path) pair returned by ``FileSystem.from_uri``.
    """
    # open dataset from non-localfs string path
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    # NOTE(review): the template literal was missing from this block. It is
    # rebuilt to point at the object written below ("mybucket/data.parquet")
    # using the four values formatted in - confirm against upstream.
    uri = (
        "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}"
        .format(access_key, secret_key, host, port)
    )

    fs, path = FileSystem.from_uri(uri)

    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream("mybucket/data.parquet") as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing filesystem object
    dataset = ds.dataset(path, format="parquet", filesystem=fs)
    assert dataset.to_table().equals(table)
Exemple #7
def _ensure_fs(filesystem, path):
    """Return ``(filesystem, path)``, inferring the filesystem if needed.

    A supplied filesystem is returned together with the unchanged path;
    otherwise both are obtained from ``FileSystem.from_uri(path)``.
    """
    from pyarrow.fs import FileSystem

    if filesystem is None:
        # No filesystem given: let pyarrow parse the path as a URI.
        return FileSystem.from_uri(path)
    return filesystem, path
Exemple #8
def _ensure_fs(filesystem, path):
    """Validate or infer the filesystem from the path.

    When ``filesystem`` is None the path is looked up on the local
    filesystem first. If that lookup raises ``OSError`` or reports the
    path as not found, the path is parsed as a URI and the result of
    ``FileSystem.from_uri`` is returned directly.

    Returns
    -------
    (filesystem, path)
        For a given or local filesystem, ``path`` is normalized with
        ``_normalize_path``.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            return FileSystem.from_uri(path)

        if infos.type == FileType.NotFound:
            return FileSystem.from_uri(path)

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
Exemple #9
def test_s3_real_aws_region_selection():
    """Check the region reported by filesystems built from S3 URIs.

    NOTE(review): this test does not parse as written. The arguments of
    two ``FileSystem.from_uri(`` calls and the body of the
    ``pytest.raises`` block are missing (flagged inline below) and must be
    restored from the upstream source.
    """
    # Taken from a registry of open S3-hosted datasets
    # at https://github.com/awslabs/open-data-registry
    fs, path = FileSystem.from_uri('s3://mf-nwp-models/README.txt')
    assert fs.region == 'eu-west-1'
    with fs.open_input_stream(path) as f:
        assert b"Meteo-France Atmospheric models on AWS" in f.read(50)

    # Passing an explicit region disables auto-selection
    # NOTE(review): the URI argument of the call below is missing;
    # presumably it carries an explicit `region=us-east-2` - confirm.
    fs, path = FileSystem.from_uri(
    assert fs.region == 'us-east-2'
    # Reading from the wrong region may still work for public buckets...

    # Non-existent bucket (hopefully, otherwise need to fix this test)
    # NOTE(review): the body of the `with` block and the URI argument of
    # the following call are missing.
    with pytest.raises(IOError, match="Bucket '.*' not found"):
    fs, path = FileSystem.from_uri(
    assert fs.region == 'us-east-3'
Exemple #10
def _ensure_fs(filesystem, path):
    """Validate or infer the filesystem from the path.

    A supplied filesystem is returned unchanged. Otherwise the filesystem
    is inferred with ``FileSystem.from_uri(path)``, falling back to a
    ``LocalFileSystem`` when that raises.

    Returns
    -------
    filesystem
        Only the filesystem is returned; the path is not modified.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem

    if filesystem is None:
        try:
            filesystem, _ = FileSystem.from_uri(path)
        except Exception:
            # when path is not found, we fall back to local file system
            filesystem = LocalFileSystem()
    return filesystem
Exemple #11
def s3_example_fs(s3_connection, s3_server):
    """Yield ``(fs, uri, path)`` for a filesystem built from an S3 URI.

    The connection details are unpacked from ``s3_connection`` and the
    filesystem and path come from ``FileSystem.from_uri(uri)``.
    """
    from pyarrow.fs import FileSystem

    host, port, access_key, secret_key = s3_connection
    # NOTE(review): the URI template string appears to be missing here. As
    # written this calls the builtin `format` with four arguments, which
    # raises TypeError - restore the template literal from upstream.
    uri = (
        format(access_key, secret_key, host, port))
    fs, path = FileSystem.from_uri(uri)


    yield fs, uri, path
Exemple #12
    def resolve(self, uri: str) -> Iterable[str]:
        """Resolve dataset via a filesystem URI.

        The URI is normalized with ``normalize_uri`` and listed
        recursively; a missing base directory is tolerated
        (``allow_not_found=True``).

        Returns
        -------
        list of str
            One ``"<scheme>://<path>"`` entry for every listed file whose
            path ends with ``".parquet"``. The scheme defaults to
            ``"file"`` when the URI has none.
        """
        uri = normalize_uri(uri)
        parsed = urlparse(uri)

        fs, base_dir = FileSystem.from_uri(uri)
        # base_dir = parsed.netloc + parsed.path
        selector = FileSelector(base_dir, allow_not_found=True, recursive=True)
        scheme = parsed.scheme if parsed.scheme else "file"
        return [
            scheme + "://" + finfo.path for finfo in fs.get_file_info(selector)
            if finfo.path.endswith(".parquet")
        ]
Exemple #13
def test_filesystem_from_uri_s3(minio_server):
    """Build an S3 filesystem from a URI and stat the path it resolves to.

    The URI embeds the credentials and the URL-quoted server address
    provided by the ``minio_server`` fixture.
    """
    from pyarrow.fs import S3FileSystem

    address, access_key, secret_key = minio_server
    template = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}"
    quoted_address = urllib.parse.quote(address)
    uri = template.format(access_key, secret_key, quoted_address)

    fs, path = FileSystem.from_uri(uri)
    assert isinstance(fs, S3FileSystem)
    assert path == "mybucket/foo/bar"

    # Exactly one stats entry is expected for the single queried path.
    [st] = fs.get_target_stats([path])
    assert st.path == path
    assert st.type == FileType.Directory
Exemple #14
def test_filesystem_from_uri_s3(s3_connection, s3_server):
    """Build an S3 filesystem from a URI and query the path it resolves to.

    The URI embeds the credentials, host and port unpacked from the
    ``s3_connection`` fixture.
    """
    from pyarrow.fs import S3FileSystem

    host, port, access_key, secret_key = s3_connection

    template = (
        "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}"
    )
    uri = template.format(access_key, secret_key, host, port)

    fs, path = FileSystem.from_uri(uri)
    assert isinstance(fs, S3FileSystem)
    assert path == "mybucket/foo/bar"

    # Exactly one info entry is expected for the single queried path.
    [info] = fs.get_file_info([path])
    assert info.path == path
    assert info.type == FileType.Directory
Exemple #15
    def get_schema(self, uri: str):
        """Load the schema stored in the metadata of a parquet dataset.

        The dataset directory is listed recursively and the metadata of
        the first file whose path ends with ``".parquet"`` is read. The
        value stored under ``self.SPARK_PARQUET_ROW_METADATA`` in the
        key-value metadata is decoded from JSON and returned.

        Raises
        ------
        ValueError
            If no parquet file is found under ``uri``, or if the metadata
            does not contain the ``SPARK_PARQUET_ROW_METADATA`` key.
        """
        fs, base_dir = FileSystem.from_uri(normalize_uri(uri))
        selector = FileSelector(base_dir, allow_not_found=True, recursive=True)

        first_parquet = None
        for finfo in fs.get_file_info(selector):
            if finfo.path.endswith(".parquet"):
                first_parquet = finfo.path
                # Only one file is read below, so stop at the first match.
                break
        if first_parquet is None:
            raise ValueError(f"No parquet file found in dataset {uri}")

        # Close the file handle once the footer metadata has been read.
        with fs.open_input_file(first_parquet) as metadata_file:
            metadata = pq.read_metadata(metadata_file)

        kv_metadata = metadata.metadata
        try:
            return json.loads(kv_metadata[self.SPARK_PARQUET_ROW_METADATA])
        except KeyError as exp:
            raise ValueError(
                f"Parquet dataset {uri} is not created via Spark") from exp
Exemple #16
    def __init__(self,
                 filesystem: FileSystem,
                 path: str,
                 mode: str = "rb",
                 encoding: Optional[str] = "utf-8"):
        """ HDFSFile construct

        Args:
            filesystem : FileSystem instance
            path : Path to file
            mode : read or write mode.
                    Supported: "r", "rb" (default), "w", "wb".
            encoding : text encoding; stored as None when ``mode``
                    contains "b".
        """
        self.filesystem = filesystem
        self.path = path
        self.mode = mode
        self.encoding = None if "b" in mode else encoding
        # The underlying file is always opened in binary mode: "r" and "w"
        # map to "rb" and "wb", any other mode is passed through as is.
        self._file = filesystem.open(self.path,
                                     {
                                         "r": "rb",
                                         "w": "wb"
                                     }.get(mode, mode))
Exemple #17
def test_filesystem_from_uri(uri, expected_klass, expected_path):
    """Check the filesystem class and path produced by ``from_uri``."""
    resolved = FileSystem.from_uri(uri)
    filesystem, resolved_path = resolved
    assert isinstance(filesystem, expected_klass)
    assert resolved_path == expected_path
Exemple #18
def test_cannot_instantiate_base_filesystem():
    """The base ``FileSystem`` class must reject direct instantiation."""
    with pytest.raises(TypeError):
        # The body of this block was missing; instantiating the base class
        # is what the test name describes.
        FileSystem()
Exemple #19
def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server):
    """Open an S3 dataset when ``filesystem`` is given as a URI string.

    Writes a table to ``theirbucket/nested/folder/data.parquet`` and reads
    it back through several (URI prefix, relative path) splits, then
    checks the errors raised for a missing bucket name, a prefix that
    does not exist, and a prefix that points to a file.
    """
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    bucket = 'theirbucket'
    path = 'nested/folder/data.parquet'
    uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format(
        access_key, secret_key, bucket, path, host, port
    )

    fs, path = FileSystem.from_uri(uri)
    assert path == 'theirbucket/nested/folder/data.parquet'

    # NOTE(review): a statement may be missing here (the original had two
    # blank lines); presumably the bucket is created before writing - confirm.
    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream(path) as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing filesystem as an uri
    # NOTE(review): the template literal was missing from this block. It is
    # rebuilt from the URI pattern above, with `{{}}` left as the single
    # placeholder that `template.format(prefix)` fills in below - confirm.
    template = (
        "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format(
            access_key, secret_key, host, port
        )
    )
    cases = [
        ('theirbucket/nested/folder/', '/data.parquet'),
        ('theirbucket/nested/folder', 'data.parquet'),
        ('theirbucket/nested/', 'folder/data.parquet'),
        ('theirbucket/nested', 'folder/data.parquet'),
        ('theirbucket', '/nested/folder/data.parquet'),
        ('theirbucket', 'nested/folder/data.parquet'),
    ]
    for prefix, path in cases:
        uri = template.format(prefix)
        dataset = ds.dataset(path, filesystem=uri, format="parquet")
        assert dataset.to_table().equals(table)

    with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'):
        uri = template.format('/')
        ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri)

    error = (
        "The path component of the filesystem URI must point to a directory "
        "but it has a type: `{}`. The path component is `{}` and the given "
        "filesystem URI is `{}`"
    )

    path = 'theirbucket/doesnt/exist'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('NotFound', path, uri)

    path = 'theirbucket/nested/folder/data.parquet'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('File', path, uri)
Exemple #20
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file or
        an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as an URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exists locally, then assume that
        # the path is an URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither an URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
Exemple #21
 def __init__(self):
     """Create the filesystem client from the ``Envs.HDFS_SERVER`` URI.

     Only the filesystem returned by ``FileSystem.from_uri`` is kept; the
     path component of the URI is discarded.
     """
     client, _unused_path = FileSystem.from_uri(Envs.HDFS_SERVER)
     self._client = client
Exemple #22
def test_filesystem_from_path_object(path):
    """``from_uri`` accepts a ``pathlib.Path`` and resolves it locally.

    The returned path must equal the resolved, absolute, POSIX-style form
    of the input and the filesystem must be a ``LocalFileSystem``.
    """
    path_obj = pathlib.Path(path)
    filesystem, resolved = FileSystem.from_uri(path_obj)
    assert isinstance(filesystem, LocalFileSystem)
    expected = path_obj.resolve().absolute().as_posix()
    assert resolved == expected