Example #1
def _ensure_fs_and_paths(path_or_paths, filesystem=None):
    # Validate and convert the path-likes and filesystem.
    # Returns filesystem and list of string paths or FileSelector
    from pyarrow.fs import FileSystem, FileType, FileSelector

    if isinstance(path_or_paths, list):
        paths_or_selector = [_stringify_path(path) for path in path_or_paths]
        if filesystem is None:
            # infer from first path
            filesystem, _ = FileSystem.from_uri(paths_or_selector[0])
    else:
        path = _stringify_path(path_or_paths)
        if filesystem is None:
            filesystem, path = FileSystem.from_uri(path)

        stats = filesystem.get_target_stats([path])[0]
        if stats.type == FileType.Directory:
            # for directory, pass a selector
            paths_or_selector = FileSelector(path, recursive=True)
        elif stats.type == FileType.File:
            # for a single file path, pass it as a list
            paths_or_selector = [path]
        else:
            raise FileNotFoundError(path)

    return filesystem, paths_or_selector
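The helper above leans on FileSystem.from_uri to infer a filesystem from the first path. A minimal sketch of that call, assuming pyarrow is installed and using a hypothetical local URI:

from pyarrow.fs import FileSystem, LocalFileSystem

# from_uri splits a URI into a filesystem instance and the path inside it
fs, path = FileSystem.from_uri("file:///tmp/dataset")
assert isinstance(fs, LocalFileSystem)
assert path == "/tmp/dataset"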
Example #2
def _get_filesystem_path(path, filesystem=None, storage_options=None):
    """
    Get the filesystem and path for a given filesystem and path.

    If the filesystem is not None then it's just returned as is.
    """
    import pyarrow

    if (isinstance(path, str) and storage_options is None
            and filesystem is None
            and Version(pyarrow.__version__) >= Version("5.0.0")):
        # Use the native pyarrow filesystem if possible.
        try:
            from pyarrow.fs import FileSystem

            filesystem, path = FileSystem.from_uri(path)
        except Exception:
            # fallback to use get_handle / fsspec for filesystems
            # that pyarrow doesn't support
            pass

    if _is_fsspec_url(path) and filesystem is None:
        fsspec = import_optional_dependency(
            "fsspec", extra="fsspec is requred for 'storage_options'.")
        filesystem, path = fsspec.core.url_to_fs(path, **(storage_options
                                                          or {}))

    if filesystem is None and storage_options:
        raise ValueError(
            "Cannot provide 'storage_options' with non-fsspec path '{}'".
            format(path))

    return filesystem, path
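For the fsspec fallback branch above, fsspec.core.url_to_fs performs the analogous split of a URL into a filesystem and a path. A hedged sketch, assuming fsspec is installed and using its built-in in-memory filesystem as a stand-in for a remote store:

import fsspec

# url_to_fs returns an fsspec filesystem plus the path with the scheme stripped
fs, path = fsspec.core.url_to_fs("memory://bucket/data.parquet")
print(type(fs).__name__, path)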
Example #3
def __iter__(self):
    shuffler = RandomShuffler(
        self.shuffler_capacity if self.shuffle else 1, self.seed)
    group_count = 0
    for filepath in self.files:
        fs, path = FileSystem.from_uri(filepath)
        with fs.open_input_file(path) as fobj:
            parquet = pg.ParquetFile(fobj)
            for group_idx in range(parquet.num_row_groups):
                # A simple form of row-group level bucketing without memory
                # overhead.
                # Pros:
                #  - It requires zero communication to initialize the
                #    distributed policy.
                #  - It uses little memory and has no startup overhead, i.e.
                #    no need to collect row groups up front.
                # Cons:
                #  - If the world size is much larger than the average number
                #    of row groups, many of the file open operations are
                #    wasted.
                group_count += 1
                if group_count % self.world_size != self.rank:
                    continue
                row_group = parquet.read_row_group(group_idx,
                                                   columns=self.columns)
                for batch in row_group.to_batches():  # type: RecordBatch
                    # TODO: read batches not using pandas
                    for _, row in batch.to_pandas().iterrows():
                        shuffler.append(row)
                        # Maintain the shuffler buffer around its capacity.
                        while shuffler.full():
                            yield self._convert(shuffler.pop().to_dict(),
                                                self.spark_row_metadata)
    while shuffler:
        yield self._convert(shuffler.pop().to_dict(),
                            self.spark_row_metadata)
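A toy illustration of the round-robin assignment performed by the group_count % self.world_size != self.rank check above; the world_size and rank values here are arbitrary:

world_size, rank = 3, 1

# group_count is incremented before the modulo test, so numbering is 1-based;
# this rank keeps every group whose running count is congruent to its rank
kept = [count for count in range(1, 11) if count % world_size == rank]
print(kept)  # [1, 4, 7, 10]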
Example #4
def _ensure_fs(fs_or_uri):
    from pyarrow.fs import (FileSystem, LocalFileSystem, SubTreeFileSystem,
                            FileType, _ensure_filesystem)

    if isinstance(fs_or_uri, str):
        # instantiate the file system from an uri, if the uri has a path
        # component then it will be treated as a path prefix
        filesystem, prefix = FileSystem.from_uri(fs_or_uri)
        is_local = isinstance(filesystem, LocalFileSystem)
        prefix = filesystem.normalize_path(prefix)
        if prefix:
            # validate that the prefix is pointing to a directory
            prefix_info = filesystem.get_file_info([prefix])[0]
            if prefix_info.type != FileType.Directory:
                raise ValueError(
                    "The path component of the filesystem URI must point to a "
                    "directory but it has a type: `{}`. The path component "
                    "is `{}` and the given filesystem URI is `{}`".format(
                        prefix_info.type.name, prefix_info.path, fs_or_uri))
            filesystem = SubTreeFileSystem(prefix, filesystem)
        return filesystem, is_local

    try:
        filesystem = _ensure_filesystem(fs_or_uri)
    except TypeError:
        raise TypeError(
            '`filesystem` argument must be a FileSystem instance or a valid '
            'file system URI')
    if isinstance(filesystem, (LocalFileSystem, _MockFileSystem)):
        return filesystem, True
    else:
        return filesystem, False
Example #5
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import (FileSystem, LocalFileSystem, FileType,
                            _normalize_path)

    if filesystem is None:
        # First check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = (infos.type != FileType.NotFound)

        if not local_path_exists:
            # Perhaps it's a URI?
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                if "empty scheme" not in str(e):
                    raise
                # ARROW-8213: not a URI, assume local path
                # to get a nice error message.

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
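The ValueError branch above (ARROW-8213) exists because a bare relative path is not a valid URI. A sketch of the case it special-cases, assuming the raised message contains "empty scheme" as the handler's check implies:

from pyarrow.fs import FileSystem

try:
    FileSystem.from_uri("relative/path/without/scheme")
except ValueError as exc:
    # the handler above only swallows this specific parse error
    assert "empty scheme" in str(exc)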
Example #6
def test_open_dataset_from_uri_s3(s3_connection, s3_server):
    # open dataset from non-localfs string path
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    uri = (
        "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}"
        .format(access_key, secret_key, host, port)
    )

    fs, path = FileSystem.from_uri(uri)

    fs.create_dir("mybucket")
    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream("mybucket/data.parquet") as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing filesystem object
    dataset = ds.dataset(path, format="parquet", filesystem=fs)
    assert dataset.to_table().equals(table)
Example #7
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import FileSystem

    if filesystem is not None:
        return filesystem, path
    return FileSystem.from_uri(path)
Example #8
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            return FileSystem.from_uri(path)

        if infos.type == FileType.NotFound:
            return FileSystem.from_uri(path)

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
Example #9
def test_s3_real_aws_region_selection():
    # Taken from a registry of open S3-hosted datasets
    # at https://github.com/awslabs/open-data-registry
    fs, path = FileSystem.from_uri('s3://mf-nwp-models/README.txt')
    assert fs.region == 'eu-west-1'
    with fs.open_input_stream(path) as f:
        assert b"Meteo-France Atmospheric models on AWS" in f.read(50)

    # Passing an explicit region disables auto-selection
    fs, path = FileSystem.from_uri(
        's3://mf-nwp-models/README.txt?region=us-east-2')
    assert fs.region == 'us-east-2'
    # Reading from the wrong region may still work for public buckets...

    # Non-existent bucket (hopefully, otherwise need to fix this test)
    with pytest.raises(IOError, match="Bucket '.*' not found"):
        FileSystem.from_uri('s3://x-arrow-non-existent-bucket')
    fs, path = FileSystem.from_uri(
        's3://x-arrow-non-existent-bucket?region=us-east-3')
    assert fs.region == 'us-east-3'
Example #10
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import FileSystem, LocalFileSystem

    if filesystem is None:
        try:
            filesystem, _ = FileSystem.from_uri(path)
        except Exception:
            # when path is not found, we fall back to local file system
            filesystem = LocalFileSystem()
    return filesystem
Example #11
def s3_example_fs(s3_connection, s3_server):
    from pyarrow.fs import FileSystem

    host, port, access_key, secret_key = s3_connection
    uri = (
        "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}".
        format(access_key, secret_key, host, port))
    fs, path = FileSystem.from_uri(uri)

    fs.create_dir("mybucket")

    yield fs, uri, path
Example #12
    def resolve(self, uri: str) -> Iterable[str]:
        """Resolve dataset via a filesystem URI."""
        uri = normalize_uri(uri)
        parsed = urlparse(uri)

        fs, base_dir = FileSystem.from_uri(uri)
        # base_dir = parsed.netloc + parsed.path
        selector = FileSelector(base_dir, allow_not_found=True, recursive=True)
        scheme = parsed.scheme if parsed.scheme else "file"
        return [
            scheme + "://" + finfo.path for finfo in fs.get_file_info(selector)
            if finfo.path.endswith(".parquet")
        ]
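A minimal standalone sketch of the listing pattern used by resolve() above, assuming a hypothetical local directory /tmp/dataset that contains Parquet files:

from pyarrow.fs import FileSystem, FileSelector

fs, base_dir = FileSystem.from_uri("file:///tmp/dataset")
selector = FileSelector(base_dir, allow_not_found=True, recursive=True)
parquet_files = [info.path for info in fs.get_file_info(selector)
                 if info.path.endswith(".parquet")]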
Example #13
def test_filesystem_from_uri_s3(minio_server):
    from pyarrow.fs import S3FileSystem

    address, access_key, secret_key = minio_server
    uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}" \
        .format(access_key, secret_key, urllib.parse.quote(address))

    fs, path = FileSystem.from_uri(uri)
    assert isinstance(fs, S3FileSystem)
    assert path == "mybucket/foo/bar"

    fs.create_dir(path)
    [st] = fs.get_target_stats([path])
    assert st.path == path
    assert st.type == FileType.Directory
Example #14
def test_filesystem_from_uri_s3(s3_connection, s3_server):
    from pyarrow.fs import S3FileSystem

    host, port, access_key, secret_key = s3_connection

    uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}" \
        .format(access_key, secret_key, host, port)

    fs, path = FileSystem.from_uri(uri)
    assert isinstance(fs, S3FileSystem)
    assert path == "mybucket/foo/bar"

    fs.create_dir(path)
    [info] = fs.get_file_info([path])
    assert info.path == path
    assert info.type == FileType.Directory
Example #15
    def get_schema(self, uri: str):
        fs, base_dir = FileSystem.from_uri(normalize_uri(uri))
        selector = FileSelector(base_dir, allow_not_found=True, recursive=True)

        first_parquet = None
        for finfo in fs.get_file_info(selector):
            if finfo.path.endswith(".parquet"):
                first_parquet = finfo.path
                break
        metadata_file = fs.open_input_file(first_parquet)
        metadata = pq.read_metadata(metadata_file)

        kv_metadata = metadata.metadata
        try:
            return json.loads(kv_metadata[self.SPARK_PARQUET_ROW_METADATA])
        except KeyError as exp:
            raise ValueError(
                f"Parquet dataset {uri} is not created via Spark") from exp
Example #16
    def __init__(self,
                 filesystem: FileSystem,
                 path: str,
                 mode: str = "rb",
                 encoding: Optional[str] = "utf-8"):
        """ HDFSFile construct

        Args:
            filesystem : FileSystem instance
            path : Path to file
            mode : read or write mode.
                   Supported: "r", "rb" (default), "w", "wb".
            encoding : text encoding used in text modes; forced to None when
                       a binary mode is used.
        """
        self.filesystem = filesystem
        self.path = path
        self.mode = mode
        self.encoding = None if "b" in mode else encoding
        self._file = filesystem.open(self.path,
                                     mode={
                                         "r": "rb",
                                         "w": "wb"
                                     }.get(mode, mode))
Example #17
def test_filesystem_from_uri(uri, expected_klass, expected_path):
    fs, path = FileSystem.from_uri(uri)
    assert isinstance(fs, expected_klass)
    assert path == expected_path
Example #18
def test_cannot_instantiate_base_filesystem():
    with pytest.raises(TypeError):
        FileSystem()
Example #19
def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server):
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    bucket = 'theirbucket'
    path = 'nested/folder/data.parquet'
    uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format(
        access_key, secret_key, bucket, path, host, port
    )

    fs, path = FileSystem.from_uri(uri)
    assert path == 'theirbucket/nested/folder/data.parquet'

    fs.create_dir(bucket)

    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream(path) as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing the filesystem as a URI
    template = (
        "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format(
            access_key, secret_key, host, port
        )
    )
    cases = [
        ('theirbucket/nested/folder/', '/data.parquet'),
        ('theirbucket/nested/folder', 'data.parquet'),
        ('theirbucket/nested/', 'folder/data.parquet'),
        ('theirbucket/nested', 'folder/data.parquet'),
        ('theirbucket', '/nested/folder/data.parquet'),
        ('theirbucket', 'nested/folder/data.parquet'),
    ]
    for prefix, path in cases:
        uri = template.format(prefix)
        dataset = ds.dataset(path, filesystem=uri, format="parquet")
        assert dataset.to_table().equals(table)

    with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'):
        uri = template.format('/')
        ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri)

    error = (
        "The path component of the filesystem URI must point to a directory "
        "but it has a type: `{}`. The path component is `{}` and the given "
        "filesystem URI is `{}`"
    )

    path = 'theirbucket/doesnt/exist'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('NotFound', path, uri)

    path = 'theirbucket/nested/folder/data.parquet'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('File', path, uri)
Example #20
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file or
        an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has the wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as a URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exist locally, then assume that
        # the path is a URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither a URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
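A hedged sketch of the two documented return shapes, assuming the helper above (together with the private helpers it calls, such as _ensure_fs and _stringify_path) is importable and that the example paths exist locally:

from pyarrow.fs import FileSelector

# an existing directory yields a recursive FileSelector
fs, selector = _ensure_single_source("/tmp/dataset")
assert isinstance(selector, FileSelector)

# an existing file yields a single-item list of paths
fs, paths = _ensure_single_source("/tmp/dataset/part-0.parquet")
assert paths == ["/tmp/dataset/part-0.parquet"]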
Example #21
def __init__(self):
    self._client, _ = FileSystem.from_uri(Envs.HDFS_SERVER)
Example #22
def test_filesystem_from_path_object(path):
    p = pathlib.Path(path)
    fs, path = FileSystem.from_uri(p)
    assert isinstance(fs, LocalFileSystem)
    assert path == p.resolve().absolute().as_posix()