Example #1
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.FileSelector)
        File system object and either a single-item list pointing to a file or
        an fs.FileSelector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has the wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileType, FileSelector, _resolve_filesystem_and_path

    # at this point we already checked that `path` is a path-like
    filesystem, path = _resolve_filesystem_and_path(path, filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    file_info = filesystem.get_file_info(path)

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
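A minimal sketch of the same directory-vs-file dispatch written against the public pyarrow.fs API; the local path is illustrative, and the list form of get_file_info is used because it works on both older and newer pyarrow versions:

from pyarrow.fs import FileSelector, FileType, LocalFileSystem

fs = LocalFileSystem()
path = "/tmp/data"  # hypothetical path
info = fs.get_file_info([path])[0]
if info.type == FileType.Directory:
    # a directory: hand dataset discovery a recursive selector
    paths_or_selector = FileSelector(path, recursive=True)
elif info.type == FileType.File:
    # a single file: wrap it in a one-element list
    paths_or_selector = [path]
else:
    raise FileNotFoundError(path)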
Example #2
    def get_schema(self, uri: str):
        fs, base_dir = FileSystem.from_uri(normalize_uri(uri))
        selector = FileSelector(base_dir, allow_not_found=True, recursive=True)

        first_parquet = None
        for finfo in fs.get_file_info(selector):
            if finfo.path.endswith(".parquet"):
                first_parquet = finfo.path
                break
        if first_parquet is None:
            raise FileNotFoundError(
                f"No parquet files found under {uri}")
        metadata_file = fs.open_input_file(first_parquet)
        metadata = pq.read_metadata(metadata_file)

        kv_metadata = metadata.metadata
        try:
            return json.loads(kv_metadata[self.SPARK_PARQUET_ROW_METADATA])
        except KeyError as exp:
            raise ValueError(
                f"Parquet dataset {uri} is not created via Spark") from exp
Example #3
def test_s3_real_aws():
    # Exercise connection code with an AWS-backed S3 bucket.
    # This is a minimal integration check for ARROW-9261 and similar issues.
    from pyarrow.fs import FileSelector, S3FileSystem
    default_region = (os.environ.get('PYARROW_TEST_S3_REGION') or 'us-east-1')
    fs = S3FileSystem(anonymous=True)
    assert fs.region == default_region

    fs = S3FileSystem(anonymous=True, region='us-east-2')
    entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data'))
    assert len(entries) > 0
    with fs.open_input_stream('ursa-labs-taxi-data/2019/06/data.parquet') as f:
        md = f.metadata()
        assert 'Content-Type' in md
        assert md['Last-Modified'] == b'2020-01-17T16:26:28Z'
        # For some reason, the header value is quoted
        # (both with AWS and Minio)
        assert md['ETag'] == b'"f1efd5d76cb82861e1542117bfa52b90-8"'
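Instead of hard-coding the bucket's region, newer pyarrow releases can look it up; a sketch, assuming a pyarrow version that ships pyarrow.fs.resolve_s3_region:

from pyarrow.fs import S3FileSystem, resolve_s3_region

bucket = "ursa-labs-taxi-data"
region = resolve_s3_region(bucket)  # "us-east-2" for this bucket
fs = S3FileSystem(anonymous=True, region=region)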
Example #4
def test_hdfs_options(hdfs_server):
    from pyarrow.fs import FileSelector, HdfsOptions, HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    options = HdfsOptions()
    assert options.endpoint == ('', 0)
    options.endpoint = ('localhost', 8080)
    assert options.endpoint == ('localhost', 8080)
    with pytest.raises(TypeError):
        options.endpoint = 'localhost:8000'

    assert options.driver == 'libhdfs'
    options.driver = 'libhdfs3'
    assert options.driver == 'libhdfs3'
    with pytest.raises(ValueError):
        options.driver = 'unknown'

    assert options.replication == 3
    options.replication = 2
    assert options.replication == 2

    assert options.user == ''
    options.user = 'libhdfs'
    assert options.user == 'libhdfs'

    assert options.default_block_size == 0
    options.default_block_size = 128*1024**2
    assert options.default_block_size == 128*1024**2

    assert options.buffer_size == 0
    options.buffer_size = 64*1024
    assert options.buffer_size == 64*1024

    options = HdfsOptions.from_uri('hdfs://localhost:8080/?user=test')
    assert options.endpoint == ('hdfs://localhost', 8080)
    assert options.user == 'test'

    host, port, user = hdfs_server
    uri = "hdfs://{}:{}/?user={}".format(host, port, user)
    fs = HadoopFileSystem(uri)
    assert fs.get_target_stats(FileSelector('/'))
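HdfsOptions and get_target_stats are pre-1.0 pyarrow APIs; on current versions the options are keyword arguments to the HadoopFileSystem constructor and listing goes through get_file_info. A rough modern equivalent, with illustrative host and port:

from pyarrow.fs import FileSelector, HadoopFileSystem

fs = HadoopFileSystem(
    "localhost", 8020, user="test",
    replication=2,
    buffer_size=64 * 1024,
    default_block_size=128 * 1024 ** 2,
)
assert fs.get_file_info(FileSelector('/'))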
Example #5
    def _get_dataset(self, asset):
        # FIXME(roee88): bypass https://issues.apache.org/jira/browse/ARROW-7867
        selector = FileSelector(asset.path,
                                allow_not_found=True,
                                recursive=True)
        try:
            data_files = [
                f.path for f in asset.filesystem.get_file_info(selector)
                if f.size
            ]
        except NotADirectoryError:
            data_files = None
        if not data_files:
            data_files = [asset.path]  # asset.path is probably a single file

        if asset.format in ("csv", "parquet"):
            return ds.dataset(data_files,
                              format=asset.format,
                              filesystem=asset.filesystem)

        raise ValueError("unsupported format {}".format(asset.format))
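The single-file fallback works because pyarrow.dataset.dataset() accepts an explicit list of paths alongside a filesystem; a minimal local sketch with hypothetical paths:

import pyarrow.dataset as ds
from pyarrow.fs import LocalFileSystem

dataset = ds.dataset(
    ["/data/part-0.parquet", "/data/part-1.parquet"],
    format="parquet",
    filesystem=LocalFileSystem(),
)
table = dataset.to_table()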
Example #6
def _expand_directory(
    path: str,
    filesystem: "pyarrow.fs.FileSystem",
    exclude_prefixes: Optional[List[str]] = None,
) -> Tuple[List[str], List["pyarrow.fs.FileInfo"]]:
    """
    Expand the provided directory path to a list of file paths.

    Args:
        path: The directory path to expand.
        filesystem: The filesystem implementation that should be used for
            reading these files.
        exclude_prefixes: The file relative path prefixes that should be
            excluded from the returned file set. Default excluded prefixes are
            "." and "_".

    Returns:
        Two parallel lists: the file paths contained in the provided
        directory, and the corresponding FileInfo objects.
    """
    if exclude_prefixes is None:
        exclude_prefixes = [".", "_"]

    from pyarrow.fs import FileSelector

    selector = FileSelector(path, recursive=True)
    files = filesystem.get_file_info(selector)
    base_path = selector.base_dir
    filtered_paths = []
    for file_ in files:
        if not file_.is_file:
            continue
        file_path = file_.path
        if not file_path.startswith(base_path):
            continue
        relative = file_path[len(base_path) :]
        if any(relative.startswith(prefix) for prefix in exclude_prefixes):
            continue
        filtered_paths.append((file_path, file_))
    # Sort by path to guarantee a stable order, then split into two
    # parallel lists of paths and FileInfo objects.
    filtered_paths.sort(key=lambda x: x[0])
    paths = [p for p, _ in filtered_paths]
    file_infos = [info for _, info in filtered_paths]
    return paths, file_infos
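A usage sketch for the two-list return shape; the directory path is hypothetical:

from pyarrow.fs import LocalFileSystem

paths, file_infos = _expand_directory("/data/blobs", LocalFileSystem())
for path, info in zip(paths, file_infos):
    print(path, info.size)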
Example #7
def test_get_file_info_with_selector(fs, pathfn):
    skip_fsspec_s3fs(fs)

    base_dir = pathfn('selector-dir/')
    file_a = pathfn('selector-dir/test_file_a')
    file_b = pathfn('selector-dir/test_file_b')
    dir_a = pathfn('selector-dir/test_dir_a')

    try:
        fs.create_dir(base_dir)
        with fs.open_output_stream(file_a):
            pass
        with fs.open_output_stream(file_b):
            pass
        fs.create_dir(dir_a)

        selector = FileSelector(base_dir,
                                allow_not_found=False,
                                recursive=True)
        assert selector.base_dir == base_dir

        infos = fs.get_file_info(selector)
        assert len(infos) == 3

        for info in infos:
            if info.path.endswith(file_a):
                assert info.type == FileType.File
            elif info.path.endswith(file_b):
                assert info.type == FileType.File
            elif info.path.rstrip("/").endswith(dir_a):
                assert info.type == FileType.Directory
            else:
                raise ValueError('unexpected path {}'.format(info.path))
            check_mtime_or_absent(info)
    finally:
        fs.delete_file(file_a)
        fs.delete_file(file_b)
        fs.delete_dir(dir_a)
        fs.delete_dir(base_dir)
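The allow_not_found flag passed above decides how a missing base directory is handled: False (the default) makes get_file_info raise FileNotFoundError, while True yields an empty result. A small sketch:

from pyarrow.fs import FileSelector, LocalFileSystem

fs = LocalFileSystem()
selector = FileSelector("/no/such/dir", allow_not_found=True)
assert fs.get_file_info(selector) == []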
Example #8
def _ensure_fs_and_paths(path_or_paths, filesystem=None):
    # Validate and convert the path-likes and filesystem.
    # Returns filesystem and list of string paths or FileSelector
    from pyarrow.fs import FileType, FileSelector

    if isinstance(path_or_paths, list):
        paths_or_selector = [_stringify_path(path) for path in path_or_paths]
        # infer from first path
        filesystem = _ensure_fs(filesystem, paths_or_selector[0])
    else:
        path = _stringify_path(path_or_paths)
        filesystem = _ensure_fs(filesystem, path)
        stats = filesystem.get_target_stats([path])[0]
        if stats.type == FileType.Directory:
            # for directory, pass a selector
            paths_or_selector = FileSelector(path, recursive=True)
        elif stats.type == FileType.File:
            # for a single file path, pass it as a list
            paths_or_selector = [path]
        else:
            raise FileNotFoundError(path)

    return filesystem, paths_or_selector
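This helper predates pyarrow 1.0: get_target_stats and the FileStats objects it returned were later renamed to get_file_info and FileInfo. The equivalent lookup on current versions, sketched with a local filesystem and an illustrative path:

from pyarrow.fs import FileType, LocalFileSystem

filesystem = LocalFileSystem()
info = filesystem.get_file_info(["/tmp/data"])[0]  # formerly get_target_stats
print(info.type == FileType.Directory)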
Example #9
    def list(
        self,
        path: str,
    ) -> List["pyarrow.fs.FileInfo"]:
        """List blobs and sub-dirs in the given path, if possible.

        Examples:
            # List created blobs and dirs at <storage_prefix>/my_app/path
            >>> client = storage.get_client("my_app")
            >>> client.list("path")
            [<FileInfo for '/tmp/storage/my_app/path/foo.txt' type=FileType.File>,
             <FileInfo for '/tmp/storage/my_app/path/subdir' type=FileType.Directory>]

            # Non-existent path.
            >>> client.list("does_not_exist")
            FileNotFoundError: ...

            # Not a directory.
            >>> client.list("path/foo.txt")
            NotADirectoryError: ...

        Args:
            path: Relative directory to list from.

        Returns:
            List of file-info objects for the directory contents.

        Raises:
            FileNotFoundError if the given path is not found.
            NotADirectoryError if the given path isn't a valid directory.
        """
        from pyarrow.fs import FileSelector

        full_path = self._resolve_path(path)
        selector = FileSelector(full_path, recursive=False)
        files = self.fs.get_file_info(selector)
        return files
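Note the two call shapes of get_file_info: on recent pyarrow versions a plain path returns a single FileInfo (with type FileType.NotFound if the path is missing), while a FileSelector returns a list of the directory's entries. A sketch with illustrative paths:

from pyarrow.fs import FileSelector, LocalFileSystem

fs = LocalFileSystem()
info = fs.get_file_info("/tmp/storage/my_app/path")      # single FileInfo
entries = fs.get_file_info(FileSelector("/tmp/storage/my_app/path"))
print(info.type, [e.path for e in entries])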
Example #10
def test_get_target_stats_with_selector(fs, pathfn):
    base_dir = pathfn('selector-dir/')
    file_a = pathfn('selector-dir/test_file_a')
    file_b = pathfn('selector-dir/test_file_b')
    dir_a = pathfn('selector-dir/test_dir_a')

    try:
        fs.create_dir(base_dir)
        with fs.open_output_stream(file_a):
            pass
        with fs.open_output_stream(file_b):
            pass
        fs.create_dir(dir_a)

        selector = FileSelector(base_dir,
                                allow_non_existent=False,
                                recursive=True)
        assert selector.base_dir == base_dir

        stats = fs.get_target_stats(selector)
        assert len(stats) == 3

        for st in stats:
            if st.path.endswith(file_a):
                assert st.type == FileType.File
            elif st.path.endswith(file_b):
                assert st.type == FileType.File
            elif st.path.endswith(dir_a):
                assert st.type == FileType.Directory
            else:
                raise ValueError('unexpected path {}'.format(st.path))
    finally:
        fs.delete_file(file_a)
        fs.delete_file(file_b)
        fs.delete_dir(dir_a)
        fs.delete_dir(base_dir)
Example #11
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.FileSelector)
        File system object and either a single-item list pointing to a file or
        an fs.FileSelector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has the wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as an URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exist locally, then assume that
        # the path is a URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither a URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
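The URI branch above delegates to FileSystem.from_uri, which infers both the filesystem implementation and the path component; a quick sketch, with a made-up bucket name:

from pyarrow.fs import FileSystem

fs, path = FileSystem.from_uri("s3://some-bucket/datasets/taxi")
# fs is an S3FileSystem; path == "some-bucket/datasets/taxi"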
Example #12
def test_hdfs_options(hdfs_connection):
    from pyarrow.fs import FileSelector, HadoopFileSystem
    if not pa.have_libhdfs():
        pytest.skip('Cannot locate libhdfs')

    host, port, user = hdfs_connection

    replication = 2
    buffer_size = 64 * 1024
    default_block_size = 128 * 1024**2
    uri = ('hdfs://{}:{}/?user={}&replication={}&buffer_size={}'
           '&default_block_size={}')

    hdfs1 = HadoopFileSystem(host,
                             port,
                             user='libhdfs',
                             replication=replication,
                             buffer_size=buffer_size,
                             default_block_size=default_block_size)
    hdfs2 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'libhdfs', replication, buffer_size,
                   default_block_size))
    hdfs3 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication, buffer_size,
                   default_block_size))
    hdfs4 = HadoopFileSystem.from_uri(
        uri.format(host, port, 'me', replication + 1, buffer_size,
                   default_block_size))
    hdfs5 = HadoopFileSystem(host, port)
    hdfs6 = HadoopFileSystem.from_uri('hdfs://{}:{}'.format(host, port))
    hdfs7 = HadoopFileSystem(host, port, user='localuser')
    hdfs8 = HadoopFileSystem(host,
                             port,
                             user='localuser',
                             kerb_ticket="cache_path")
    hdfs9 = HadoopFileSystem(host,
                             port,
                             user='localuser',
                             kerb_ticket=pathlib.Path("cache_path"))
    hdfs10 = HadoopFileSystem(host,
                              port,
                              user='localuser',
                              kerb_ticket="cache_path2")
    hdfs11 = HadoopFileSystem(host,
                              port,
                              user='localuser',
                              kerb_ticket="cache_path",
                              extra_conf={'hdfs_token': 'abcd'})

    assert hdfs1 == hdfs2
    assert hdfs5 == hdfs6
    assert hdfs6 != hdfs7
    assert hdfs2 != hdfs3
    assert hdfs3 != hdfs4
    assert hdfs7 != hdfs5
    assert hdfs7 != hdfs8
    assert hdfs8 == hdfs9
    assert hdfs10 != hdfs9
    assert hdfs11 != hdfs8

    with pytest.raises(TypeError):
        HadoopFileSystem()
    with pytest.raises(TypeError):
        HadoopFileSystem.from_uri(3)

    for fs in [
            hdfs1, hdfs2, hdfs3, hdfs4, hdfs5, hdfs6, hdfs7, hdfs8, hdfs9,
            hdfs10, hdfs11
    ]:
        assert pickle.loads(pickle.dumps(fs)) == fs


    hdfs = HadoopFileSystem(host, port, user=user)
    assert hdfs.get_file_info(FileSelector('/'))

    hdfs = HadoopFileSystem.from_uri("hdfs://{}:{}/?user={}".format(
        host, port, user))
    assert hdfs.get_file_info(FileSelector('/'))
Example #13
def test_get_file_info_with_selector(fs, pathfn):
    base_dir = pathfn('selector-dir/')
    file_a = pathfn('selector-dir/test_file_a')
    file_b = pathfn('selector-dir/test_file_b')
    dir_a = pathfn('selector-dir/test_dir_a')
    file_c = pathfn('selector-dir/test_dir_a/test_file_c')
    dir_b = pathfn('selector-dir/test_dir_b')

    try:
        fs.create_dir(base_dir)
        with fs.open_output_stream(file_a):
            pass
        with fs.open_output_stream(file_b):
            pass
        fs.create_dir(dir_a)
        with fs.open_output_stream(file_c):
            pass
        fs.create_dir(dir_b)

        # recursive selector
        selector = FileSelector(base_dir,
                                allow_not_found=False,
                                recursive=True)
        assert selector.base_dir == base_dir

        infos = fs.get_file_info(selector)
        if fs.type_name == "py::fsspec+s3":
            # s3fs only lists directories if they are not empty, but depending
            # on the s3fs/fsspec version combo, it includes the base_dir
            # (https://github.com/dask/s3fs/issues/393)
            assert (len(infos) == 4) or (len(infos) == 5)
        else:
            assert len(infos) == 5

        for info in infos:
            if (info.path.endswith(file_a) or info.path.endswith(file_b)
                    or info.path.endswith(file_c)):
                assert info.type == FileType.File
            elif (info.path.rstrip("/").endswith(dir_a)
                  or info.path.rstrip("/").endswith(dir_b)):
                assert info.type == FileType.Directory
            elif (fs.type_name == "py::fsspec+s3"
                  and info.path.rstrip("/").endswith("selector-dir")):
                # s3fs can include base dir, see above
                assert info.type == FileType.Directory
            else:
                raise ValueError('unexpected path {}'.format(info.path))
            check_mtime_or_absent(info)

        # non-recursive selector -> not selecting the nested file_c
        selector = FileSelector(base_dir, recursive=False)

        infos = fs.get_file_info(selector)
        if fs.type_name == "py::fsspec+s3":
            # s3fs only lists directories if they are not empty
            # + for s3fs 0.5.2 all directories are dropped because of buggy
            # side-effect of previous find() call
            # (https://github.com/dask/s3fs/issues/410)
            assert (len(infos) == 3) or (len(infos) == 2)
        else:
            assert len(infos) == 4

    finally:
        fs.delete_dir(base_dir)
Example #14
    def listdir(self, path):
        selector = FileSelector(path)
        return [f.path for f in self._get_client().get_file_info(selector)]
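A usage sketch, assuming _get_client() returns a pyarrow FileSystem; here it is stubbed with LocalFileSystem:

from pyarrow.fs import FileSelector, LocalFileSystem

class LocalLister:
    def _get_client(self):
        return LocalFileSystem()

    def listdir(self, path):
        selector = FileSelector(path)
        return [f.path for f in self._get_client().get_file_info(selector)]

print(LocalLister().listdir("/tmp"))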