def get_remote_filesystem_and_path(
        protocol: str, uri: str, path: str,
        **storage_options: Any) -> Tuple[AbstractFileSystem, str]:
    """
    For a given protocol, root URI, path and kwargs, return the appropriate FileSystem
    instance and the representation of ``path`` needed to access it on that filesystem.

    :param protocol: protocol of the target FileSystem, e.g. https, file, github
    :param uri: uri of the root of the FileSystem, from where path is expected to join
    :param path: path to the location in the context of the uri
    :param storage_options: arguments passed through to the FileSystem instantiation
    :return: The FileSystem and access path on this FileSystem
    """
    if protocol == "file":
        storage_options.setdefault("auto_mkdir", True)
        uri = Path(uri.replace("file://", "")) / Path(path)
        return LocalFileSystem(**storage_options), uri.as_posix()
    elif protocol in {"http", "https", "s3"}:
        # storage_options are parameters passed through to the request
        path = urllib.parse.quote(path)
        uri = "/".join(s.strip("/") for s in [uri, path])
        fs_class = S3FileSystem if protocol == "s3" else HTTPFileSystem
        return fs_class(**storage_options), uri
    elif protocol in {"sftp", "ssh", "ftp"}:
        inferred_options = infer_storage_options(uri)
        username = storage_options.pop(
            "username", None) or inferred_options.get("username")
        password = storage_options.pop(
            "password", None) or inferred_options.get("password")
        uri = (Path(inferred_options["path"]) / Path(path)).as_posix()
        fs_class = FTPFileSystem if protocol == "ftp" else SFTPFileSystem
        fs = fs_class(host=inferred_options["host"],
                      username=username,
                      password=password,
                      **storage_options)
        if protocol == "ftp":
            try:
                fs.ftp.dir()
            except (TimeoutError, socket.timeout):
                fs.ftp.set_pasv(False)
        return fs, uri
    elif protocol == "github":
        if re.match(r"\w+/\w+", uri):
            uri = f"github://{uri.split('/')[0]}:{uri.split('/')[1]}@master/"
        # infer_storage_options parses a github uri as if it were an ftp uri (it uses
        # urllib.parse.urlsplit under the hood), so the org, repo and sha come back
        # as username, password and host respectively
        inferred_options = infer_storage_options(uri)
        org = inferred_options.get("username")
        repo = inferred_options.get("password")
        sha = inferred_options.get("host") or "master"
        path = (Path(inferred_options.get("path", "")) / Path(path)).as_posix()
        return GithubFileSystem(org=org, repo=repo, sha=sha,
                                **storage_options), path
    else:
        raise NotImplementedError(
            f"Unsupported remote filesystem {protocol}:{uri}")
Example 2
    def wrap(self, url):
        """Wrap a normal ``azure://{container}/{path}`` URL into the
        ``remote://{remote}/{path}`` form."""
        from fsspec.utils import infer_storage_options

        url_params = infer_storage_options(url)
        remote_url_params = infer_storage_options(self.remote_options["base"])

        url = url_params["host"] + url_params["path"]
        remote_url = remote_url_params["host"] + remote_url_params["path"]

        if url.startswith(remote_url):
            return f"remote://{self.DEFAULT_REMOTE}/{url[len(remote_url):]}"
        else:
            raise ValueError(f"url {url!r} doesn't match with {remote_url!r}")
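The host/path concatenation above rebuilds the container-relative location,
because infer_storage_options returns the container as "host". A quick
illustration (the URL is made up):

from fsspec.utils import infer_storage_options

p = infer_storage_options("azure://container/dir/data.csv")
print(p["host"] + p["path"])  # container/dir/data.csv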
Example 3
    def _get_kwargs_from_urls(urlpath):
        from fsspec.utils import infer_storage_options

        options = infer_storage_options(urlpath)
        options.pop("path", None)
        options.pop("protocol", None)
        return options
Example 4
    def _strip_protocol(cls, path: str):
        bucket = infer_storage_options(path).get("host")
        if bucket:
            return path

        bucket = _az_config().get("storage", "container_name", None)
        return f"azure://{bucket}"
Example 5
def get_protocol_and_path(filepath: str,
                          version: Version = None) -> Tuple[str, str]:
    """Parses a filepath into its protocol and path components.

    Args:
        filepath: raw filepath e.g.: `gcs://bucket/test.json`.
        version: instance of ``kedro.io.core.Version`` or None.

    Returns:
        Protocol and path.

    Raises:
        DataSetError: when protocol is http(s) and version is not None.
            Note: HTTP(s) dataset doesn't support versioning.
    """
    options_dict = infer_storage_options(filepath)
    path = options_dict["path"]
    protocol = options_dict["protocol"]

    if protocol in HTTP_PROTOCOLS:
        if version:
            raise DataSetError(
                "HTTP(s) DataSet doesn't support versioning. "
                "Please remove version flag from the dataset configuration.")
        path = path.split(PROTOCOL_DELIMITER, 1)[-1]

    return protocol, path
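A short usage sketch for get_protocol_and_path (filepaths are illustrative):

protocol, path = get_protocol_and_path("gcs://bucket/test.json")
assert (protocol, path) == ("gcs", "bucket/test.json")

protocol, path = get_protocol_and_path("/data/test.json")
assert (protocol, path) == ("file", "/data/test.json")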
Example 6
    def _strip_protocol(cls, path: str):
        """
        Remove the protocol from the input path

        Parameters
        ----------
        path: str
            Path to remove the protocol from

        Returns
        -------
        str
            Returns a path without the protocol
        """
        logging.debug(f"_strip_protocol for {path}")
        ops = infer_storage_options(path)

        # we need to make sure that the path retains
        # the format {host}/{path}
        # here host is the container_name
        if ops.get("host", None):
            ops["path"] = ops["host"] + ops["path"]
        ops["path"] = ops["path"].lstrip("/")

        logging.debug(f"_strip_protocol({path}) = {ops}")
        return ops["path"]
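For instance, assuming an abfs:// style URL where the netloc is the container
name, the method behaves like this sketch:

from fsspec.utils import infer_storage_options

ops = infer_storage_options("abfs://container/folder/data.csv")
print((ops["host"] + ops["path"]).lstrip("/"))  # container/folder/data.csv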
Example 7
 def _get_kwargs_from_urls(paths):
     """ Get the store_name from the urlpath and pass to storage_options """
     ops = infer_storage_options(paths)
     out = {}
     if ops.get("host", None):
         out["store_name"] = ops["host"]
     return out
Example 8
def _prepare_file_arg(
    file: Union[str, List[str], TextIO, Path, BinaryIO, bytes], **kwargs: Any
) -> ContextManager[Union[str, BinaryIO, List[str], List[BinaryIO]]]:
    """
    Utility for read_[csv, parquet] (not to be used by scan_[csv, parquet]).
    The returned value is always usable as a context manager.

    A `StringIO`, `BytesIO` file is returned as a `BytesIO`.
    A local path is returned as a string.
    An http URL is read into a buffer and returned as a `BytesIO`.

    When fsspec is installed, remote files are opened with
    `fsspec.open(file, **kwargs)` or `fsspec.open_files(file, **kwargs)`.
    """

    # Small helper to use a variable as context
    @contextmanager
    def managed_file(file: Any) -> Iterator[Any]:
        try:
            yield file
        finally:
            pass

    if isinstance(file, StringIO):
        return BytesIO(file.read().encode("utf8"))
    if isinstance(file, BytesIO):
        return managed_file(file)
    if isinstance(file, Path):
        return managed_file(format_path(file))
    if isinstance(file, str):
        if _WITH_FSSPEC:
            if infer_storage_options(file)["protocol"] == "file":
                return managed_file(format_path(file))
            return fsspec.open(file, **kwargs)
        if file.startswith("http"):
            return _process_http_file(file)
    if isinstance(file, list) and bool(file) and all(
            isinstance(f, str) for f in file):
        if _WITH_FSSPEC:
            if all(
                    infer_storage_options(f)["protocol"] == "file"
                    for f in file):
                return managed_file([format_path(f) for f in file])
            return fsspec.open_files(file, **kwargs)
    if isinstance(file, str):
        file = format_path(file)
    return managed_file(file)
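The string branches above dispatch on the inferred protocol; a quick check of
that dispatch (paths are illustrative):

from fsspec.utils import infer_storage_options

infer_storage_options("data/local.csv")["protocol"]        # 'file' -> opened directly
infer_storage_options("s3://bucket/data.csv")["protocol"]  # 's3' -> fsspec.open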
Example 9
 def _strip_protocol(cls, path):
     ops = infer_storage_options(path)
     path = ops["path"]
     # infer_storage_options leaves file:/ prefixes alone
     # for local hdfs instances
     if path.startswith("file:"):
         path = path[5:]
     return path
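A sketch of the two cases handled above (paths are illustrative). Single-slash
file:/ URIs pass through infer_storage_options untouched, which is why the
prefix has to be stripped manually:

from fsspec.utils import infer_storage_options

infer_storage_options("hdfs://node:8020/data/x.csv")["path"]  # '/data/x.csv'
infer_storage_options("file:/data/x.csv")["path"]             # 'file:/data/x.csv'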
Example 10
 def _get_kwargs_from_urls(paths):
     """ Get the store_name from the urlpath and pass to storage_options """
     logging.debug("Getting kwargs from urls...")
     ops = infer_storage_options(paths)
     out = {}
     if ops.get("host", None):
         out["container_name"] = ops["host"]
     logging.debug(f"kwargs are:  {out}")
     return out
Example 11
    def __init__(
        self,
        filepath: str,
        load_args: Dict[str, Any] = None,
        save_args: Dict[str, Any] = None,
        version: Version = None,
        credentials: Dict[str, Any] = None,
        fs_args: Dict[str, Any] = None,
    ) -> None:
        """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file
        on a specific filesystem.

        Args:
            filepath: Filepath to a JSON file prefixed with a protocol like `s3://`.
                If prefix is not provided `file` protocol (local filesystem) will be used.
                The prefix should be any protocol supported by ``fsspec`` except `http(s)`.
            load_args: Pandas options for loading JSON files.
                Here you can find all available arguments:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
                All defaults are preserved.
            save_args: Pandas options for saving JSON files.
                Here you can find all available arguments:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
                All defaults are preserved, except "index", which is set to False.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.
            credentials: Credentials required to get access to the underlying filesystem.
                E.g. for ``GCSFileSystem`` it should look like `{'token': None}`.
            fs_args: Extra arguments to pass into underlying filesystem class.
                E.g. for ``GCSFileSystem`` class: `{project: 'my-project', ...}`
        """
        _fs_args = deepcopy(fs_args) or {}
        _credentials = deepcopy(credentials) or {}
        options_dict = infer_storage_options(filepath)

        self._protocol = options_dict["protocol"]
        self._fs = fsspec.filesystem(self._protocol, **_credentials,
                                     **_fs_args)

        super().__init__(
            filepath=PurePosixPath(options_dict["path"]),
            version=version,
            exists_function=self._fs.exists,
            glob_function=self._fs.glob,
        )

        # Handle default load and save arguments
        self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
        if load_args is not None:
            self._load_args.update(load_args)
        self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
        if save_args is not None:
            self._save_args.update(save_args)
Example 12
 def _get_kwargs_from_urls(path):
     ops = infer_storage_options(path)
     out = {}
     if ops.get("host", None):
         out["host"] = ops["host"]
     if ops.get("username", None):
         out["user"] = ops["username"]
     if ops.get("port", None):
         out["port"] = ops["port"]
     return out
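Illustrative input and output for the helper above (the URL is made up):

print(_get_kwargs_from_urls("ssh://alice@example.com:2222/home/alice"))
# {'host': 'example.com', 'user': 'alice', 'port': 2222}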
Example 13
    def _strip_protocol(cls, path):
        ops = infer_storage_options(path)

        # we need to make sure that the path retains
        # the format {host}/{path}
        # here host is the container_name
        if ops.get("host", None):
            ops["path"] = ops["host"] + ops["path"]
        ops["path"] = ops["path"].lstrip("/")

        logging.debug(f"_strip_protocol({path}) = {ops}")
        return ops["path"]
Example 14
    def _open_dataset(self):
        import xarray as xr
        import fsspec
        assert fsspec.__version__ >= "0.3.6", "zarr plugin requires fsspec >= 0.3.6"
        from fsspec import filesystem, get_mapper
        from fsspec.utils import update_storage_options, infer_storage_options

        storage_options = infer_storage_options(self.urlpath)
        update_storage_options(storage_options, self.storage_options)
        self._fs = filesystem(storage_options['protocol'])
        if storage_options['protocol'] != 'file':
            self._mapper = get_mapper(self.urlpath)
            self._ds = xr.open_zarr(self._mapper, **self.kwargs)
        else:
            self._ds = xr.open_zarr(self.urlpath, **self.kwargs)
Example 15
def sanitize_path(path):
    """Utility for cleaning up paths."""

    storage_option = infer_storage_options(path)

    protocol = storage_option['protocol']
    if protocol in ('http', 'https'):
        # Most FSs remove the protocol but not HTTPFS. We need to strip
        # it to match properly.
        path = os.path.normpath(path.replace("{}://".format(protocol), ''))
    elif protocol == 'file':
        # Remove trailing slashes from file paths.
        path = os.path.normpath(path)
        # Remove colons (Windows drive-letter separators)
        path = path.replace(':', '')
    # Otherwise we just make sure that path is posix
    return make_path_posix(path)
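Illustrative behaviour of the HTTP branch (local-path results additionally
depend on the platform's os.path.normpath):

print(sanitize_path("https://host.com/data/test.csv"))  # host.com/data/test.csv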
Example 16
    def __init__(self, **config):
        from fsspec.utils import infer_storage_options

        super().__init__(**config)

        self.url = config["url"]
        opts = infer_storage_options(self.url)

        if not opts["host"]:
            raise DvcException(
                "Empty GDrive URL '{}'. Learn more at {}".format(
                    config["url"],
                    format_link("https://man.dvc.org/remote/add"),
                )
            )

        self._bucket = opts["host"]
        self._path = opts["path"].lstrip("/")
        self._trash_only = config.get("gdrive_trash_only")
        self._use_service_account = config.get("gdrive_use_service_account")
        self._service_account_user_email = config.get(
            "gdrive_service_account_user_email"
        )
        self._service_account_json_file_path = config.get(
            "gdrive_service_account_json_file_path"
        )
        self._client_id = config.get("gdrive_client_id")
        self._client_secret = config.get("gdrive_client_secret")
        self._validate_config()

        tmp_dir = config["gdrive_credentials_tmp_dir"]
        assert tmp_dir

        self._gdrive_service_credentials_path = tmp_fname(
            os.path.join(tmp_dir, "")
        )
        self._gdrive_user_credentials_path = (
            tmp_fname(os.path.join(tmp_dir, ""))
            if os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA)
            else config.get(
                "gdrive_user_credentials_file",
                os.path.join(tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE),
            )
        )
Example 17
    def __init__(self, filepath: str, credentials: Dict[str, Any] = None,) -> None:
        """
        Instantiate a ``Tool`` object, meant to save and load data

        Args:
            filepath (str): Filepath to a file. May include protocol prefix,
                             e.g. `s3://`. With no prefix, assumes local filesystem.
                             Prefixes can include any protocol supported by ``fsspec``
            credentials (Dict[str, Any]): Credentials required to access to the
                                          filesystem as keys and values.
                                          e.g. {"my_token": "ABCD1234"}
        """
        storage_options = infer_storage_options(filepath)
        self.protocol = storage_options["protocol"]
        self.filepath = storage_options["path"]

        if not credentials:
            credentials = {}

        self.filesystem = fsspec.filesystem(protocol=self.protocol, **credentials)
Example 18
def create_presigned_url(filepath: str, expiration: int = 120) -> str:
    """
    Generate a pre-signed URL to share an S3 object

    Args:
        filepath (str): Path to the file to create a pre-signed URL to download
        expiration (int): Time in seconds before the link expires. Default 2 minutes.

    Returns:
        str: Pre-signed URL pointing to the file to be downloaded
    """

    # Generate a pre-signed URL for the S3 object
    storage_options = infer_storage_options(filepath)
    protocol = storage_options["protocol"]
    filepath = storage_options["path"]

    filesystem = fsspec.filesystem(protocol=protocol)

    # The response contains the pre-signed URL
    return filesystem.url(path=filepath, expires=expiration)
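A hedged usage sketch (the bucket and key are made up; this needs s3fs
installed and valid AWS credentials available in the environment):

url = create_presigned_url("s3://my-bucket/report.pdf", expiration=300)
# e.g. 'https://my-bucket.s3.amazonaws.com/report.pdf?X-Amz-...' (signed URL)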
Example 19
    def _get_kwargs_from_urls(path):
        opts = infer_storage_options(path)
        out = {"owner": opts["username"], "title": opts["password"]}
        if opts["host"]:
            out["model_pk"] = opts["host"]

        path = opts["path"]
        if len(path) > 0:
            parts = path[1:].split("/")
        else:
            parts = []

        if len(parts) > 0 and parts[0] in ("inputs", "outputs", "owner",
                                           "title"):
            out["resource"] = parts[0]

        if len(parts) > 1 and parts[1] in ("adjustment", "meta_parameters",
                                           "title"):
            out["field"] = parts[1]

        if len(parts) > 2 and parts[1] == "adjustment":
            out["section"] = parts[2]

        return out
Example 20
 def _strip_protocol(cls, path):
     ops = infer_storage_options(path)
     return ops["path"]
Example 21
 def _trim_filename(self, fn, **kwargs):
     """ Determine what kind of filestore this is and return the path """
     so = infer_storage_options(fn)
     fileparts = so["path"]
     return fileparts
Example 22
 def _get_kwargs_from_urls(urlpath):
     ops = infer_storage_options(urlpath)
     if "host" in ops:
         return {"bucket": ops["host"]}
     return {}
Example 23
def test_infer_options():
    so = infer_storage_options("/mnt/datasets/test.csv")
    assert so.pop("protocol") == "file"
    assert so.pop("path") == "/mnt/datasets/test.csv"
    assert not so

    assert infer_storage_options("./test.csv")["path"] == "./test.csv"
    assert infer_storage_options("../test.csv")["path"] == "../test.csv"

    so = infer_storage_options("C:\\test.csv")
    assert so.pop("protocol") == "file"
    assert so.pop("path") == "C:\\test.csv"
    assert not so

    assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv"
    assert infer_storage_options("\\test.csv")["path"] == "\\test.csv"
    assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv"
    assert infer_storage_options("test.csv")["path"] == "test.csv"

    so = infer_storage_options(
        "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm",
        inherit_storage_options={"extra": "value"},
    )
    assert so.pop("protocol") == "hdfs"
    assert so.pop("username") == "username"
    assert so.pop("password") == "pwd"
    assert so.pop("host") == "Node"
    assert so.pop("port") == 123
    assert so.pop("path") == "/mnt/datasets/test.csv#fragm"
    assert so.pop("url_query") == "q=1"
    assert so.pop("url_fragment") == "fragm"
    assert so.pop("extra") == "value"
    assert not so

    so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv")
    assert so.pop("username") == "User-name"
    assert so.pop("host") == "Node-name.com"

    u = "http://127.0.0.1:8080/test.csv"
    assert infer_storage_options(u) == {"protocol": "http", "path": u}

    # For s3 and gcs the netloc is actually the bucket name, so we want to
    # include it in the path. Test that:
    # - Parsing doesn't lowercase the bucket
    # - The bucket is included in path
    for protocol in ["s3", "gcs", "gs"]:
        options = infer_storage_options("%s://Bucket-name.com/test.csv" % protocol)
        assert options["path"] == "Bucket-name.com/test.csv"

    with pytest.raises(KeyError):
        infer_storage_options("file:///bucket/file.csv", {"path": "collide"})
    with pytest.raises(KeyError):
        infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"})
Example 24
def test_infer_storage_options_c(urlpath, expected_path):
    so = infer_storage_options(urlpath)
    assert so["protocol"] == "file"
    assert so["path"] == expected_path
Example 25
def test_infer_simple():
    out = infer_storage_options("//mnt/datasets/test.csv")
    assert out["protocol"] == "file"
    assert out["path"] == "//mnt/datasets/test.csv"
    assert out.get("host", None) is None
Example 26
 def _strip_protocol(cls, path):
     ops = infer_storage_options(path)
     ops["path"] = ops["path"].lstrip("/")
     logging.debug(f"_strip_protocol:  {ops}")
     return ops["path"]
Example 27
File: azure.py Project: jhhuh/dvc
    def _strip_protocol(cls, path: str):
        opts = infer_storage_options(path)
        if opts.get("host"):
            return "{host}{path}".format(**opts)

        return _az_config().get("storage", "container_name", None)
Example 28
def _glob_path_to_fs(glob_path):
    inferred = infer_storage_options(glob_path)
    inferred.pop("path", None)
    return fsspec.filesystem(**inferred)
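Everything except "path" is forwarded to fsspec.filesystem. For an
illustrative S3 glob the bucket stays inside "path", so only the protocol
survives:

from fsspec.utils import infer_storage_options

infer_storage_options("s3://bucket/raw/*.csv")
# {'protocol': 's3', 'path': 'bucket/raw/*.csv'} -> fsspec.filesystem(protocol='s3')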
Example 29
    def __init__(  # pylint: disable=too-many-arguments
        self,
        path: str,
        dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]],
        filepath_arg: str = "filepath",
        filename_suffix: str = "",
        credentials: Dict[str, Any] = None,
        load_args: Dict[str, Any] = None,
        fs_args: Dict[str, Any] = None,
    ):
        """Creates a new instance of ``PartitionedDataSet``.

        Args:
            path: Path to the folder containing partitioned data.
                If path starts with the protocol (e.g., ``s3://``) then the
                corresponding ``fsspec`` concrete filesystem implementation will
                be used. If protocol is not specified,
                ``fsspec.implementations.local.LocalFileSystem`` will be used.
                **Note:** Some concrete implementations are bundled with ``fsspec``,
                while others (like ``s3`` or ``gcs``) must be installed separately
                prior to usage of the ``PartitionedDataSet``.
            dataset: Underlying dataset definition. This is used to instantiate
                the dataset for each file located inside the ``path``.
                Accepted formats are:
                a) object of a class that inherits from ``AbstractDataSet``
                b) a string representing a fully qualified class name to such class
                c) a dictionary with ``type`` key pointing to a string from b),
                other keys are passed to the Dataset initializer.
                Credentials for the dataset can be explicitly specified in
                this configuration.
            filepath_arg: Underlying dataset initializer argument that will
                contain a path to each corresponding partition file.
                If unspecified, defaults to "filepath".
            filename_suffix: If specified, only partitions that end with this
                string will be processed.
            credentials: Protocol-specific options that will be passed to
                ``fsspec.filesystem``
                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem
                and the dataset initializer. If the dataset config contains
                explicit credentials spec, then such spec will take precedence.
                All possible credentials management scenarios are documented here:
                https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials
            load_args: Keyword arguments to be passed into ``find()`` method of
                the filesystem implementation.
            fs_args: Extra arguments to pass into underlying filesystem class constructor
                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``)

        Raises:
            DataSetError: If versioning is enabled for the underlying dataset.
        """
        # pylint: disable=import-outside-toplevel
        from fsspec.utils import infer_storage_options  # for performance reasons

        super().__init__()

        self._path = path
        self._filename_suffix = filename_suffix
        self._protocol = infer_storage_options(self._path)["protocol"]
        self._partition_cache = Cache(maxsize=1)

        dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
        self._dataset_type, self._dataset_config = parse_dataset_definition(
            dataset)
        if VERSION_KEY in self._dataset_config:
            raise DataSetError(
                "`{}` does not support versioning of the underlying dataset. "
                "Please remove `{}` flag from the dataset definition.".format(
                    self.__class__.__name__, VERSIONED_FLAG_KEY))

        if credentials:
            if CREDENTIALS_KEY in self._dataset_config:
                self._logger.warning(
                    KEY_PROPAGATION_WARNING,
                    {
                        "keys": CREDENTIALS_KEY,
                        "target": "underlying dataset"
                    },
                )
            else:
                self._dataset_config[CREDENTIALS_KEY] = deepcopy(credentials)

        self._credentials = deepcopy(credentials) or {}

        self._fs_args = deepcopy(fs_args) or {}
        if self._fs_args:
            if "fs_args" in self._dataset_config:
                self._logger.warning(
                    KEY_PROPAGATION_WARNING,
                    {
                        "keys": "filesystem arguments",
                        "target": "underlying dataset"
                    },
                )
            else:
                self._dataset_config["fs_args"] = deepcopy(self._fs_args)

        self._filepath_arg = filepath_arg
        if self._filepath_arg in self._dataset_config:
            warn(
                "`{}` key must not be specified in the dataset definition as it "
                "will be overwritten by partition path".format(
                    self._filepath_arg))

        self._load_args = deepcopy(load_args) or {}
        self._sep = self._filesystem.sep
        # since some filesystem implementations may implement a global cache
        self._invalidate_caches()
Example 30
    def _strip_protocol(cls, path: str) -> str:
        from fsspec.utils import infer_storage_options

        return infer_storage_options(path)["path"].lstrip("/")