def get_remote_filesystem_and_path(
    protocol: str, uri: str, path: str, **storage_options: Any
) -> Tuple[AbstractFileSystem, str]:
    """
    For a given protocol, root uri, path and kwargs, returns the appropriate
    FileSystem representation and the representation of path on this
    FileSystem required to access path.

    :param protocol: protocol of the target FileSystem, e.g. https, file, github
    :param uri: uri of the root of the FileSystem, from where path is expected to join
    :param path: path to the location in the context of the uri
    :param storage_options: arguments passed through to the FileSystem instantiation
    :return: The FileSystem and access path on this FileSystem
    """
    if protocol == "file":
        storage_options.setdefault("auto_mkdir", True)
        uri = Path(uri.replace("file://", "")) / Path(path)
        return LocalFileSystem(**storage_options), uri.as_posix()
    elif protocol in {"http", "https", "s3"}:
        # storage_options are parameters passed to request
        path = urllib.parse.quote(path)
        uri = "/".join(s.strip("/") for s in [uri, path])
        fs_class = S3FileSystem if protocol == "s3" else HTTPFileSystem
        return fs_class(**storage_options), uri
    elif protocol in {"sftp", "ssh", "ftp"}:
        inferred_options = infer_storage_options(uri)
        username = storage_options.pop("username", None) or inferred_options.get("username")
        password = storage_options.pop("password", None) or inferred_options.get("password")
        uri = (Path(inferred_options["path"]) / Path(path)).as_posix()
        fs_class = FTPFileSystem if protocol == "ftp" else SFTPFileSystem
        fs = fs_class(
            host=inferred_options["host"],
            username=username,
            password=password,
            **storage_options,
        )
        if protocol == "ftp":
            try:
                fs.ftp.dir()
            except (TimeoutError, socket.timeout):
                # fall back to active mode if passive FTP times out
                fs.ftp.set_pasv(False)
        return fs, uri
    elif protocol == "github":
        if re.match(r"\w+/\w+", uri):
            uri = f"github://{uri.split('/')[0]}:{uri.split('/')[1]}@master/"
        # infer_storage_options reads the org, repo and sha of a github uri
        # "incorrectly" (as if it were an ftp uri), because it uses
        # urllib.parse.urlsplit under the hood
        inferred_options = infer_storage_options(uri)
        org = inferred_options.get("username")
        repo = inferred_options.get("password")
        sha = inferred_options.get("host") or "master"
        path = (Path(inferred_options.get("path", "")) / Path(path)).as_posix()
        return GithubFileSystem(org=org, repo=repo, sha=sha, **storage_options), path
    else:
        raise NotImplementedError(f"Unsupported remote filesystem {protocol}:{uri}")
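# A minimal usage sketch for the router above (hedged: it assumes the same
# imports the function itself relies on -- LocalFileSystem, Path, etc. --
# and the /tmp/data layout is hypothetical):
fs, access_path = get_remote_filesystem_and_path(
    protocol="file", uri="file:///tmp/data", path="raw/table.csv"
)
assert access_path == "/tmp/data/raw/table.csv"  # fs is a LocalFileSystem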
def wrap(self, url):
    """Rewrite a URL of the form azure://{url}/{something} as
    remote://{remote}/{something}."""
    from fsspec.utils import infer_storage_options

    url_params = infer_storage_options(url)
    remote_url_params = infer_storage_options(self.remote_options["base"])

    url = url_params["host"] + url_params["path"]
    remote_url = remote_url_params["host"] + remote_url_params["path"]
    if url.startswith(remote_url):
        return f"remote://{self.DEFAULT_REMOTE}/{url[len(remote_url):]}"
    else:
        raise ValueError(f"url {url!r} doesn't match with {remote_url!r}")
def _get_kwargs_from_urls(urlpath):
    from fsspec.utils import infer_storage_options

    options = infer_storage_options(urlpath)
    options.pop("path", None)
    options.pop("protocol", None)
    return options
def _strip_protocol(cls, path: str):
    bucket = infer_storage_options(path).get("host")
    if bucket:
        return path
    bucket = _az_config().get("storage", "container_name", None)
    return f"azure://{bucket}"
def get_protocol_and_path(filepath: str, version: Version = None) -> Tuple[str, str]:
    """Parses filepath into protocol and path.

    Args:
        filepath: raw filepath e.g.: `gcs://bucket/test.json`.
        version: instance of ``kedro.io.core.Version`` or None.

    Returns:
        Protocol and path.

    Raises:
        DataSetError: when protocol is http(s) and version is not None.
            Note: HTTP(s) dataset doesn't support versioning.
    """
    options_dict = infer_storage_options(filepath)
    path = options_dict["path"]
    protocol = options_dict["protocol"]

    if protocol in HTTP_PROTOCOLS:
        if version:
            raise DataSetError(
                "HTTP(s) DataSet doesn't support versioning. "
                "Please remove version flag from the dataset configuration."
            )
        path = path.split(PROTOCOL_DELIMITER, 1)[-1]

    return protocol, path
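# Hedged sanity check for get_protocol_and_path, treating the snippet as a
# free function with its module-level names (HTTP_PROTOCOLS, DataSetError,
# infer_storage_options) in scope. For s3/gcs URLs fsspec keeps the bucket
# in the path, as the tests further down confirm:
assert get_protocol_and_path("gcs://bucket/test.json") == ("gcs", "bucket/test.json")
assert get_protocol_and_path("/tmp/test.json") == ("file", "/tmp/test.json")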
def _strip_protocol(cls, path: str):
    """
    Remove the protocol from the input path.

    Parameters
    ----------
    path: str
        Path to remove the protocol from

    Returns
    -------
    str
        Returns a path without the protocol
    """
    logging.debug(f"_strip_protocol for {path}")
    ops = infer_storage_options(path)

    # we need to make sure that the path retains
    # the format {host}/{path}; here host is the container_name
    if ops.get("host", None):
        ops["path"] = ops["host"] + ops["path"]
    ops["path"] = ops["path"].lstrip("/")

    logging.debug(f"_strip_protocol({path}) = {ops}")
    return ops["path"]
def _get_kwargs_from_urls(paths):
    """Get the store_name from the urlpath and pass it to storage_options."""
    ops = infer_storage_options(paths)
    out = {}
    if ops.get("host", None):
        out["store_name"] = ops["host"]
    return out
def _prepare_file_arg(
    file: Union[str, List[str], TextIO, Path, BinaryIO, bytes], **kwargs: Any
) -> ContextManager[Union[str, BinaryIO, List[str], List[BinaryIO]]]:
    """
    Utility for read_[csv, parquet]. (not to be used by scan_[csv, parquet]).
    Returned value is always usable as a context.

    A `StringIO`, `BytesIO` file is returned as a `BytesIO`.
    A local path is returned as a string.
    An http URL is read into a buffer and returned as a `BytesIO`.

    When fsspec is installed, remote file(s) is (are) opened with
    `fsspec.open(file, **kwargs)` or `fsspec.open_files(file, **kwargs)`.
    """

    # Small helper to use a variable as context
    @contextmanager
    def managed_file(file: Any) -> Iterator[Any]:
        try:
            yield file
        finally:
            pass

    if isinstance(file, StringIO):
        return BytesIO(file.read().encode("utf8"))
    if isinstance(file, BytesIO):
        return managed_file(file)
    if isinstance(file, Path):
        return managed_file(format_path(file))
    if isinstance(file, str):
        if _WITH_FSSPEC:
            if infer_storage_options(file)["protocol"] == "file":
                return managed_file(format_path(file))
            return fsspec.open(file, **kwargs)
        if file.startswith("http"):
            return _process_http_file(file)
    if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file):
        if _WITH_FSSPEC:
            if all(infer_storage_options(f)["protocol"] == "file" for f in file):
                return managed_file([format_path(f) for f in file])
            return fsspec.open_files(file, **kwargs)
    if isinstance(file, str):
        file = format_path(file)
    return managed_file(file)
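# Usage sketch (hedged): whichever branch `_prepare_file_arg` takes, the
# result is a context manager, so callers can treat local paths, buffers and
# remote URLs uniformly. `read_some_format` is a hypothetical reader that
# accepts either a path string or a binary file object.
with _prepare_file_arg("data/local.csv") as src:
    df = read_some_format(src)  # hypothetical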
def _strip_protocol(cls, path):
    ops = infer_storage_options(path)
    path = ops["path"]
    # infer_storage_options leaves "file:/" prefixes alone
    # for local hdfs instances
    if path.startswith("file:"):
        path = path[5:]
    return path
def _get_kwargs_from_urls(paths):
    """Get the container_name from the urlpath and pass it to storage_options."""
    logging.debug("Getting kwargs from urls...")
    ops = infer_storage_options(paths)
    out = {}
    if ops.get("host", None):
        out["container_name"] = ops["host"]
    logging.debug(f"kwargs are: {out}")
    return out
def __init__(
    self,
    filepath: str,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
    credentials: Dict[str, Any] = None,
    fs_args: Dict[str, Any] = None,
) -> None:
    """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON
    file on a specific filesystem.

    Args:
        filepath: Filepath to a JSON file prefixed with a protocol like `s3://`.
            If prefix is not provided `file` protocol (local filesystem) will be
            used. The prefix should be any protocol supported by ``fsspec``
            except `http(s)`.
        load_args: Pandas options for loading JSON files.
            Here you can find all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
            All defaults are preserved.
        save_args: Pandas options for saving JSON files.
            Here you can find all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
            All defaults are preserved, but "index", which is set to False.
        version: If specified, should be an instance of ``kedro.io.core.Version``.
            If its ``load`` attribute is None, the latest version will be loaded.
            If its ``save`` attribute is None, save version will be autogenerated.
        credentials: Credentials required to get access to the underlying filesystem.
            E.g. for ``GCSFileSystem`` it should look like `{'token': None}`.
        fs_args: Extra arguments to pass into underlying filesystem class.
            E.g. for ``GCSFileSystem`` class: `{project: 'my-project', ...}`
    """
    _fs_args = deepcopy(fs_args) or {}
    _credentials = deepcopy(credentials) or {}
    options_dict = infer_storage_options(filepath)
    self._protocol = options_dict["protocol"]
    self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)
    super().__init__(
        filepath=PurePosixPath(options_dict["path"]),
        version=version,
        exists_function=self._fs.exists,
        glob_function=self._fs.glob,
    )

    # Handle default load and save arguments
    self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
    if load_args is not None:
        self._load_args.update(load_args)
    self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
    if save_args is not None:
        self._save_args.update(save_args)
def _get_kwargs_from_urls(path):
    ops = infer_storage_options(path)
    out = {}
    if ops.get("host", None):
        out["host"] = ops["host"]
    if ops.get("username", None):
        out["user"] = ops["username"]
    if ops.get("port", None):
        out["port"] = ops["port"]
    return out
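# Hedged demo of the mapping above, treating the snippet as a free function:
# fsspec reports "username", while the SSH-style constructor this feeds
# expects "user".
assert _get_kwargs_from_urls("sftp://alice@example.com:2222/data.csv") == {
    "host": "example.com",
    "user": "alice",
    "port": 2222,
}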
def _strip_protocol(cls, path):
    ops = infer_storage_options(path)
    # we need to make sure that the path retains
    # the format {host}/{path}; here host is the container_name
    if ops.get("host", None):
        ops["path"] = ops["host"] + ops["path"]
    ops["path"] = ops["path"].lstrip("/")
    logging.debug(f"_strip_protocol({path}) = {ops}")
    return ops["path"]
def _open_dataset(self):
    import xarray as xr
    import fsspec

    # NB: string comparison is lexicographic and misorders versions
    # like "0.10.0"; packaging.version.Version would be more robust
    assert fsspec.__version__ >= "0.3.6", "zarr plugin requires fsspec >= 0.3.6"

    from fsspec import filesystem, get_mapper
    from fsspec.utils import update_storage_options, infer_storage_options

    storage_options = infer_storage_options(self.urlpath)
    update_storage_options(storage_options, self.storage_options)

    self._fs = filesystem(storage_options["protocol"])
    if storage_options["protocol"] != "file":
        self._mapper = get_mapper(self.urlpath)
        self._ds = xr.open_zarr(self._mapper, **self.kwargs)
    else:
        self._ds = xr.open_zarr(self.urlpath, **self.kwargs)
def sanitize_path(path):
    """Utility for cleaning up paths."""
    storage_option = infer_storage_options(path)

    protocol = storage_option['protocol']
    if protocol in ('http', 'https'):
        # Most FSs remove the protocol but not HTTPFS. We need to strip
        # it to match properly.
        path = os.path.normpath(path.replace("{}://".format(protocol), ''))
    elif protocol == 'file':
        # Remove trailing slashes from file paths.
        path = os.path.normpath(path)
        # Remove colons
        path = path.replace(':', '')

    # Otherwise we just make sure that path is posix
    return make_path_posix(path)
def __init__(self, **config):
    from fsspec.utils import infer_storage_options

    super().__init__(**config)

    self.url = config["url"]
    opts = infer_storage_options(self.url)

    if not opts["host"]:
        raise DvcException(
            "Empty GDrive URL '{}'. Learn more at {}".format(
                config["url"],
                format_link("https://man.dvc.org/remote/add"),
            )
        )

    self._bucket = opts["host"]
    self._path = opts["path"].lstrip("/")
    self._trash_only = config.get("gdrive_trash_only")
    self._use_service_account = config.get("gdrive_use_service_account")
    self._service_account_user_email = config.get(
        "gdrive_service_account_user_email"
    )
    self._service_account_json_file_path = config.get(
        "gdrive_service_account_json_file_path"
    )
    self._client_id = config.get("gdrive_client_id")
    self._client_secret = config.get("gdrive_client_secret")
    self._validate_config()

    tmp_dir = config["gdrive_credentials_tmp_dir"]
    assert tmp_dir

    self._gdrive_service_credentials_path = tmp_fname(
        os.path.join(tmp_dir, "")
    )
    self._gdrive_user_credentials_path = (
        tmp_fname(os.path.join(tmp_dir, ""))
        if os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA)
        else config.get(
            "gdrive_user_credentials_file",
            os.path.join(tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE),
        )
    )
def __init__(self, filepath: str, credentials: Dict[str, Any] = None) -> None:
    """
    Instantiate a ``Tool`` object, meant to save and load data.

    Args:
        filepath (str): Filepath to a file. May include a protocol prefix,
            e.g. `s3://`. With no prefix, assumes local filesystem.
            Prefixes can include any protocol supported by ``fsspec``.
        credentials (Dict[str, Any]): Credentials required to access the
            filesystem, as keys and values, e.g. {"my_token": "ABCD1234"}.
    """
    storage_options = infer_storage_options(filepath)
    self.protocol = storage_options["protocol"]
    self.filepath = storage_options["path"]
    if not credentials:
        credentials = {}
    self.filesystem = fsspec.filesystem(protocol=self.protocol, **credentials)
def create_presigned_url(filepath: str, expiration: int = 120) -> str:
    """
    Generate a pre-signed URL to share an S3 object.

    Args:
        filepath (str): Path to the file to create a pre-signed URL to download.
        expiration (int): Time in seconds before the link expires. Default 2 minutes.

    Returns:
        str: Pre-signed URL pointing to the file to be downloaded.
    """
    # Generate a pre-signed URL for the S3 object
    storage_options = infer_storage_options(filepath)
    protocol = storage_options["protocol"]
    filepath = storage_options["path"]
    filesystem = fsspec.filesystem(protocol=protocol)

    # The response contains the pre-signed URL
    return filesystem.url(path=filepath, expires=expiration)
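# Usage sketch, hedged: requires s3fs plus AWS credentials available in the
# environment; the bucket and key below are hypothetical.
url = create_presigned_url("s3://my-bucket/reports/2021.csv", expiration=300)
# `url` is shareable and downloads the object for the next five minutes.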
def _get_kwargs_from_urls(path):
    opts = infer_storage_options(path)
    out = {"owner": opts["username"], "title": opts["password"]}
    if opts["host"]:
        out["model_pk"] = opts["host"]

    path = opts["path"]
    if len(path) > 0:
        parts = path[1:].split("/")
    else:
        parts = []

    if len(parts) > 0 and parts[0] in ("inputs", "outputs", "owner", "title"):
        out["resource"] = parts[0]
    if len(parts) > 1 and parts[1] in ("adjustment", "meta_parameters", "title"):
        out["field"] = parts[1]
    if len(parts) > 2 and parts[1] == "adjustment":
        out["section"] = parts[2]
    return out
def _strip_protocol(cls, path):
    ops = infer_storage_options(path)
    return ops["path"]
def _trim_filename(self, fn, **kwargs):
    """Determine what kind of filestore this is and return the path."""
    so = infer_storage_options(fn)
    fileparts = so["path"]
    return fileparts
def _get_kwargs_from_urls(urlpath):
    ops = infer_storage_options(urlpath)
    if "host" in ops:
        return {"bucket": ops["host"]}
    return {}
def test_infer_options():
    so = infer_storage_options("/mnt/datasets/test.csv")
    assert so.pop("protocol") == "file"
    assert so.pop("path") == "/mnt/datasets/test.csv"
    assert not so

    assert infer_storage_options("./test.csv")["path"] == "./test.csv"
    assert infer_storage_options("../test.csv")["path"] == "../test.csv"

    so = infer_storage_options("C:\\test.csv")
    assert so.pop("protocol") == "file"
    assert so.pop("path") == "C:\\test.csv"
    assert not so

    assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv"
    assert infer_storage_options("\\test.csv")["path"] == "\\test.csv"
    assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv"
    assert infer_storage_options("test.csv")["path"] == "test.csv"

    so = infer_storage_options(
        "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm",
        inherit_storage_options={"extra": "value"},
    )
    assert so.pop("protocol") == "hdfs"
    assert so.pop("username") == "username"
    assert so.pop("password") == "pwd"
    assert so.pop("host") == "Node"
    assert so.pop("port") == 123
    assert so.pop("path") == "/mnt/datasets/test.csv#fragm"
    assert so.pop("url_query") == "q=1"
    assert so.pop("url_fragment") == "fragm"
    assert so.pop("extra") == "value"
    assert not so

    so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv")
    assert so.pop("username") == "User-name"
    assert so.pop("host") == "Node-name.com"

    u = "http://127.0.0.1:8080/test.csv"
    assert infer_storage_options(u) == {"protocol": "http", "path": u}

    # For s3 and gcs the netloc is actually the bucket name, so we want to
    # include it in the path. Test that:
    # - Parsing doesn't lowercase the bucket
    # - The bucket is included in path
    for protocol in ["s3", "gcs", "gs"]:
        options = infer_storage_options("%s://Bucket-name.com/test.csv" % protocol)
        assert options["path"] == "Bucket-name.com/test.csv"

    with pytest.raises(KeyError):
        infer_storage_options("file:///bucket/file.csv", {"path": "collide"})
    with pytest.raises(KeyError):
        infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"})
def test_infer_storage_options_c(urlpath, expected_path):
    so = infer_storage_options(urlpath)
    assert so["protocol"] == "file"
    assert so["path"] == expected_path
def test_infer_simple():
    out = infer_storage_options("//mnt/datasets/test.csv")
    assert out["protocol"] == "file"
    assert out["path"] == "//mnt/datasets/test.csv"
    assert out.get("host", None) is None
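# Taken together, the tests above pin down the shape of the dict that
# infer_storage_options returns; a quick sketch (optional keys appear only
# when the URL actually carries them):
from fsspec.utils import infer_storage_options

so = infer_storage_options("hdfs://user@node:123/data.csv")
assert so["protocol"] == "hdfs"
assert so["username"] == "user"
assert so["host"] == "node"
assert so["port"] == 123
assert so["path"] == "/data.csv"

# bucket-style protocols fold the netloc into the path:
assert infer_storage_options("s3://Bucket/key.csv")["path"] == "Bucket/key.csv"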
def _strip_protocol(cls, path):
    ops = infer_storage_options(path)
    ops["path"] = ops["path"].lstrip("/")
    logging.debug(f"_strip_protocol: {ops}")
    return ops["path"]
def _strip_protocol(cls, path: str):
    opts = infer_storage_options(path)
    if opts.get("host"):
        return "{host}{path}".format(**opts)
    return _az_config().get("storage", "container_name", None)
def _glob_path_to_fs(glob_path):
    inferred = infer_storage_options(glob_path)
    inferred.pop("path", None)
    return fsspec.filesystem(**inferred)
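# Hedged demo for _glob_path_to_fs (assumes fsspec plus s3fs are installed):
# for bucket-style protocols fsspec folds the netloc into the path, so after
# popping "path" the call reduces to roughly fsspec.filesystem("s3"); for
# protocols like sftp, host/port/username survive and are forwarded to the
# filesystem constructor.
fs = _glob_path_to_fs("s3://bucket/data/*.parquet")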
def __init__(  # pylint: disable=too-many-arguments
    self,
    path: str,
    dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]],
    filepath_arg: str = "filepath",
    filename_suffix: str = "",
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
    fs_args: Dict[str, Any] = None,
):
    """Creates a new instance of ``PartitionedDataSet``.

    Args:
        path: Path to the folder containing partitioned data. If path starts with
            the protocol (e.g., ``s3://``) then the corresponding ``fsspec``
            concrete filesystem implementation will be used. If protocol is not
            specified, ``fsspec.implementations.local.LocalFileSystem`` will be
            used. **Note:** Some concrete implementations are bundled with
            ``fsspec``, while others (like ``s3`` or ``gcs``) must be installed
            separately prior to usage of the ``PartitionedDataSet``.
        dataset: Underlying dataset definition. This is used to instantiate the
            dataset for each file located inside the ``path``. Accepted formats
            are:
            a) object of a class that inherits from ``AbstractDataSet``
            b) a string representing a fully qualified class name to such class
            c) a dictionary with ``type`` key pointing to a string from b),
            other keys are passed to the Dataset initializer.
            Credentials for the dataset can be explicitly specified in this
            configuration.
        filepath_arg: Underlying dataset initializer argument that will contain
            a path to each corresponding partition file. If unspecified, defaults
            to "filepath".
        filename_suffix: If specified, only partitions that end with this string
            will be processed.
        credentials: Protocol-specific options that will be passed to
            ``fsspec.filesystem``
            https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem
            and the dataset initializer. If the dataset config contains explicit
            credentials spec, then such spec will take precedence. All possible
            credentials management scenarios are documented here:
            https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials
        load_args: Keyword arguments to be passed into ``find()`` method of the
            filesystem implementation.
        fs_args: Extra arguments to pass into underlying filesystem class
            constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``)

    Raises:
        DataSetError: If versioning is enabled for the underlying dataset.
    """
    # pylint: disable=import-outside-toplevel
    from fsspec.utils import infer_storage_options  # for performance reasons

    super().__init__()

    self._path = path
    self._filename_suffix = filename_suffix
    self._protocol = infer_storage_options(self._path)["protocol"]
    self._partition_cache = Cache(maxsize=1)

    dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
    self._dataset_type, self._dataset_config = parse_dataset_definition(dataset)
    if VERSION_KEY in self._dataset_config:
        raise DataSetError(
            "`{}` does not support versioning of the underlying dataset. "
            "Please remove `{}` flag from the dataset definition.".format(
                self.__class__.__name__, VERSIONED_FLAG_KEY
            )
        )

    if credentials:
        if CREDENTIALS_KEY in self._dataset_config:
            self._logger.warning(
                KEY_PROPAGATION_WARNING,
                {"keys": CREDENTIALS_KEY, "target": "underlying dataset"},
            )
        else:
            self._dataset_config[CREDENTIALS_KEY] = deepcopy(credentials)

    self._credentials = deepcopy(credentials) or {}

    self._fs_args = deepcopy(fs_args) or {}
    if self._fs_args:
        if "fs_args" in self._dataset_config:
            self._logger.warning(
                KEY_PROPAGATION_WARNING,
                {"keys": "filesystem arguments", "target": "underlying dataset"},
            )
        else:
            self._dataset_config["fs_args"] = deepcopy(self._fs_args)

    self._filepath_arg = filepath_arg
    if self._filepath_arg in self._dataset_config:
        warn(
            "`{}` key must not be specified in the dataset definition as it "
            "will be overwritten by partition path".format(self._filepath_arg)
        )

    self._load_args = deepcopy(load_args) or {}
    self._sep = self._filesystem.sep
    # since some filesystem implementations may implement a global cache
    self._invalidate_caches()
def _strip_protocol(cls, path: str) -> str:
    from fsspec.utils import infer_storage_options

    return infer_storage_options(path)["path"].lstrip("/")