def __new__(
    cls,
    data_set: Union[str, Dict],
    run_id: str = None,
    artifact_path: str = None,
    credentials: Dict[str, Any] = None,
):
    data_set, data_set_args = parse_dataset_definition(config=data_set)

    # fake inheritance: ideally this mlflow class would be a parent class that
    # wraps every dataset (i.e. it would replace AbstractVersionedDataSet).
    # Since we cannot modify the core package, we instead create a subclass
    # which inherits dynamically from the data_set class.
    class MlflowArtifactDataSetChildren(data_set):
        def __init__(self, run_id, artifact_path):
            super().__init__(**data_set_args)
            self.run_id = run_id
            self.artifact_path = artifact_path

        def _save(self, data: Any):
            # _get_save_path needs to be called before super()._save();
            # otherwise it raises an exception because a file already exists
            # at the path.
            local_path = (
                self._get_save_path() if hasattr(self, "_version") else self._filepath
            )
            # it must be converted to a string with as_posix()
            # for logging on remote storage like Azure S3
            local_path = local_path.as_posix()

            super()._save(data)
            if self.run_id:
                # if a run id is specified, we have to use the mlflow client
                # to avoid potential conflicts with an already active run
                mlflow_client = MlflowClient()
                mlflow_client.log_artifact(
                    run_id=self.run_id,
                    local_path=local_path,
                    artifact_path=self.artifact_path,
                )
            else:
                mlflow.log_artifact(local_path, self.artifact_path)

    # rename the class
    parent_name = data_set.__name__
    MlflowArtifactDataSetChildren.__name__ = f"Mlflow{parent_name}"
    MlflowArtifactDataSetChildren.__qualname__ = f"{parent_name}.Mlflow{parent_name}"

    mlflow_dataset_instance = MlflowArtifactDataSetChildren(
        run_id=run_id, artifact_path=artifact_path
    )
    return mlflow_dataset_instance
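# --- Illustrative sketch (not part of the original source) ---
# Minimal, self-contained illustration of the "fake inheritance" trick above:
# a factory builds a subclass at runtime from whatever dataset class it is
# given and hooks extra behaviour into _save. DummyDataSet and make_wrapped
# are hypothetical names used only for this sketch.
def make_wrapped(base_cls, **base_kwargs):
    class Wrapped(base_cls):
        def _save(self, data):
            super()._save(data)
            print("artifact would be logged to mlflow here")

    # rename the class, as done above
    Wrapped.__name__ = f"Mlflow{base_cls.__name__}"
    return Wrapped(**base_kwargs)


class DummyDataSet:
    def __init__(self, filepath):
        self._filepath = filepath

    def _save(self, data):
        print(f"saving to {self._filepath}")


wrapped = make_wrapped(DummyDataSet, filepath="data.csv")
wrapped._save("payload")  # prints both messages; type(wrapped).__name__ == "MlflowDummyDataSet"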
def __new__(
    cls,
    data_set: Union[str, Dict],
    run_id: str = None,
    artifact_path: str = None,
    credentials: Dict[str, Any] = None,
):
    data_set, data_set_args = parse_dataset_definition(config=data_set)

    # fake inheritance: ideally this mlflow class would be a parent class that
    # wraps every dataset (i.e. it would replace AbstractVersionedDataSet).
    # Since we cannot modify the core package, we instead create a subclass
    # which inherits dynamically from the data_set class.
    class MlflowDataSetChildren(data_set):
        def __init__(self, run_id, artifact_path):
            super().__init__(**data_set_args)
            self.run_id = run_id
            self.artifact_path = artifact_path

        def _save(self, data: Any):
            super()._save(data)
            if self.run_id:
                # if a run id is specified, we have to use the mlflow client
                # to avoid potential conflicts with an already active run
                mlflow_client = MlflowClient()
                mlflow_client.log_artifact(
                    run_id=self.run_id,
                    local_path=self._filepath,
                    artifact_path=self.artifact_path,
                )
            else:
                mlflow.log_artifact(self._filepath, self.artifact_path)

    # rename the class
    parent_name = data_set.__name__
    MlflowDataSetChildren.__name__ = f"Mlflow{parent_name}"
    MlflowDataSetChildren.__qualname__ = f"{parent_name}.Mlflow{parent_name}"

    mlflow_dataset_instance = MlflowDataSetChildren(
        run_id=run_id, artifact_path=artifact_path
    )
    return mlflow_dataset_instance
def __init__(
    self,
    filepath: str,
    zipped_filename: str = None,
    zipped_filename_suffix: str = None,
    ignored_prefixes: str = None,
    ignored_suffixes: str = None,
    credentials: Dict[str, str] = None,
    dataset: Optional[Union[str, Type[AbstractDataSet], Dict[str, Any]]] = None,
    filepath_arg: str = "filepath",
):
    if dataset is None:
        dataset = ZipFileDataSet.DEFAULT_DATASET
    dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
    self._dataset_type, self._dataset_config = parse_dataset_definition(dataset)
    if VERSION_KEY in self._dataset_config:
        raise DataSetError(
            "`{}` does not support versioning of the underlying dataset. "
            "Please remove `{}` flag from the dataset definition.".format(
                self.__class__.__name__, VERSIONED_FLAG_KEY
            )
        )

    self._filepath_arg = filepath_arg
    if self._filepath_arg in self._dataset_config:
        warn(
            "`{}` key must not be specified in the dataset definition as it "
            "will be overwritten by partition path".format(self._filepath_arg)
        )

    self._filepath = filepath
    self._zipped_filename = zipped_filename
    self._zipped_filename_suffix = zipped_filename_suffix
    self._ignored_prefixes = ignored_prefixes or ["_", "."]
    self._ignored_suffixes = (ignored_suffixes or []) + ["/"]
    credentials = credentials or {}
    self._password = credentials.get("password", credentials.get("pwd"))
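# --- Illustrative sketch (not part of the original source) ---
# Hypothetical construction of the dataset whose __init__ appears above, based
# only on that signature (ZipFileDataSet is assumed to be the enclosing,
# project-specific class; the paths and password are made up). The underlying
# dataset is passed as a dict so parse_dataset_definition can resolve it.
zipped = ZipFileDataSet(
    filepath="data/01_raw/archive.zip",
    zipped_filename="table.csv",
    dataset={"type": "pandas.CSVDataSet", "load_args": {"sep": ";"}},
    credentials={"password": "***"},
)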
def __init__(  # pylint: disable=too-many-arguments
    self,
    path: str,
    dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]],
    filepath_arg: str = "filepath",
    filename_suffix: str = "",
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
    fs_args: Dict[str, Any] = None,
):
    """Creates a new instance of ``PartitionedDataSet``.

    Args:
        path: Path to the folder containing partitioned data.
            If path starts with the protocol (e.g., ``s3://``) then the
            corresponding ``fsspec`` concrete filesystem implementation will
            be used. If protocol is not specified,
            ``fsspec.implementations.local.LocalFileSystem`` will be used.
            **Note:** Some concrete implementations are bundled with
            ``fsspec``, while others (like ``s3`` or ``gcs``) must be
            installed separately prior to usage of the ``PartitionedDataSet``.
        dataset: Underlying dataset definition. This is used to instantiate
            the dataset for each file located inside the ``path``.
            Accepted formats are:
            a) object of a class that inherits from ``AbstractDataSet``
            b) a string representing a fully qualified class name to such class
            c) a dictionary with ``type`` key pointing to a string from b),
            other keys are passed to the Dataset initializer.
            Credentials for the dataset can be explicitly specified in
            this configuration.
        filepath_arg: Underlying dataset initializer argument that will
            contain a path to each corresponding partition file.
            If unspecified, defaults to "filepath".
        filename_suffix: If specified, only partitions that end with this
            string will be processed.
        credentials: Protocol-specific options that will be passed to
            ``fsspec.filesystem``
            https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem
            and the dataset initializer. If the dataset config contains
            explicit credentials spec, then such spec will take precedence.
            All possible credentials management scenarios are documented here:
            https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials
        load_args: Keyword arguments to be passed into ``find()`` method of
            the filesystem implementation.
        fs_args: Extra arguments to pass into underlying filesystem class
            constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``)

    Raises:
        DataSetError: If versioning is enabled for the underlying dataset.
    """
    # pylint: disable=import-outside-toplevel
    from fsspec.utils import infer_storage_options  # for performance reasons

    super().__init__()

    self._path = path
    self._filename_suffix = filename_suffix
    self._protocol = infer_storage_options(self._path)["protocol"]
    self._partition_cache = Cache(maxsize=1)

    dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
    self._dataset_type, self._dataset_config = parse_dataset_definition(dataset)
    if VERSION_KEY in self._dataset_config:
        raise DataSetError(
            "`{}` does not support versioning of the underlying dataset. "
            "Please remove `{}` flag from the dataset definition.".format(
                self.__class__.__name__, VERSIONED_FLAG_KEY
            )
        )

    if credentials:
        if CREDENTIALS_KEY in self._dataset_config:
            self._logger.warning(
                KEY_PROPAGATION_WARNING,
                {"keys": CREDENTIALS_KEY, "target": "underlying dataset"},
            )
        else:
            self._dataset_config[CREDENTIALS_KEY] = deepcopy(credentials)

    self._credentials = deepcopy(credentials) or {}

    self._fs_args = deepcopy(fs_args) or {}
    if self._fs_args:
        if "fs_args" in self._dataset_config:
            self._logger.warning(
                KEY_PROPAGATION_WARNING,
                {"keys": "filesystem arguments", "target": "underlying dataset"},
            )
        else:
            self._dataset_config["fs_args"] = deepcopy(self._fs_args)

    self._filepath_arg = filepath_arg
    if self._filepath_arg in self._dataset_config:
        warn(
            "`{}` key must not be specified in the dataset definition as it "
            "will be overwritten by partition path".format(self._filepath_arg)
        )

    self._load_args = deepcopy(load_args) or {}
    self._sep = self._filesystem.sep
    # since some filesystem implementations may implement a global cache
    self._invalidate_caches()
def _checkpoint(self) -> AbstractDataSet:
    type_, kwargs = parse_dataset_definition(self._checkpoint_config)
    return type_(**kwargs)  # type: ignore
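# --- Illustrative sketch (not part of the original source) ---
# What the helper above does with a checkpoint config: parse_dataset_definition
# splits a config dict into the resolved dataset class and its remaining kwargs,
# which can then be instantiated directly (config values are made up).
from kedro.io.core import parse_dataset_definition

checkpoint_config = {"type": "text.TextDataSet", "filepath": "data/CHECKPOINT"}
type_, kwargs = parse_dataset_definition(checkpoint_config)
checkpoint_dataset = type_(**kwargs)  # e.g. TextDataSet(filepath="data/CHECKPOINT")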
def __new__(
    cls,
    data_set: Union[str, Dict],
    run_id: str = None,
    artifact_path: str = None,
    credentials: Dict[str, Any] = None,
):
    data_set, data_set_args = parse_dataset_definition(config=data_set)

    # fake inheritance: ideally this mlflow class would be a parent class that
    # wraps every dataset (i.e. it would replace AbstractVersionedDataSet).
    # Since we cannot modify the core package, we instead create a subclass
    # which inherits dynamically from the data_set class.
    class MlflowArtifactDataSetChildren(data_set):
        def __init__(self, run_id, artifact_path):
            super().__init__(**data_set_args)
            self.run_id = run_id
            self.artifact_path = artifact_path
            self._logging_activated = True

        @property
        def _logging_activated(self):
            return self.__logging_activated

        @_logging_activated.setter
        def _logging_activated(self, flag):
            if not isinstance(flag, bool):
                raise ValueError(
                    f"_logging_activated must be a boolean, got {type(flag)}"
                )
            self.__logging_activated = flag

        def _save(self, data: Any):
            # _get_save_path needs to be called before super()._save();
            # otherwise it raises an exception because a file already exists
            # at the path.
            if hasattr(self, "_version"):
                # all kedro datasets inherit from AbstractVersionedDataSet
                local_path = self._get_save_path()
            elif hasattr(self, "_filepath"):
                # in case custom datasets inherit from AbstractDataSet without versioning
                local_path = self._filepath  # pragma: no cover
            elif hasattr(self, "_path"):
                # special datasets with a folder instead of a specific file, like PartitionedDataSet
                local_path = Path(self._path)

            # it must be converted to a string with as_posix()
            # for logging on remote storage like Azure S3
            local_path = local_path.as_posix()

            super()._save(data)

            if self._logging_activated:
                if self.run_id:
                    # if a run id is specified, we have to use the mlflow client
                    # to avoid potential conflicts with an already active run
                    mlflow_client = MlflowClient()
                    mlflow_client.log_artifact(
                        run_id=self.run_id,
                        local_path=local_path,
                        artifact_path=self.artifact_path,
                    )
                else:
                    mlflow.log_artifact(local_path, self.artifact_path)

        def _load(self) -> Any:  # pragma: no cover
            if self.run_id:
                # if no run_id is specified, we take the artifact from the local
                # path rather than the active run: there is a good chance it has
                # not been saved yet!
                mlflow_client = MlflowClient()

                if hasattr(self, "_version"):
                    # all kedro datasets inherit from AbstractVersionedDataSet
                    local_path = self._get_load_path()
                elif hasattr(self, "_filepath"):
                    # in case custom datasets inherit from AbstractDataSet without versioning
                    local_path = self._filepath  # pragma: no cover
                elif hasattr(self, "_path"):
                    # special datasets with a folder instead of a specific file, like PartitionedDataSet
                    local_path = Path(self._path)

                artifact_path = (
                    (self.artifact_path / local_path.name).as_posix()
                    if self.artifact_path
                    else local_path.name
                )

                mlflow_client.download_artifacts(
                    run_id=self.run_id,
                    path=artifact_path,
                    dst_path=local_path.parent.as_posix(),  # must be a **local** **directory**
                )

            # finally, read locally
            return super()._load()

    # rename the class
    parent_name = data_set.__name__
    MlflowArtifactDataSetChildren.__name__ = f"Mlflow{parent_name}"
    MlflowArtifactDataSetChildren.__qualname__ = f"{parent_name}.Mlflow{parent_name}"

    mlflow_dataset_instance = MlflowArtifactDataSetChildren(
        run_id=run_id, artifact_path=artifact_path
    )
    return mlflow_dataset_instance
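# --- Illustrative sketch (not part of the original source) ---
# How the factory above is typically used. The enclosing class name
# (MlflowArtifactDataSet) and the import path are assumptions: the snippet only
# shows __new__, and the import location differs across kedro-mlflow versions.
import mlflow
import pandas as pd
from kedro_mlflow.io import MlflowArtifactDataSet  # assumed import path

df = pd.DataFrame({"a": [1, 2, 3]})
csv_artifact = MlflowArtifactDataSet(
    data_set={"type": "pandas.CSVDataSet", "filepath": "data/02_intermediate/out.csv"},
    artifact_path="reporting",
)
with mlflow.start_run():
    csv_artifact.save(df)  # saves the csv locally, then logs it as a run artifact
reloaded = csv_artifact.load()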
def __init__(  # pylint: disable=too-many-arguments
    self,
    path: str,
    dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]],
    filepath_arg: str = "filepath",
    filename_suffix: str = "",
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
):
    """Creates a new instance of ``PartitionedDataSet``.

    Args:
        path: Path to the folder containing partitioned data.
            If path starts with the protocol (e.g., ``s3://``) then the
            corresponding ``fsspec`` concrete filesystem implementation will
            be used. If protocol is not specified,
            ``fsspec.implementations.local.LocalFileSystem`` will be used.
            **Note:** Some concrete implementations are bundled with
            ``fsspec``, while others (like ``s3`` or ``gcs``) must be
            installed separately prior to usage of the ``PartitionedDataSet``.
        dataset: Underlying dataset definition. This is used to instantiate
            the dataset for each file located inside the ``path``.
            Accepted formats are:
            a) object of a class that inherits from ``AbstractDataSet``
            b) a string representing a fully qualified class name to such class
            c) a dictionary with ``type`` key pointing to a string from b),
            other keys are passed to the Dataset initializer.
            **Note:** Credentials resolution is *not* currently supported
            for the underlying dataset definition.
        filepath_arg: Underlying dataset initializer argument that will
            contain a path to each corresponding partition file.
            If unspecified, defaults to "filepath".
        filename_suffix: If specified, only partitions that end with this
            string will be processed.
        credentials: Protocol-specific options that will be passed to
            ``fsspec.filesystem`` call:
            https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem
            _and_ also to the underlying dataset initializer. If
            ``dataset_credentials`` key is present in this dictionary, then
            only its value will be passed to the dataset initializer
            ``credentials`` argument instead of the copy of the entire dictionary.
            Example 1: If ``credentials = {"k1": "secret1"}``, then
            filesystem is called as ``filesystem(..., k1="secret1")``, the
            dataset is instantiated as
            ``dataset_class(..., credentials={"k1": "secret1"})``.
            Example 2: If
            ``credentials = {"k1": "secret1", "dataset_credentials": {"k2": "secret2"}}``,
            then filesystem is called as ``filesystem(..., k1="secret1")``,
            the dataset is instantiated as
            ``dataset_class(..., credentials={"k2": "secret2"})``.
            Example 3: If ``credentials = {"dataset_credentials": {"k2": "secret2"}}``,
            then credentials are not passed to the filesystem call, the
            dataset is instantiated as
            ``dataset_class(..., credentials={"k2": "secret2"})``.
            Example 4: If
            ``credentials = {"k1": "secret1", "dataset_credentials": None}``,
            then filesystem is called as ``filesystem(..., k1="secret1")``,
            credentials are not passed to the dataset initializer.
        load_args: Keyword arguments to be passed into ``find()`` method of
            the filesystem implementation.

    Raises:
        DataSetError: If versioning is enabled for the underlying dataset.
    """
    super().__init__()

    self._path = path
    self._filename_suffix = filename_suffix
    self._protocol = infer_storage_options(self._path)["protocol"]

    dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
    self._dataset_type, self._dataset_config = parse_dataset_definition(dataset)
    if VERSION_KEY in self._dataset_config:
        raise DataSetError(
            "`{}` does not support versioning of the underlying dataset. "
            "Please remove `{}` flag from the dataset definition.".format(
                self.__class__.__name__, VERSIONED_FLAG_KEY
            )
        )

    if CREDENTIALS_KEY in self._dataset_config:
        raise DataSetError(
            "Credentials for the underlying dataset must not be specified "
            "explicitly in dataset configuration. Please put those under "
            "`dataset_credentials` key in a dictionary and pass as "
            "`credentials` argument to {} initializer.".format(
                self.__class__.__name__
            )
        )

    self._credentials, dataset_credentials = _split_credentials(credentials)
    if dataset_credentials:
        self._dataset_config[CREDENTIALS_KEY] = dataset_credentials

    self._filepath_arg = filepath_arg
    if self._filepath_arg in self._dataset_config:
        warn(
            "`{}` key must not be specified in the dataset definition as it "
            "will be overwritten by partition path".format(self._filepath_arg)
        )

    self._load_args = deepcopy(load_args) or {}
    self._sep = self._filesystem.sep
    # since some filesystem implementations may implement a global cache
    self.invalidate_cache()