def __new__(
        cls,
        data_set: Union[str, Dict],
        run_id: str = None,
        artifact_path: str = None,
        credentials: Dict[str, Any] = None,
    ):

        data_set, data_set_args = parse_dataset_definition(config=data_set)

        # fake inheritance: this mlflow class should be a parent class which wraps
        # all datasets (i.e. it should replace AbstractVersionedDataSet instead),
        # but since we can't modify the core package,
        # we create a subclass which inherits dynamically from the data_set class
        class MlflowArtifactDataSetChildren(data_set):
            def __init__(self, run_id, artifact_path):
                super().__init__(**data_set_args)
                self.run_id = run_id
                self.artifact_path = artifact_path

            def _save(self, data: Any):
                # _get_save_path needs to be called before super, otherwise
                # it will throw an exception that the file under the path already exists.
                local_path = (
                    self._get_save_path() if hasattr(self, "_version") else self._filepath
                )
                # it must be converted to a string with as_posix()
                # for logging on remote storage like Azure Blob Storage or S3
                local_path = local_path.as_posix()

                super()._save(data)
                if self.run_id:
                    # if a run id is specified, we have to use mlflow client
                    # to avoid potential conflicts with an already active run
                    mlflow_client = MlflowClient()
                    mlflow_client.log_artifact(
                        run_id=self.run_id,
                        local_path=local_path,
                        artifact_path=self.artifact_path,
                    )
                else:
                    mlflow.log_artifact(local_path, self.artifact_path)

        # rename the class
        parent_name = data_set.__name__
        MlflowArtifactDataSetChildren.__name__ = f"Mlflow{parent_name}"
        MlflowArtifactDataSetChildren.__qualname__ = (
            f"{parent_name}.Mlflow{parent_name}")

        mlflow_dataset_instance = MlflowArtifactDataSetChildren(
            run_id=run_id, artifact_path=artifact_path)
        return mlflow_dataset_instance
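A minimal usage sketch for the dynamic wrapper built above. Only the __new__ body is shown in this snippet, so the public class name and import path below are assumptions (in kedro-mlflow the factory is exposed as MlflowArtifactDataSet, whose exact module may vary by version); the wrapped pandas.CSVDataSet config is a placeholder.

# Hypothetical usage of the wrapper defined above; class name, import path
# and the underlying dataset config are assumptions / placeholders.
from kedro_mlflow.io import MlflowArtifactDataSet

dataset = MlflowArtifactDataSet(
    data_set={"type": "pandas.CSVDataSet", "filepath": "data/08_reporting/report.csv"},
    artifact_path="reporting",
)
# `dataset` is an instance of the dynamically created MlflowCSVDataSet subclass:
# dataset.save(df) writes the CSV locally, then logs it to the active MLflow run
# (or, when a run_id is given, logs it through MlflowClient against that run).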
    def __new__(
        cls,
        data_set: Union[str, Dict],
        run_id: str = None,
        artifact_path: str = None,
        credentials: Dict[str, Any] = None,
    ):

        data_set, data_set_args = parse_dataset_definition(config=data_set)

        # fake inheritance: this mlflow class should be a parent class which wraps
        # all datasets (i.e. it should replace AbstractVersionedDataSet instead),
        # but since we can't modify the core package,
        # we create a subclass which inherits dynamically from the data_set class
        class MlflowDataSetChildren(data_set):
            def __init__(self, run_id, artifact_path):
                super().__init__(**data_set_args)
                self.run_id = run_id
                self.artifact_path = artifact_path

            def _save(self, data: Any):
                super()._save(data)
                if self.run_id:
                    # if a run id is specified, we have to use mlflow client
                    # to avoid potential conflicts with an already active run
                    mlflow_client = MlflowClient()
                    mlflow_client.log_artifact(
                        run_id=self.run_id,
                        local_path=self._filepath,
                        artifact_path=self.artifact_path,
                    )
                else:
                    mlflow.log_artifact(self._filepath, self.artifact_path)

        # rename the class
        parent_name = data_set.__name__
        MlflowDataSetChildren.__name__ = f"Mlflow{parent_name}"
        MlflowDataSetChildren.__qualname__ = f"{parent_name}.Mlflow{parent_name}"

        mlflow_dataset_instance = MlflowDataSetChildren(
            run_id=run_id, artifact_path=artifact_path)
        return mlflow_dataset_instance
    def __init__(
        self,
        filepath: str,
        zipped_filename: str = None,
        zipped_filename_suffix: str = None,
        ignored_prefixes: str = None,
        ignored_suffixes: str = None,
        credentials: Dict[str, str] = None,
        dataset: Optional[Union[str, Type[AbstractDataSet], Dict[str, Any]]] = None,
        filepath_arg: str = 'filepath',
    ):

        if dataset is None:
            dataset = ZipFileDataSet.DEFAULT_DATASET

        dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
        self._dataset_type, self._dataset_config = parse_dataset_definition(
            dataset)
        if VERSION_KEY in self._dataset_config:
            raise DataSetError(
                "`{}` does not support versioning of the underlying dataset. "
                "Please remove `{}` flag from the dataset definition.".format(
                    self.__class__.__name__, VERSIONED_FLAG_KEY))

        self._filepath_arg = filepath_arg
        if self._filepath_arg in self._dataset_config:
            warn(
                "`{}` key must not be specified in the dataset definition as it "
                "will be overwritten by partition path".format(
                    self._filepath_arg))

        self._filepath = filepath
        self._zipped_filename = zipped_filename
        self._zipped_filename_suffix = zipped_filename_suffix
        self._ignored_prefixes = ignored_prefixes or ['_', '.']
        self._ignored_suffixes = (ignored_suffixes or []) + ['/']
        credentials = credentials or {}
        self._password = credentials.get('password', credentials.get('pwd'))
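A hedged instantiation sketch for the ZipFileDataSet constructor above; the file paths, the wrapped dataset config and the password are placeholders, and the class is assumed to be importable from wherever this snippet lives.

# Hypothetical instantiation of the ZipFileDataSet defined above
# (all values are placeholders).
zipped = ZipFileDataSet(
    filepath="data/01_raw/archive.zip",
    zipped_filename="report.csv",
    dataset={"type": "pandas.CSVDataSet", "load_args": {"sep": ";"}},
    credentials={"password": "change-me"},  # read via credentials['password'] or credentials['pwd']
)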
Example #4
    def __init__(  # pylint: disable=too-many-arguments
        self,
        path: str,
        dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]],
        filepath_arg: str = "filepath",
        filename_suffix: str = "",
        credentials: Dict[str, Any] = None,
        load_args: Dict[str, Any] = None,
        fs_args: Dict[str, Any] = None,
    ):
        """Creates a new instance of ``PartitionedDataSet``.

        Args:
            path: Path to the folder containing partitioned data.
                If path starts with the protocol (e.g., ``s3://``) then the
                corresponding ``fsspec`` concrete filesystem implementation will
                be used. If protocol is not specified,
                ``fsspec.implementations.local.LocalFileSystem`` will be used.
                **Note:** Some concrete implementations are bundled with ``fsspec``,
                while others (like ``s3`` or ``gcs``) must be installed separately
                prior to usage of the ``PartitionedDataSet``.
            dataset: Underlying dataset definition. This is used to instantiate
                the dataset for each file located inside the ``path``.
                Accepted formats are:
                a) object of a class that inherits from ``AbstractDataSet``
                b) a string representing a fully qualified class name to such class
                c) a dictionary with ``type`` key pointing to a string from b),
                other keys are passed to the Dataset initializer.
                Credentials for the dataset can be explicitly specified in
                this configuration.
            filepath_arg: Underlying dataset initializer argument that will
                contain a path to each corresponding partition file.
                If unspecified, defaults to "filepath".
            filename_suffix: If specified, only partitions that end with this
                string will be processed.
            credentials: Protocol-specific options that will be passed to
                ``fsspec.filesystem``
                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem
                and the dataset initializer. If the dataset config contains
                explicit credentials spec, then such spec will take precedence.
                All possible credentials management scenarios are documented here:
                https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials
            load_args: Keyword arguments to be passed into ``find()`` method of
                the filesystem implementation.
            fs_args: Extra arguments to pass into underlying filesystem class constructor
                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``)

        Raises:
            DataSetError: If versioning is enabled for the underlying dataset.
        """
        # pylint: disable=import-outside-toplevel
        from fsspec.utils import infer_storage_options  # for performance reasons

        super().__init__()

        self._path = path
        self._filename_suffix = filename_suffix
        self._protocol = infer_storage_options(self._path)["protocol"]
        self._partition_cache = Cache(maxsize=1)

        dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
        self._dataset_type, self._dataset_config = parse_dataset_definition(
            dataset)
        if VERSION_KEY in self._dataset_config:
            raise DataSetError(
                "`{}` does not support versioning of the underlying dataset. "
                "Please remove `{}` flag from the dataset definition.".format(
                    self.__class__.__name__, VERSIONED_FLAG_KEY))

        if credentials:
            if CREDENTIALS_KEY in self._dataset_config:
                self._logger.warning(
                    KEY_PROPAGATION_WARNING,
                    {
                        "keys": CREDENTIALS_KEY,
                        "target": "underlying dataset"
                    },
                )
            else:
                self._dataset_config[CREDENTIALS_KEY] = deepcopy(credentials)

        self._credentials = deepcopy(credentials) or {}

        self._fs_args = deepcopy(fs_args) or {}
        if self._fs_args:
            if "fs_args" in self._dataset_config:
                self._logger.warning(
                    KEY_PROPAGATION_WARNING,
                    {
                        "keys": "filesystem arguments",
                        "target": "underlying dataset"
                    },
                )
            else:
                self._dataset_config["fs_args"] = deepcopy(self._fs_args)

        self._filepath_arg = filepath_arg
        if self._filepath_arg in self._dataset_config:
            warn(
                "`{}` key must not be specified in the dataset definition as it "
                "will be overwritten by partition path".format(
                    self._filepath_arg))

        self._load_args = deepcopy(load_args) or {}
        self._sep = self._filesystem.sep
        # since some filesystem implementations may implement a global cache
        self._invalidate_caches()
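A short usage example matching the docstring above; the bucket name, credentials and dataset type are placeholders.

# Standalone usage of PartitionedDataSet as documented above
# (bucket name and credentials are placeholders).
from kedro.io import PartitionedDataSet

partitions = PartitionedDataSet(
    path="s3://my-bucket/reviews",  # protocol inferred by fsspec
    dataset={"type": "pandas.CSVDataSet", "load_args": {"sep": ","}},
    filename_suffix=".csv",         # only *.csv partitions are picked up
    credentials={"key": "my-key", "secret": "my-secret"},  # passed to fsspec and, absent an explicit spec, to the dataset
)
partition_loaders = partitions.load()  # dict: partition id -> callable returning the data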
Example #5
    def _checkpoint(self) -> AbstractDataSet:
        type_, kwargs = parse_dataset_definition(self._checkpoint_config)
        return type_(**kwargs)  # type: ignore
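For context, a minimal sketch of what parse_dataset_definition returns for a typical checkpoint config; the config values are illustrative only.

# Illustration of the parse_dataset_definition call used above
# (config values are made up).
from kedro.io.core import parse_dataset_definition

type_, kwargs = parse_dataset_definition(
    {"type": "text.TextDataSet", "filepath": "data/CHECKPOINT"}
)
# type_  -> the resolved TextDataSet class
# kwargs -> {"filepath": "data/CHECKPOINT"}
checkpoint = type_(**kwargs)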
    def __new__(
        cls,
        data_set: Union[str, Dict],
        run_id: str = None,
        artifact_path: str = None,
        credentials: Dict[str, Any] = None,
    ):

        data_set, data_set_args = parse_dataset_definition(config=data_set)

        # fake inheritance: this mlflow class should be a parent class which wraps
        # all datasets (i.e. it should replace AbstractVersionedDataSet instead),
        # but since we can't modify the core package,
        # we create a subclass which inherits dynamically from the data_set class
        class MlflowArtifactDataSetChildren(data_set):
            def __init__(self, run_id, artifact_path):
                super().__init__(**data_set_args)
                self.run_id = run_id
                self.artifact_path = artifact_path
                self._logging_activated = True

            @property
            def _logging_activated(self):
                return self.__logging_activated

            @_logging_activated.setter
            def _logging_activated(self, flag):
                if not isinstance(flag, bool):
                    raise ValueError(
                        f"_logging_activated must be a boolean, got {type(flag)}"
                    )
                self.__logging_activated = flag

            def _save(self, data: Any):

                # _get_save_path needs to be called before super, otherwise
                # it will throw an exception that the file under the path already exists.
                if hasattr(self, "_version"):
                    # all kedro datasets inherit from AbstractVersionedDataSet
                    local_path = self._get_save_path()
                elif hasattr(self, "_filepath"):
                    # in case custom datasets inherit from AbstractDataSet without versioning
                    local_path = self._filepath  # pragma: no cover
                elif hasattr(self, "_path"):
                    # special datasets with a folder instead of a specific file, like PartitionedDataSet
                    local_path = Path(self._path)

                # it must be converted to a string with as_posix()
                # for logging on remote storage like Azure Blob Storage or S3
                local_path = local_path.as_posix()

                super()._save(data)

                if self._logging_activated:
                    if self.run_id:
                        # if a run id is specified, we have to use mlflow client
                        # to avoid potential conflicts with an already active run
                        mlflow_client = MlflowClient()
                        mlflow_client.log_artifact(
                            run_id=self.run_id,
                            local_path=local_path,
                            artifact_path=self.artifact_path,
                        )
                    else:
                        mlflow.log_artifact(local_path, self.artifact_path)

            def _load(self) -> Any:  # pragma: no cover
                if self.run_id:
                    # if no run_id is specified, we take the artifact from the local path rather than the active run:
                    # chances are it has not been saved there yet!

                    mlflow_client = MlflowClient()

                    if hasattr(self, "_version"):
                        # all kedro datasets inherit from AbstractVersionedDataSet
                        local_path = self._get_load_path()
                    elif hasattr(self, "_filepath"):
                        # in case custom datasets inherit from AbstractDataSet without versioning
                        local_path = self._filepath  # pragma: no cover
                    elif hasattr(self, "_path"):
                        # special datasets with a folder instead of a specific file, like PartitionedDataSet
                        local_path = Path(self._path)

                    artifact_path = (
                        (Path(self.artifact_path) / local_path.name).as_posix()
                        if self.artifact_path
                        else local_path.name
                    )

                    mlflow_client.download_artifacts(
                        run_id=self.run_id,
                        path=artifact_path,
                        dst_path=local_path.parent.as_posix(),  # must be a **local** **directory**
                    )

                # finally, read locally
                return super()._load()

        # rename the class
        parent_name = data_set.__name__
        MlflowArtifactDataSetChildren.__name__ = f"Mlflow{parent_name}"
        MlflowArtifactDataSetChildren.__qualname__ = (
            f"{parent_name}.Mlflow{parent_name}")

        mlflow_dataset_instance = MlflowArtifactDataSetChildren(
            run_id=run_id, artifact_path=artifact_path)
        return mlflow_dataset_instance
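A small sketch of how the _logging_activated switch defined above can be used, for instance to save a dataset without sending the artifact to MLflow; `dataset` and `my_data` are placeholders for an instance produced by the __new__ above and its payload.

# Temporarily disable MLflow artifact logging on a wrapped dataset instance
# (placeholder names; `dataset` is assumed to come from the __new__ above).
dataset._logging_activated = False   # the setter enforces a boolean
dataset.save(my_data)                # saved locally only, nothing is logged
dataset._logging_activated = True    # later saves are logged again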
Example #7
    def __init__(  # pylint: disable=too-many-arguments
        self,
        path: str,
        dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]],
        filepath_arg: str = "filepath",
        filename_suffix: str = "",
        credentials: Dict[str, Any] = None,
        load_args: Dict[str, Any] = None,
    ):
        """Creates a new instance of ``PartitionedDataSet``.

        Args:
            path: Path to the folder containing partitioned data.
                If path starts with the protocol (e.g., ``s3://``) then the
                corresponding ``fsspec`` concrete filesystem implementation will
                be used. If protocol is not specified,
                ``fsspec.implementations.local.LocalFileSystem`` will be used.
                **Note:** Some concrete implementations are bundled with ``fsspec``,
                while others (like ``s3`` or ``gcs``) must be installed separately
                prior to usage of the ``PartitionedDataSet``.
            dataset: Underlying dataset definition. This is used to instantiate
                the dataset for each file located inside the ``path``.
                Accepted formats are:
                a) object of a class that inherits from ``AbstractDataSet``
                b) a string representing a fully qualified class name to such class
                c) a dictionary with ``type`` key pointing to a string from b),
                other keys are passed to the Dataset initializer.
                **Note:** Credentials resolution is *not* currently supported
                for the underlying dataset definition.
            filepath_arg: Underlying dataset initializer argument that will
                contain a path to each corresponding partition file.
                If unspecified, defaults to "filepath".
            filename_suffix: If specified, only partitions that end with this
                string will be processed.
            credentials: Protocol-specific options that will be passed to
                ``fsspec.filesystem`` call:
                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem
                _and_ also to the underlying dataset initializer. If
                ``dataset_credentials`` key is present in this dictionary, then
                only its value will be passed to the dataset initializer ``credentials``
                argument instead of the copy of the entire dictionary.

                Example 1: If ``credentials = {"k1": "secret1"}``, then filesystem
                    is called as ``filesystem(..., k1="secret1")``, the dataset is
                    instantiated as
                    ``dataset_class(..., credentials={"k1": "secret1"})``.
                Example 2: If
                    ``credentials = {"k1": "secret1", "dataset_credentials": {"k2": "secret2"}}``,
                    then filesystem is called as ``filesystem(..., k1="secret1")``,
                    the dataset is instantiated as
                    ``dataset_class(..., credentials={"k2": "secret2"})``.
                Example 3: If
                    ``credentials = {"dataset_credentials": {"k2": "secret2"}}``,
                    then credentials are not passed to the filesystem call, the dataset
                    is instantiated as
                    ``dataset_class(..., credentials={"k2": "secret2"})``.
                Example 4: If
                    ``credentials = {"k1": "secret1", "dataset_credentials": None}``,
                    then filesystem is called as ``filesystem(..., k1="secret1")``,
                    credentials are not passed to the dataset initializer.

            load_args: Keyword arguments to be passed into ``find()`` method of
                the filesystem implementation.

        Raises:
            DataSetError: If versioning is enabled for the underlying dataset.
        """
        super().__init__()

        self._path = path
        self._filename_suffix = filename_suffix
        self._protocol = infer_storage_options(self._path)["protocol"]

        dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
        self._dataset_type, self._dataset_config = parse_dataset_definition(dataset)
        if VERSION_KEY in self._dataset_config:
            raise DataSetError(
                "`{}` does not support versioning of the underlying dataset. "
                "Please remove `{}` flag from the dataset definition.".format(
                    self.__class__.__name__, VERSIONED_FLAG_KEY
                )
            )

        if CREDENTIALS_KEY in self._dataset_config:
            raise DataSetError(
                "Credentials for the underlying dataset must not be specified "
                "explicitly in dataset configuration. Please put those under "
                "`dataset_credentials` key in a dictionary and pass as "
                "`credentials` argument to {} initializer.".format(
                    self.__class__.__name__
                )
            )
        self._credentials, dataset_credentials = _split_credentials(credentials)
        if dataset_credentials:
            self._dataset_config[CREDENTIALS_KEY] = dataset_credentials

        self._filepath_arg = filepath_arg
        if self._filepath_arg in self._dataset_config:
            warn(
                "`{}` key must not be specified in the dataset definition as it "
                "will be overwritten by partition path".format(self._filepath_arg)
            )

        self._load_args = deepcopy(load_args) or {}
        self._sep = self._filesystem.sep
        # since some filesystem implementations may implement a global cache
        self.invalidate_cache()
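A minimal sketch of the credential split described in Examples 1-4 of the docstring above; the real _split_credentials helper used in this constructor is not shown here, so this is only an illustration of the documented behaviour.

# Illustrative re-implementation of the split documented above;
# the actual _split_credentials helper may differ in details.
from copy import deepcopy
from typing import Any, Dict, Optional, Tuple


def _split_credentials_sketch(
    credentials: Optional[Dict[str, Any]]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    credentials = deepcopy(credentials) or {}
    if "dataset_credentials" in credentials:
        # Examples 2-4: `dataset_credentials` goes to the dataset, the rest to fsspec
        dataset_credentials = credentials.pop("dataset_credentials") or {}
    else:
        # Example 1: the same dictionary is forwarded to both
        dataset_credentials = deepcopy(credentials)
    return credentials, dataset_credentials


# Example 2 from the docstring:
fs_creds, ds_creds = _split_credentials_sketch(
    {"k1": "secret1", "dataset_credentials": {"k2": "secret2"}}
)
# fs_creds -> {"k1": "secret1"}, ds_creds -> {"k2": "secret2"}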