Example #1
0
    def run_only_missing(
        self, pipeline: Pipeline, catalog: DataCatalog
    ) -> Dict[str, Any]:
        """Run just the nodes needed to (re)build missing outputs.

        Selects the catalog datasets that do not yet exist (plus any free
        outputs not registered in the catalog) and executes only the slice
        of ``pipeline`` required to produce them, together with upstream
        producers of any in-memory datasets that slice consumes.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.
        Raises:
            ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.

        """
        registered = set(catalog.list())
        unbound_outputs = pipeline.outputs() - registered
        nonexistent = {name for name in catalog.list() if not catalog.exists(name)}
        targets = unbound_outputs | nonexistent

        subpipeline = (
            pipeline.only_nodes_with_outputs(*targets)
            + pipeline.from_inputs(*targets)
        )

        # Also pull in every producer of unregistered (in-memory) datasets
        # that the selected nodes consume — including chains of them —
        # since those values cannot be loaded and must be recomputed.
        in_memory = pipeline.data_sets() - registered
        memory_producers = pipeline.only_nodes_with_outputs(*in_memory)
        consumed_memory = subpipeline.inputs() & in_memory
        subpipeline += memory_producers.to_outputs(*consumed_memory)

        return self.run(subpipeline, catalog)
Example #2
0
    def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
        """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
        and save results back to the same objects.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        Raises:
            ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.

        """
        # Work on a copy so the default datasets registered below do not
        # leak into the caller's catalog.
        catalog = catalog.shallow_copy()
        registered = set(catalog.list())

        missing_inputs = pipeline.inputs() - registered
        if missing_inputs:
            raise ValueError("Pipeline input(s) {} not found in the "
                             "DataCatalog".format(missing_inputs))

        unbound_outputs = pipeline.outputs() - registered

        # Every dataset the pipeline touches that the catalog does not know
        # about gets a default (in-memory) dataset so execution can proceed.
        for name in pipeline.data_sets() - registered:
            catalog.add(name, self.create_default_data_set(name))

        self._run(pipeline, catalog)

        self._logger.info("Pipeline execution completed successfully.")

        return {name: catalog.load(name) for name in unbound_outputs}
Example #3
0
    def _start_mlflow_run(self, run_params: Dict[str, Any],
                          pipeline: Pipeline):
        """Start and annotate an MLFlow run for a training pipeline.

        Logs basic information to MLFlow about ``pipeline`` if it is tagged
        with 'train' (creates a new MLFlow experiment and/or run named after
        the training pipeline if it doesn't exist yet).

        NOTE: If NNI is in dry-run mode (the mode used to generate an NNI
        Classic NAS search-space JSON file from a model containing NNI NAS
        Mutables ``LayerChoice`` and/or ``InputChoice``) we avoid creating any
        new MLFlow experiment/run and avoid logging anything to mlflow.

        Args:
            run_params: Kedro run parameters; this code reads the
                ``'tags'``, ``'pipeline_name'`` and ``'run_id'`` keys.
            pipeline: The Kedro ``Pipeline`` whose node tags and structure
                are logged to MLFlow.
        """
        # `set()` initializer keeps reduce from raising TypeError when the
        # pipeline has no nodes.
        node_tags = functools.reduce(set.union,
                                     [n.tags for n in pipeline.nodes], set())
        if not deepcv.meta.nni_tools.is_nni_gen_search_space_mode() and (
                'train' in run_params['tags'] or 'train' in node_tags):
            if mlflow.active_run() is None:
                # Create MLFlow run in an experiment named after pipeline involved in training and log various pipeline/datasets informations to mlflow. If we are running an NNI hp/nas search, mlflow experiment and run will be named after NNI experiment and trial ids for better consitency.
                # TODO: find another way to name experiment as pipeline name is only available when running `kedro run --pipeline=<pipeline_name>` (e.g. special tag to node after which experiment is named)

                if not deepcv.meta.nni_tools.is_nni_run_standalone(
                ):  # 'STANDALONE' is NNI default experiment ID if python process haven't been started by NNI
                    nni_experiment = nni.get_experiment_id()
                    mlflow.set_experiment(nni_experiment)
                    mlflow.start_run(run_name=nni.get_trial_id())
                    # Flag indicating whether we are using NNI HP or Classic NAS API (Hyperparameter and/or Classic Neural Architecture search using NNI)
                    mlflow.set_tag('nni_standalone_mode', False)
                    mlflow.set_tag('nni_experiment_id', nni_experiment)
                    mlflow.set_tag('nni_trial_id', nni.get_trial_id())
                    mlflow.set_tag('nni_sequence_id', nni.get_sequence_id())
                else:
                    pipeline_name = run_params['pipeline_name'].lower(
                    ) if run_params['pipeline_name'] else 'default'
                    mlflow.set_experiment(
                        f'{self.project_ctx.project_name.lower()}_{pipeline_name}'
                    )
                    mlflow.start_run(
                        run_name=
                        f'{pipeline_name.lower()}_run_{run_params["run_id"]}')
                    mlflow.set_tag('nni_standalone_mode', True)

            # Log basic informations about Kedro training pipeline to mlflow
            mlflow.set_tags({
                f'kedro_node_tag_{i}': tag
                for i, tag in enumerate(node_tags)
            })
            mlflow.log_params({n: v for n, v in run_params.items() if v})
            mlflow.log_param('pipeline.json', pipeline.to_json())
            mlflow.log_param('pipeline.describe', pipeline.describe())
            mlflow.log_param('pipeline.pipeline_datasets',
                             pipeline.data_sets())
            """ The following code creates special mlflow tags about current repository infos, which is not done by mlflow when starting an MLFlow run from code instead of from `mlflow run` command
            Code inspired from [`mlflow.projects._create_run`](https://www.mlflow.org/docs/latest/_modules/mlflow/projects.html) which doesn't seems to be called by `mlflow.start_run`
            """
            tags = {
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME:
                self.project_ctx.package_name,
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_TYPE:
                mlflow.entities.SourceType.to_string(
                    mlflow.entities.SourceType.PROJECT),
                mlflow.utils.mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT:
                inspect.getsourcefile(type(self.project_ctx))
            }
            try:
                repo = git.Repo(self.project_ctx.project_path,
                                search_parent_directories=True)
                git_repo_url = repo.remote(
                ).url if 'origin' in repo.remotes else (
                    repo.remotes[0].url if len(repo.remotes) > 0 else '')
                # Convert SSH git URL to http URL
                git_repo_url = re.sub(r'git@([.\w]+):', r'https://\1/',
                                      git_repo_url)
                # BUGFIX: `rstrip('.git')` strips any run of trailing '.', 'g',
                # 'i', 't' characters (e.g. 'repo.gitit' -> 'repo'); remove the
                # exact '.git' suffix instead.
                if git_repo_url.endswith('.git'):
                    git_repo_url = git_repo_url[:-len('.git')]
                mlflow.log_param(
                    'commit_url',
                    git_repo_url + f'/commit/{repo.head.commit.hexsha}/')

                # We also set MLFLOW_SOURCE_NAME to repo URL so that MLFlow web UI is able to parse it and render commit and source hyperlinks (MLFLow only supports github URLs for now)
                tags.update({
                    mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME:
                    git_repo_url
                    if git_repo_url else self.project_ctx.project_name,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_BRANCH:
                    repo.active_branch.name,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_REPO_URL:
                    git_repo_url,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_COMMIT:
                    repo.head.commit.hexsha
                })

                # Change mlflow user to be git repository user instead of system user (if any git user is specified)
                git_config_reader = repo.config_reader()
                git_config_reader.read()
                user = git_config_reader.get_value('user',
                                                   'name',
                                                   default=None)
                email = git_config_reader.get_value('user',
                                                    'email',
                                                    default=None)
                if user or email:
                    tags[mlflow.utils.mlflow_tags.MLFLOW_USER] = (
                        str(user) + (f' <{email}>' if email else '')
                    ) if user else str(email)
            except (ImportError, OSError, ValueError, IOError, KeyError,
                    git.GitError, configparser.Error) as e:
                logging.warning(
                    f'Failed to import Git or to get repository informations. Error: {e}'
                )

            mlflow.set_tags(tags)
Example #4
0
    def run(self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None) -> Dict[str, Any]:
        """
        Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``.

        When ``self.only_missing`` is set, execution is restricted to the
        nodes producing datasets that do not yet exist, plus everything
        downstream of those datasets.

        Parameters
        ----------
        pipeline: Pipeline
            The ``Pipeline`` to run
        catalog: DataCatalog
            The ``DataCatalog`` from which to fetch data.
        run_id: str
            The id of the run.

        Returns
        -------
        dict
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.

        """
        if self.only_missing:
            # Datasets registered in the catalog but not materialized yet,
            # limited to those this pipeline actually produces/consumes.
            missing = {name for name in catalog.list() if not catalog.exists(name)}
            to_build = missing.intersection(pipeline.data_sets())
            pipeline = (pipeline.only_nodes_with_outputs(*to_build)
                        + pipeline.from_inputs(*to_build))

        return super(DatalabRunner, self).run(pipeline, catalog, run_id)