def run_only_missing(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
    """Run only the missing outputs from the ``Pipeline`` using the
    ``DataSet``s provided by ``catalog``, and save results back to the
    same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    registered = set(catalog.list())
    # Datasets registered in the catalog but not persisted yet.
    nonexistent = {name for name in registered if not catalog.exists(name)}
    # Rebuild free outputs (produced but never registered) plus anything missing.
    targets = (pipeline.outputs() - registered) | nonexistent
    subpipeline = pipeline.only_nodes_with_outputs(*targets) + pipeline.from_inputs(*targets)
    # Any in-memory (unregistered) dataset feeding the sub-pipeline must be
    # recomputed as well, including chains of memory datasets.
    memory_names = pipeline.data_sets() - registered
    memory_producers = pipeline.only_nodes_with_outputs(*memory_names)
    needed_memory_inputs = subpipeline.inputs() & memory_names
    subpipeline += memory_producers.to_outputs(*needed_memory_inputs)
    return self.run(subpipeline, catalog)
def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
    and save results back to the same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    # Work on a shallow copy so datasets added below don't leak to the caller.
    catalog = catalog.shallow_copy()
    registered = set(catalog.list())

    missing_inputs = pipeline.inputs() - registered
    if missing_inputs:
        raise ValueError("Pipeline input(s) {} not found in the DataCatalog".format(missing_inputs))

    # Outputs the pipeline produces but the catalog doesn't know about.
    free_outputs = pipeline.outputs() - registered

    # Register a default (in-memory) dataset for every unregistered name.
    for name in pipeline.data_sets() - registered:
        catalog.add(name, self.create_default_data_set(name))

    self._run(pipeline, catalog)
    self._logger.info("Pipeline execution completed successfully.")
    return {name: catalog.load(name) for name in free_outputs}
def _start_mlflow_run(self, run_params: Dict[str, Any], pipeline: Pipeline):
    """Log basic information about a training pipeline to MLFlow.

    A new MLFlow experiment and/or run named after the training pipeline is
    created if it doesn't exist yet. Logging only happens when the pipeline
    run or one of its nodes is tagged with 'train'.

    NOTE: If NNI is in dry run mode (mode used to generate an NNI Classic NAS
    search space JSON file from a model which contains NNI NAS mutables
    `LayerChoice` and/or `InputChoice`), we avoid creating any new MLFlow
    experiment/run and don't log anything to MLFlow during this dry run.

    Args:
        run_params: Kedro run parameters; the 'tags', 'pipeline_name' and
            'run_id' entries are read here.
        pipeline: The ``Pipeline`` about to be run.
    """
    # `set()` initializer keeps this safe for a pipeline with no nodes
    # (a bare `reduce` raises TypeError on an empty sequence).
    node_tags = functools.reduce(set.union, (n.tags for n in pipeline.nodes), set())
    if deepcv.meta.nni_tools.is_nni_gen_search_space_mode():
        return  # NNI dry run mode: don't create or log anything to MLFlow
    if 'train' not in run_params['tags'] and 'train' not in node_tags:
        return  # not a training pipeline: nothing to log

    if mlflow.active_run() is None:
        # Create an MLFlow run in an experiment named after the pipeline involved
        # in training. If we are running an NNI hp/nas search, the MLFlow
        # experiment and run will be named after NNI experiment and trial ids
        # for better consistency.
        # TODO: find another way to name the experiment, as the pipeline name is
        # only available when running `kedro run --pipeline=<pipeline_name>`
        # (e.g. special tag to node after which experiment is named)
        if not deepcv.meta.nni_tools.is_nni_run_standalone():
            # 'STANDALONE' is NNI's default experiment ID when the python
            # process hasn't been started by NNI.
            nni_experiment = nni.get_experiment_id()
            mlflow.set_experiment(nni_experiment)
            mlflow.start_run(run_name=nni.get_trial_id())
            # Flag indicating whether we are using the NNI HP or Classic NAS API
            # (hyperparameter and/or classic neural architecture search with NNI)
            mlflow.set_tag('nni_standalone_mode', False)
            mlflow.set_tag('nni_experiment_id', nni_experiment)
            mlflow.set_tag('nni_trial_id', nni.get_trial_id())
            mlflow.set_tag('nni_sequence_id', nni.get_sequence_id())
        else:
            pipeline_name = run_params['pipeline_name'].lower() if run_params['pipeline_name'] else 'default'
            mlflow.set_experiment(f'{self.project_ctx.project_name.lower()}_{pipeline_name}')
            mlflow.start_run(run_name=f'{pipeline_name.lower()}_run_{run_params["run_id"]}')
            mlflow.set_tag('nni_standalone_mode', True)

    # Log basic information about the Kedro training pipeline to MLFlow
    mlflow.set_tags({f'kedro_node_tag_{i}': tag for i, tag in enumerate(node_tags)})
    mlflow.log_params({n: v for n, v in run_params.items() if v})
    mlflow.log_param('pipeline.json', pipeline.to_json())
    mlflow.log_param('pipeline.describe', pipeline.describe())
    mlflow.log_param('pipeline.pipeline_datasets', pipeline.data_sets())

    # Create special mlflow tags about current repository infos, which is not
    # done by mlflow when starting an MLFlow run from code instead of from the
    # `mlflow run` command. Inspired by `mlflow.projects._create_run`
    # (https://www.mlflow.org/docs/latest/_modules/mlflow/projects.html),
    # which doesn't seem to be called by `mlflow.start_run`.
    tags = {mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME: self.project_ctx.package_name,
            mlflow.utils.mlflow_tags.MLFLOW_SOURCE_TYPE: mlflow.entities.SourceType.to_string(mlflow.entities.SourceType.PROJECT),
            mlflow.utils.mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT: inspect.getsourcefile(type(self.project_ctx))}
    self._update_tags_with_git_info(tags)
    mlflow.set_tags(tags)

def _update_tags_with_git_info(self, tags: Dict[str, Any]) -> None:
    """Best-effort: add git repository information (repo URL, branch, commit,
    user) to the given MLFlow `tags` dict and log the commit URL as a param.

    Failures (no git repo, missing gitpython, unreadable config, ...) are
    logged as warnings and otherwise ignored.

    Args:
        tags: MLFlow tags dict, updated in place.
    """
    try:
        repo = git.Repo(self.project_ctx.project_path, search_parent_directories=True)
        # NOTE(review): assumes `'origin' in repo.remotes` matches remotes by
        # name (gitpython IterableList) — confirm against gitpython docs.
        git_repo_url = repo.remote().url if 'origin' in repo.remotes else (repo.remotes[0].url if len(repo.remotes) > 0 else '')
        # Convert SSH git URL to an https URL
        git_repo_url = re.sub(r'git@([.\w]+):', r'https://\1/', git_repo_url)
        # Bug fix: the original used `rstrip('.git')`, which strips any trailing
        # '.', 'g', 'i' or 't' characters (e.g. 'mygit.git' -> 'my'); remove the
        # exact '.git' suffix instead.
        git_repo_url = re.sub(r'\.git$', '', git_repo_url)
        mlflow.log_param('commit_url', git_repo_url + f'/commit/{repo.head.commit.hexsha}/')
        # We also set MLFLOW_SOURCE_NAME to the repo URL so that the MLFlow web
        # UI is able to parse it and render commit and source hyperlinks
        # (MLFlow only supports github URLs for now)
        tags.update({
            mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME: git_repo_url if git_repo_url else self.project_ctx.project_name,
            mlflow.utils.mlflow_tags.MLFLOW_GIT_BRANCH: repo.active_branch.name,
            mlflow.utils.mlflow_tags.MLFLOW_GIT_REPO_URL: git_repo_url,
            mlflow.utils.mlflow_tags.MLFLOW_GIT_COMMIT: repo.head.commit.hexsha})
        # Change mlflow user to be the git repository user instead of the
        # system user (if any git user is specified)
        git_config_reader = repo.config_reader()
        git_config_reader.read()
        user = git_config_reader.get_value('user', 'name', default=None)
        email = git_config_reader.get_value('user', 'email', default=None)
        if user or email:
            tags[mlflow.utils.mlflow_tags.MLFLOW_USER] = (str(user) + (f' <{email}>' if email else '')) if user else str(email)
    except (ImportError, OSError, ValueError, IOError, KeyError, git.GitError, configparser.Error) as e:
        logging.warning(f'Failed to import Git or to get repository informations. Error: {e}')
def run(self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``.

    Parameters
    ----------
    pipeline: Pipeline
        The ``Pipeline`` to run
    catalog: DataCatalog
        The ``DataCatalog`` from which to fetch data.
    run_id: str
        The id of the run.

    Returns
    -------
    dict
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    if self.only_missing:
        # Restrict the pipeline to the nodes producing registered-but-absent
        # datasets, plus everything downstream of those datasets.
        absent = {name for name in catalog.list() if not catalog.exists(name)}
        to_build = absent.intersection(pipeline.data_sets())
        producers = pipeline.only_nodes_with_outputs(*to_build)
        downstream = pipeline.from_inputs(*to_build)
        pipeline = producers + downstream
    return super(DatalabRunner, self).run(pipeline, catalog, run_id)