def cleanup_on_error(self):
    """Ends the MLflow run with the correct exit status for failed runs.

    Note that this method does not work when a pipeline running in dagit fails:
    in that case the pipeline runs in a different process, so the stack trace is
    not available here when it fails. For that case, use the cleanup_on_failure
    hook defined below.
    """
    any_error = sys.exc_info()
    if any_error[1]:
        if isinstance(any_error[1], KeyboardInterrupt):
            mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))
        else:
            mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))

def end_mlflow_run_on_pipeline_finished(context, event_list):
    for event in event_list:
        if event.is_step_success:
            _cleanup_on_success(context)
        elif event.is_step_failure:
            mlf = context.resources.mlflow
            mlf.end_run(status=RunStatus.to_string(RunStatus.FAILED))

def fit_mlflow(self, clazz, func_name, *args, **kwargs):
    """
    Autologging function that performs model training by executing the training
    method referred to by `func_name` on the instance of `clazz` referred to by
    `self`, and records MLflow parameters, metrics, tags, and artifacts to a
    corresponding MLflow run.
    """
    should_start_run = mlflow.active_run() is None
    if should_start_run:
        try_mlflow_log(mlflow.start_run)

    _log_pretraining_metadata(self, *args, **kwargs)

    original_fit = gorilla.get_original_attribute(clazz, func_name)
    try:
        fit_output = original_fit(self, *args, **kwargs)
    except Exception as e:
        if should_start_run:
            try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
        raise e

    _log_posttraining_metadata(self, *args, **kwargs)

    if should_start_run:
        try_mlflow_log(mlflow.end_run)

    return fit_output

def _maybe_set_run_terminated(active_run, status):
    """
    If the passed-in active run is defined and still running (i.e., it hasn't
    already been terminated within user code), mark it as terminated with the
    passed-in status.
    """
    if active_run and not RunStatus.is_terminated(active_run.get_run().info.status):
        active_run.set_terminated(status)

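# A hypothetical use of the helper above, e.g. from a framework exit handler: set a
# FAILED or FINISHED status only if user code has not already terminated the run.
# `active_run` and `train_model` are illustrative placeholders, not part of the source.
try:
    train_model()
except Exception:
    _maybe_set_run_terminated(active_run, "FAILED")
    raise
else:
    _maybe_set_run_terminated(active_run, "FINISHED")
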
def patch_with_managed_run(original, *args, **kwargs):
    managed_run = None
    if not mlflow.active_run():
        managed_run = try_mlflow_log(mlflow.start_run)

    try:
        result = patch_function(original, *args, **kwargs)
    except:  # noqa: E722 -- intentionally bare so KeyboardInterrupt also marks the run FAILED
        if managed_run:
            try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
        raise
    else:
        if managed_run:
            try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FINISHED))

    return result

def get_status(self):
    """Gets the human-readable status of the MLflow run from the tracking server."""
    if not self._active_run:
        eprint(
            "Can't get MLflow run status; the run's status has not been "
            "persisted to an accessible tracking server."
        )
        return None
    return RunStatus.to_string(self._active_run.get_run().info.status)

def patch_with_managed_run(original, *args, **kwargs):
    managed_run = None
    if not mlflow.active_run():
        managed_run = try_mlflow_log(create_managed_run)

    try:
        result = patch_function(original, *args, **kwargs)
    except (Exception, KeyboardInterrupt):
        # In addition to standard Python exceptions, handle keyboard interrupts to
        # ensure that runs are terminated if a user prematurely interrupts training
        # execution (e.g. via sigint / ctrl-c)
        if managed_run:
            try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
        raise
    else:
        if managed_run:
            try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FINISHED))

    return result

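# The snippets above rely on a `try_mlflow_log` helper that is not shown in this
# section. A minimal sketch of its assumed behavior -- invoke an MLflow logging
# function, returning its result, and warn rather than raise on failure so that
# logging problems never abort training; the real signature may differ.
import logging

_logger = logging.getLogger(__name__)


def try_mlflow_log(fn, *args, **kwargs):
    """Best-effort call: return `fn(*args, **kwargs)`, warning instead of raising."""
    try:
        return fn(*args, **kwargs)
    except Exception as e:  # pylint: disable=broad-except
        _logger.warning("Logging to MLflow failed: %s", e)
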
def _validate_autologging_run(autologging_integration, run_id):
    """
    For testing purposes, verifies that an MLflow run produced by an
    `autologging_integration` satisfies the following properties:

        - The run has an autologging tag whose value is the name of the
          autologging integration
        - The run has a terminal status (e.g., KILLED, FAILED, FINISHED)
    """
    client = MlflowClient()
    run = client.get_run(run_id)
    autologging_tag_value = run.data.tags.get(MLFLOW_AUTOLOGGING)
    assert autologging_tag_value == autologging_integration, (
        "Autologging run with id {} failed to set autologging tag with expected value. "
        "Expected: '{}', Actual: '{}'".format(
            run_id, autologging_integration, autologging_tag_value
        )
    )
    assert RunStatus.is_terminated(
        RunStatus.from_string(run.info.status)
    ), "Autologging run with id {} has a non-terminal status '{}'".format(
        run_id, run.info.status
    )

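# A hypothetical test exercising the validator above: enable sklearn autologging,
# fit a model inside a user-managed run, then check the tag and terminal status.
# The sklearn model and training data are illustrative assumptions.
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression


def test_autologging_run_is_valid():
    mlflow.sklearn.autolog()
    with mlflow.start_run() as run:
        LinearRegression().fit([[0.0], [1.0]], [0.0, 1.0])
    _validate_autologging_run("sklearn", run.info.run_id)
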
def _hook(context, event_list):
    for event in event_list:
        if event.is_step_success:
            _cleanup_on_success(context)
        elif event.is_step_failure:
            mlf = context.resources.mlflow
            mlf.end_run(status=RunStatus.to_string(RunStatus.FAILED))
    return HookExecutionResult(hook_name=name, is_skipped=False)

def run(uri, entry_point="main", version=None, parameters=None, experiment_id=None,
        mode=None, cluster_spec=None, git_username=None, git_password=None,
        use_conda=True, use_temp_cwd=False, storage_dir=None, block=True):
    """
    Run an MLflow project from the given URI.

    Supports downloading projects from Git URIs with a specified version, or copying
    them from the file system. For Git-based projects, a commit can be specified as
    the `version`.

    Raises `mlflow.projects.ExecutionException` if a run launched in blocking mode
    is unsuccessful.

    :param uri: URI of project to run. Expected to be either a relative/absolute local
                filesystem path or a git repository URI (e.g.
                https://github.com/databricks/mlflow-example) pointing to a project
                directory containing an MLproject file.
    :param entry_point: Entry point to run within the project. If no entry point with
                        the specified name is found, attempts to run the project file
                        `entry_point` as a script, using "python" to run .py files and
                        the default shell (specified by environment variable $SHELL)
                        to run .sh files.
    :param version: For Git-based projects, the version (e.g. a commit) to run.
    :param parameters: Dictionary of parameters to pass to the entry point.
    :param experiment_id: ID of the experiment under which to launch the run.
    :param mode: Execution mode for the run. Can be set to "local" or "databricks".
    :param cluster_spec: Path to a JSON file describing the cluster to use when
                         launching a run on Databricks.
    :param git_username: Username for HTTP(S) authentication with Git.
    :param git_password: Password for HTTP(S) authentication with Git.
    :param use_conda: If True (the default), creates a new Conda environment for the
                      run and installs project dependencies within that environment.
                      Otherwise, runs the project in the current environment without
                      installing any project dependencies.
    :param use_temp_cwd: Only used if `mode` is "local" and `uri` is a local directory.
                         If True, copies the project to a temporary working directory
                         before running it. Otherwise (the default), runs the project
                         using `uri` (the project's path) as the working directory.
    :param storage_dir: Only used if `mode` is "local". MLflow downloads artifacts from
                        distributed URIs passed to parameters of type 'path' to
                        subdirectories of `storage_dir`.
    :param block: Whether to block while waiting for the run to complete. Defaults to
                  True. Note that if `block` is False and `mode` is "local", this
                  method returns, but the current process blocks on exit until the
                  local run completes. If the current process is interrupted, any
                  asynchronous runs launched via this method are terminated.
    :return: A `SubmittedRun` exposing information (e.g. run ID) about the launched run.
    """
    submitted_run_obj = _run(
        uri=uri, entry_point=entry_point, version=version, parameters=parameters,
        experiment_id=experiment_id, mode=mode, cluster_spec=cluster_spec,
        git_username=git_username, git_password=git_password, use_conda=use_conda,
        use_temp_cwd=use_temp_cwd, storage_dir=storage_dir, block=block)
    if block:
        submitted_run_obj.wait()
        run_status = submitted_run_obj.get_status()
        if run_status and RunStatus.from_string(run_status) != RunStatus.FINISHED:
            raise ExecutionException(
                "=== Run %s was unsuccessful, status: '%s' ==="
                % (submitted_run_obj.run_id, run_status))
    return submitted_run_obj

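# A hypothetical invocation of `run` above, using the example project URI from its
# docstring; the parameter name "alpha" is an illustrative assumption.
submitted = run(
    uri="https://github.com/databricks/mlflow-example",
    entry_point="main",
    parameters={"alpha": 0.5},
    block=True,  # raises ExecutionException unless the run reaches FINISHED
)
print(submitted.run_id, submitted.get_status())
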
def fit_mlflow(self, func_name, *args, **kwargs):
    should_start_run = mlflow.active_run() is None
    if should_start_run:
        try_mlflow_log(mlflow.start_run)

    # TODO: We should not log nested estimator parameters for parameter search
    # estimators (GridSearchCV, RandomizedSearchCV)

    # Chunk and truncate model parameters to avoid hitting the log_batch API limit
    for chunk in _chunk_dict(
        self.get_params(deep=True), chunk_size=MAX_PARAMS_TAGS_PER_BATCH
    ):
        truncated = _truncate_dict(chunk, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH)
        try_mlflow_log(mlflow.log_params, truncated)

    try_mlflow_log(
        mlflow.set_tags,
        {
            "estimator_name": self.__class__.__name__,
            "estimator_class": self.__class__.__module__ + "." + self.__class__.__name__,
        },
    )

    original_fit = gorilla.get_original_attribute(self, func_name)
    try:
        fit_output = original_fit(*args, **kwargs)
    except Exception as e:
        if should_start_run:
            try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
        raise e

    if hasattr(self, "score"):
        try:
            score_args = _get_args_for_score(self.score, self.fit, args, kwargs)
            training_score = self.score(*score_args)
        except Exception as e:  # pylint: disable=broad-except
            msg = (
                self.score.__qualname__
                + " failed. The 'training_score' metric will not be recorded. "
                + "Scoring error: "
                + str(e)
            )
            _logger.warning(msg)
        else:
            try_mlflow_log(mlflow.log_metric, "training_score", training_score)

    try_mlflow_log(log_model, self, artifact_path="model")

    if should_start_run:
        try_mlflow_log(mlflow.end_run)

    return fit_output

def from_proto(cls, proto):
    end_time = proto.end_time
    # The proto2 default scalar value of zero indicates that the run's end time
    # is absent. An absent end time is represented with a NoneType in the
    # `RunInfo` class.
    if end_time == 0:
        end_time = None
    return cls(
        run_uuid=proto.run_uuid,
        run_id=proto.run_id,
        experiment_id=proto.experiment_id,
        user_id=proto.user_id,
        status=RunStatus.to_string(proto.status),
        start_time=proto.start_time,
        end_time=end_time,
        lifecycle_stage=proto.lifecycle_stage,
        artifact_uri=proto.artifact_uri,
    )

def _patch_implementation(self, original, *args, **kwargs):
    if not mlflow.active_run():
        self.managed_run = try_mlflow_log(create_managed_run)

    result = super(PatchWithManagedRun, self)._patch_implementation(
        original, *args, **kwargs
    )

    if self.managed_run:
        try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FINISHED))

    return result

def to_proto(self):
    proto = ProtoRunInfo()
    proto.run_uuid = self.run_uuid
    proto.run_id = self.run_id
    proto.experiment_id = self.experiment_id
    proto.user_id = self.user_id
    proto.status = RunStatus.from_string(self.status)
    proto.start_time = self.start_time
    if self.end_time:
        proto.end_time = self.end_time
    if self.artifact_uri:
        proto.artifact_uri = self.artifact_uri
    proto.lifecycle_stage = self.lifecycle_stage
    return proto

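# The two converters above are inverses: `to_proto` maps the string status on
# `RunInfo` to the proto enum via `RunStatus.from_string`, and `from_proto` maps
# it back with `RunStatus.to_string`. A minimal round-trip sketch; the `RunInfo`
# constructor arguments are assumed from the fields used in `from_proto` above.
info = RunInfo(
    run_uuid="abc123", run_id="abc123", experiment_id="0", user_id="me",
    status=RunStatus.to_string(RunStatus.FINISHED),
    start_time=1600000000000, end_time=1600000060000,
    lifecycle_stage="active", artifact_uri="file:///tmp/artifacts",
)
assert RunInfo.from_proto(info.to_proto()).status == "FINISHED"
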
def fit_mlflow(self, func_name, *args, **kwargs):
    should_start_run = mlflow.active_run() is None
    if should_start_run:
        try_mlflow_log(mlflow.start_run)

    _log_pretraining_metadata(self, *args, **kwargs)

    original_fit = gorilla.get_original_attribute(self, func_name)
    try:
        fit_output = original_fit(*args, **kwargs)
    except Exception as e:
        if should_start_run:
            try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
        raise e

    _log_posttraining_metadata(self, *args, **kwargs)

    if should_start_run:
        try_mlflow_log(mlflow.end_run)

    return fit_output

def _on_exception(self, e):
    if self.managed_run:
        try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
    super(PatchWithManagedRun, self)._on_exception(e)

def set_terminated(self, status):
    self.run_info = self.store.update_run_info(
        self.run_info.run_uuid,
        run_status=RunStatus.from_string(status),
        end_time=_get_unix_timestamp(),
    )

def validate_exit_status(status_str, expected):
    assert RunStatus.from_string(status_str) == expected

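# For reference, `RunStatus.to_string` and `RunStatus.from_string` round-trip
# between the enum values and their human-readable names:
from mlflow.entities import RunStatus

assert RunStatus.to_string(RunStatus.FINISHED) == "FINISHED"
assert RunStatus.from_string("FINISHED") == RunStatus.FINISHED
validate_exit_status("FINISHED", RunStatus.FINISHED)
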
def get_status(self):
    return RunStatus.to_string(self._get_status())