Example #1
0
    def cleanup_on_error(self):
        """End the active mlflow run with an exit status matching the failure.

        Inspects ``sys.exc_info()`` for the exception currently being handled
        and ends the run as KILLED for keyboard interrupts or FAILED for any
        other exception. Note that this method does not work when a pipeline
        running in dagit fails: in that case a different process runs the
        pipeline, so the stack trace is not available here; use the
        cleanup_on_failure hook defined below instead.
        """
        exc_value = sys.exc_info()[1]
        if exc_value is None:
            return
        status = (
            RunStatus.KILLED
            if isinstance(exc_value, KeyboardInterrupt)
            else RunStatus.FAILED
        )
        mlflow.end_run(status=RunStatus.to_string(status))
Example #2
0
def end_mlflow_run_on_pipeline_finished(context, event_list):
    """Close out the mlflow run based on per-step pipeline events.

    Step-success events trigger the shared success cleanup; step-failure
    events end the run with a FAILED status via the mlflow resource.
    """
    for evt in event_list:
        if evt.is_step_success:
            _cleanup_on_success(context)
        elif evt.is_step_failure:
            context.resources.mlflow.end_run(
                status=RunStatus.to_string(RunStatus.FAILED))
Example #3
0
    def fit_mlflow(self, clazz, func_name, *args, **kwargs):
        """
        Autologging function that performs model training by executing the training method
        referred to by `func_name` on the instance of `clazz` referred to by `self` & records
        MLflow parameters, metrics, tags, and artifacts to a corresponding MLflow Run.

        A run is started only when none is active; runs this method itself
        started are ended FAILED when training raises, or with the default
        status after post-training metadata is logged.
        """
        # Only manage the run lifecycle if we created the run ourselves.
        should_start_run = mlflow.active_run() is None
        if should_start_run:
            try_mlflow_log(mlflow.start_run)

        _log_pretraining_metadata(self, *args, **kwargs)

        original_fit = gorilla.get_original_attribute(clazz, func_name)
        try:
            fit_output = original_fit(self, *args, **kwargs)
        except Exception:
            if should_start_run:
                try_mlflow_log(mlflow.end_run,
                               RunStatus.to_string(RunStatus.FAILED))

            # Bare `raise` re-raises the in-flight exception without appending
            # an extra traceback frame (unlike `raise e`), matching the other
            # managed-run wrappers in this file.
            raise

        _log_posttraining_metadata(self, *args, **kwargs)

        if should_start_run:
            try_mlflow_log(mlflow.end_run)

        return fit_output
Example #4
0
def _maybe_set_run_terminated(active_run, status):
    """
    If the passed-in active run is defined and still running (i.e. hasn't already been terminated
    within user code), mark it as terminated with the passed-in status.
    """
    if not active_run:
        return
    current_status = active_run.get_run().info.status
    if not RunStatus.is_terminated(current_status):
        active_run.set_terminated(status)
Example #5
0
        def patch_with_managed_run(original, *args, **kwargs):
            """Invoke ``patch_function`` inside a managed mlflow run.

            A run is created only when none is already active; any run created
            here is ended FAILED if the patched call raises and FINISHED on
            success. Pre-existing runs are left untouched.
            """
            managed_run = None
            if mlflow.active_run() is None:
                managed_run = try_mlflow_log(mlflow.start_run)

            try:
                result = patch_function(original, *args, **kwargs)
            except:
                # Intentionally bare: terminate the managed run on *any* exit,
                # then propagate the exception unchanged.
                if managed_run:
                    try_mlflow_log(
                        mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
                raise
            if managed_run:
                try_mlflow_log(
                    mlflow.end_run, RunStatus.to_string(RunStatus.FINISHED))
            return result
Example #6
0
 def get_status(self):
     """Gets the human-readable status of the MLflow run from the tracking server."""
     if self._active_run:
         return RunStatus.to_string(self._active_run.get_run().info.status)
     # No persisted run to query; report the problem and return nothing.
     eprint(
         "Can't get MLflow run status; the run's status has not been "
         "persisted to an accessible tracking server.")
     return None
Example #7
0
        def patch_with_managed_run(original, *args, **kwargs):
            """Run ``patch_function`` under a managed mlflow run.

            A run is created only when none is active. In addition to standard
            Python exceptions, keyboard interrupts are handled to ensure that
            runs are terminated if a user prematurely interrupts training
            execution (e.g. via sigint / ctrl-c): the managed run is ended
            FAILED on any such exit, and FINISHED on success.
            """
            managed_run = None
            if mlflow.active_run() is None:
                managed_run = try_mlflow_log(create_managed_run)

            try:
                result = patch_function(original, *args, **kwargs)
            except (Exception, KeyboardInterrupt):
                if managed_run:
                    try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
                raise
            if managed_run:
                try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FINISHED))
            return result
Example #8
0
def _validate_autologging_run(autologging_integration, run_id):
    """
    For testing purposes, verifies that an MLflow run produced by an `autologging_integration`
    satisfies the following properties:

        - The run has an autologging tag whose value is the name of the autologging integration
        - The run has a terminal status (e.g., KILLED, FAILED, FINISHED)
    """
    run = MlflowClient().get_run(run_id)

    tag_value = run.data.tags.get(MLFLOW_AUTOLOGGING)
    assert tag_value == autologging_integration, (
        "Autologging run with id {} failed to set autologging tag with expected value. Expected: "
        "'{}', Actual: '{}'".format(run_id, autologging_integration, tag_value)
    )

    has_terminal_status = RunStatus.is_terminated(RunStatus.from_string(run.info.status))
    assert has_terminal_status, (
        "Autologging run with id {} has a non-terminal status '{}'".format(
            run_id, run.info.status)
    )
Example #9
0
    def _hook(context, event_list):
        """Per-event mlflow cleanup: success events trigger the shared success
        cleanup, failure events end the run with FAILED status; always reports
        the hook as executed (not skipped)."""
        for evt in event_list:
            if evt.is_step_success:
                _cleanup_on_success(context)
            elif evt.is_step_failure:
                context.resources.mlflow.end_run(
                    status=RunStatus.to_string(RunStatus.FAILED))

        return HookExecutionResult(hook_name=name, is_skipped=False)
Example #10
0
def run(uri, entry_point="main", version=None, parameters=None, experiment_id=None,
        mode=None, cluster_spec=None, git_username=None, git_password=None,
        use_conda=True, use_temp_cwd=False, storage_dir=None, block=True):
    """
    Run an MLflow project from the given URI.

    Supports downloading projects from Git URIs with a specified version, or copying them from
    the file system. For Git-based projects, a commit can be specified as the `version`.

    Raises:
      `mlflow.projects.ExecutionException` if a run launched in blocking mode is unsuccessful.

    :param uri: URI of project to run. Expected to be either a relative/absolute local filesystem
                path or a git repository URI (e.g. https://github.com/databricks/mlflow-example)
                pointing to a project directory containing an MLproject file.
    :param entry_point: Entry point to run within the project. If no entry point with the specified
                        name is found, attempts to run the project file `entry_point` as a script,
                        using "python" to run .py files and the default shell (specified by
                        environment variable $SHELL) to run .sh files.
    :param experiment_id: ID of experiment under which to launch the run.
    :param mode: Execution mode for the run. Can be set to "local" or "databricks".
    :param cluster_spec: Path to JSON file describing the cluster to use when launching a run on
                         Databricks.
    :param git_username: Username for HTTP(S) authentication with Git.
    :param git_password: Password for HTTP(S) authentication with Git.
    :param use_conda: If True (the default), creates a new Conda environment for the run and
                      installs project dependencies within that environment. Otherwise, runs the
                      project in the current environment without installing any project
                      dependencies.
    :param use_temp_cwd: Only used if `mode` is "local" and `uri` is a local directory.
                         If True, copies project to a temporary working directory before running it.
                         Otherwise (the default), runs project using `uri` (the project's path) as
                         the working directory.
    :param storage_dir: Only used if `mode` is local. MLflow will download artifacts from
                        distributed URIs passed to parameters of type 'path' to subdirectories of
                        storage_dir.
    :param block: Whether or not to block while waiting for a run to complete. Defaults to True.
                  Note that if `block` is False and mode is "local", this method will return, but
                  the current process will block when exiting until the local run completes.
                  If the current process is interrupted, any asynchronous runs launched via this
                  method will be terminated.
    :return: A `SubmittedRun` exposing information (e.g. run ID) about the launched run.
    """
    submitted = _run(
        uri=uri,
        entry_point=entry_point,
        version=version,
        parameters=parameters,
        experiment_id=experiment_id,
        mode=mode,
        cluster_spec=cluster_spec,
        git_username=git_username,
        git_password=git_password,
        use_conda=use_conda,
        use_temp_cwd=use_temp_cwd,
        storage_dir=storage_dir,
        block=block,
    )
    if block:
        # Synchronous mode: wait for completion and surface failure as an error.
        submitted.wait()
        status = submitted.get_status()
        if status and RunStatus.from_string(status) != RunStatus.FINISHED:
            raise ExecutionException("=== Run %s was unsuccessful, status: '%s' ===" %
                                     (submitted.run_id, status))
    return submitted
Example #11
0
    def fit_mlflow(self, func_name, *args, **kwargs):
        """Patched `fit` that autologs estimator parameters, metrics, and the model.

        Starts an MLflow run when none is active, logs the estimator's
        (chunked and truncated) parameters and identifying tags, executes the
        original training method, records `training_score` when the estimator
        exposes `score`, logs the fitted model, and finally ends any run this
        method itself started (FAILED if training raised).
        """
        # Only manage the run lifecycle if we created the run ourselves.
        should_start_run = mlflow.active_run() is None
        if should_start_run:
            try_mlflow_log(mlflow.start_run)

        # TODO: We should not log nested estimator parameters for
        # parameter search estimators (GridSearchCV, RandomizedSearchCV)

        # Chunk and truncate model parameters to avoid hitting the log_batch API limit
        for chunk in _chunk_dict(self.get_params(deep=True),
                                 chunk_size=MAX_PARAMS_TAGS_PER_BATCH):
            truncated = _truncate_dict(chunk, MAX_ENTITY_KEY_LENGTH,
                                       MAX_PARAM_VAL_LENGTH)
            try_mlflow_log(mlflow.log_params, truncated)

        try_mlflow_log(
            mlflow.set_tags,
            {
                "estimator_name": self.__class__.__name__,
                "estimator_class":
                self.__class__.__module__ + "." + self.__class__.__name__,
            },
        )

        original_fit = gorilla.get_original_attribute(self, func_name)
        try:
            fit_output = original_fit(*args, **kwargs)
        except Exception:
            if should_start_run:
                try_mlflow_log(mlflow.end_run,
                               RunStatus.to_string(RunStatus.FAILED))

            # Bare `raise` preserves the original traceback without adding an
            # extra frame, consistent with the other managed-run wrappers.
            raise

        if hasattr(self, "score"):
            try:
                score_args = _get_args_for_score(self.score, self.fit, args,
                                                 kwargs)
                training_score = self.score(*score_args)
            except Exception as e:  # pylint: disable=broad-except
                # Scoring failure is non-fatal: warn and skip the metric.
                msg = (
                    self.score.__qualname__ +
                    " failed. The 'training_score' metric will not be recorded. Scoring error: "
                    + str(e))
                _logger.warning(msg)
            else:
                try_mlflow_log(mlflow.log_metric, "training_score",
                               training_score)

        try_mlflow_log(log_model, self, artifact_path="model")

        if should_start_run:
            try_mlflow_log(mlflow.end_run)

        return fit_output
Example #12
0
 def from_proto(cls, proto):
     """Build a run-info instance from its protobuf message.

     The proto2 default scalar value of zero indicates that the run's end
     time is absent; an absent end time is represented with ``None`` here.
     """
     end_time = proto.end_time or None
     return cls(run_uuid=proto.run_uuid, run_id=proto.run_id,
                experiment_id=proto.experiment_id, user_id=proto.user_id,
                status=RunStatus.to_string(proto.status),
                start_time=proto.start_time, end_time=end_time,
                lifecycle_stage=proto.lifecycle_stage,
                artifact_uri=proto.artifact_uri)
Example #13
0
            def _patch_implementation(self, original, *args, **kwargs):
                """Delegate to the parent patch implementation, creating a
                managed run when none is active and ending it FINISHED after a
                successful call."""
                if mlflow.active_run() is None:
                    self.managed_run = try_mlflow_log(create_managed_run)

                outcome = super(PatchWithManagedRun, self)._patch_implementation(
                    original, *args, **kwargs)

                # NOTE(review): assumes `managed_run` is initialized elsewhere on
                # the class when a run was already active — confirm.
                if self.managed_run:
                    try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FINISHED))

                return outcome
Example #14
0
 def to_proto(self):
     """Serialize this run info into a ProtoRunInfo message.

     Optional fields (``end_time``, ``artifact_uri``) are assigned only when
     they hold a value, leaving the proto defaults otherwise.
     """
     proto = ProtoRunInfo()
     # Plain pass-through fields copied verbatim.
     for field in ("run_uuid", "run_id", "experiment_id", "user_id"):
         setattr(proto, field, getattr(self, field))
     proto.status = RunStatus.from_string(self.status)
     proto.start_time = self.start_time
     if self.end_time:
         proto.end_time = self.end_time
     if self.artifact_uri:
         proto.artifact_uri = self.artifact_uri
     proto.lifecycle_stage = self.lifecycle_stage
     return proto
Example #15
0
    def fit_mlflow(self, func_name, *args, **kwargs):
        """Patched `fit` that autologs pre/post-training metadata around training.

        Starts an MLflow run when none is active, logs pre-training metadata,
        executes the original training method, logs post-training metadata,
        and ends any run this method itself started (FAILED if training
        raised).
        """
        # Only manage the run lifecycle if we created the run ourselves.
        should_start_run = mlflow.active_run() is None
        if should_start_run:
            try_mlflow_log(mlflow.start_run)

        _log_pretraining_metadata(self, *args, **kwargs)

        original_fit = gorilla.get_original_attribute(self, func_name)
        try:
            fit_output = original_fit(*args, **kwargs)
        except Exception:
            if should_start_run:
                try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))

            # Bare `raise` re-raises the in-flight exception without appending
            # an extra traceback frame (unlike `raise e`), matching the other
            # managed-run wrappers in this file.
            raise

        _log_posttraining_metadata(self, *args, **kwargs)

        if should_start_run:
            try_mlflow_log(mlflow.end_run)

        return fit_output
Example #16
0
 def _on_exception(self, e):
     """End the managed run (if one was created) with FAILED status, then
     defer to the superclass exception handler."""
     if self.managed_run:
         failed_status = RunStatus.to_string(RunStatus.FAILED)
         try_mlflow_log(mlflow.end_run, failed_status)
     super(PatchWithManagedRun, self)._on_exception(e)
Example #17
0
 def set_terminated(self, status):
     """Mark the run as terminated with ``status`` and stamp its end time,
     refreshing the cached run info from the store."""
     self.run_info = self.store.update_run_info(
         self.run_info.run_uuid,
         run_status=RunStatus.from_string(status),
         end_time=_get_unix_timestamp(),
     )
Example #18
0
def validate_exit_status(status_str, expected):
    """Assert that ``status_str`` parses to the ``expected`` RunStatus value."""
    actual = RunStatus.from_string(status_str)
    assert actual == expected
Example #19
0
 def get_status(self):
     """Return the run's current status as a human-readable string."""
     status = self._get_status()
     return RunStatus.to_string(status)