Ejemplo n.º 1
0
    def cleanup_on_error(self):
        """End the active mlflow run with the terminal status matching the failure.

        Note that this method does not work when a pipeline running in dagit
        fails: it seems that in that case a different process runs the pipeline,
        so when it fails the stack trace is not available here. For that case
        the cleanup_on_failure hook defined below can be used instead.
        """
        _, exc_value, _ = sys.exc_info()

        if exc_value:
            # A user interrupt is reported as KILLED; anything else as FAILED.
            terminal_status = (
                RunStatus.KILLED
                if isinstance(exc_value, KeyboardInterrupt)
                else RunStatus.FAILED
            )
            mlflow.end_run(status=RunStatus.to_string(terminal_status))
Ejemplo n.º 2
0
    def fit_mlflow(self, clazz, func_name, *args, **kwargs):
        """
        Autologging function that performs model training by executing the
        training method referred to by `func_name` on the instance of `clazz`
        referred to by `self`, recording MLflow parameters, metrics, tags, and
        artifacts to a corresponding MLflow Run.
        """
        # Only manage the run lifecycle here when no run is already active.
        owns_run = mlflow.active_run() is None
        if owns_run:
            try_mlflow_log(mlflow.start_run)

        _log_pretraining_metadata(self, *args, **kwargs)

        unpatched_fit = gorilla.get_original_attribute(clazz, func_name)
        try:
            fit_result = unpatched_fit(self, *args, **kwargs)
        except Exception as fit_error:
            # A run we opened is closed as FAILED before propagating the error.
            if owns_run:
                try_mlflow_log(
                    mlflow.end_run, RunStatus.to_string(RunStatus.FAILED)
                )
            raise fit_error

        _log_posttraining_metadata(self, *args, **kwargs)

        if owns_run:
            try_mlflow_log(mlflow.end_run)

        return fit_result
Ejemplo n.º 3
0
def end_mlflow_run_on_pipeline_finished(context, event_list):
    """Finalize the mlflow run according to the step events in `event_list`."""
    for step_event in event_list:
        if step_event.is_step_success:
            _cleanup_on_success(context)
        elif step_event.is_step_failure:
            # Close the mlflow resource's run with a FAILED terminal status.
            context.resources.mlflow.end_run(
                status=RunStatus.to_string(RunStatus.FAILED)
            )
Ejemplo n.º 4
0
 def get_status(self):
     """Return the human-readable MLflow run status, or ``None``.

     ``None`` is returned (with a message on stderr) when there is no active
     run, i.e. the run's status was never persisted to an accessible
     tracking server.
     """
     if self._active_run:
         return RunStatus.to_string(self._active_run.get_run().info.status)
     eprint(
         "Can't get MLflow run status; the run's status has not been "
         "persisted to an accessible tracking server.")
     return None
Ejemplo n.º 5
0
        def patch_with_managed_run(original, *args, **kwargs):
            """Invoke ``patch_function`` under an MLflow run.

            A run is created only when none is active; a run created here is
            ended with FINISHED on success or FAILED on any raised exception,
            which is then re-raised.
            """
            managed_run = None
            if not mlflow.active_run():
                managed_run = try_mlflow_log(mlflow.start_run)

            try:
                result = patch_function(original, *args, **kwargs)
            except BaseException:
                # Was a bare `except:` (lint E722). `BaseException` keeps the
                # exact same catch set (including KeyboardInterrupt/SystemExit)
                # while making the intent explicit; the exception is re-raised.
                if managed_run:
                    try_mlflow_log(mlflow.end_run,
                                   RunStatus.to_string(RunStatus.FAILED))
                raise
            else:
                if managed_run:
                    try_mlflow_log(mlflow.end_run,
                                   RunStatus.to_string(RunStatus.FINISHED))
                return result
Ejemplo n.º 6
0
        def patch_with_managed_run(original, *args, **kwargs):
            """Run ``patch_function`` inside a managed MLflow run.

            A run is created only when none is active; a run created here is
            closed as FINISHED on success or FAILED on error/interrupt.
            """
            managed_run = (
                None if mlflow.active_run() else try_mlflow_log(create_managed_run)
            )

            try:
                outcome = patch_function(original, *args, **kwargs)
            except (Exception, KeyboardInterrupt):
                # In addition to standard Python exceptions, handle keyboard
                # interrupts to ensure that runs are terminated if a user
                # prematurely interrupts training execution (e.g. via
                # sigint / ctrl-c).
                if managed_run:
                    try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
                raise

            # Reached only on success: the except branch always re-raises.
            if managed_run:
                try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FINISHED))
            return outcome
Ejemplo n.º 7
0
    def _hook(context, event_list):
        """Run success cleanup or end the mlflow run as FAILED per step event,
        then report the hook as executed (never skipped).
        """
        for step_event in event_list:
            if step_event.is_step_success:
                _cleanup_on_success(context)
            elif step_event.is_step_failure:
                context.resources.mlflow.end_run(
                    status=RunStatus.to_string(RunStatus.FAILED)
                )

        return HookExecutionResult(hook_name=name, is_skipped=False)
Ejemplo n.º 8
0
    def fit_mlflow(self, func_name, *args, **kwargs):
        """Execute the original ``func_name`` training method while
        autologging estimator parameters, tags, the training score, and the
        fitted model to an MLflow run (created here if none is active).
        """
        owns_run = mlflow.active_run() is None
        if owns_run:
            try_mlflow_log(mlflow.start_run)

        # TODO: We should not log nested estimator parameters for
        # parameter search estimators (GridSearchCV, RandomizedSearchCV)

        # Chunk and truncate model parameters to avoid hitting the log_batch
        # API limit.
        all_params = self.get_params(deep=True)
        for param_chunk in _chunk_dict(
                all_params, chunk_size=MAX_PARAMS_TAGS_PER_BATCH):
            truncated_chunk = _truncate_dict(
                param_chunk, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH)
            try_mlflow_log(mlflow.log_params, truncated_chunk)

        estimator_tags = {
            "estimator_name": self.__class__.__name__,
            "estimator_class":
                self.__class__.__module__ + "." + self.__class__.__name__,
        }
        try_mlflow_log(mlflow.set_tags, estimator_tags)

        unpatched_fit = gorilla.get_original_attribute(self, func_name)
        try:
            fit_result = unpatched_fit(*args, **kwargs)
        except Exception as fit_error:
            # A run we opened is closed as FAILED before propagating the error.
            if owns_run:
                try_mlflow_log(mlflow.end_run,
                               RunStatus.to_string(RunStatus.FAILED))
            raise fit_error

        if hasattr(self, "score"):
            # Scoring is best-effort: a failure only skips the metric.
            try:
                score_args = _get_args_for_score(self.score, self.fit, args,
                                                 kwargs)
                training_score = self.score(*score_args)
            except Exception as score_error:  # pylint: disable=broad-except
                _logger.warning(
                    self.score.__qualname__ +
                    " failed. The 'training_score' metric will not be recorded. Scoring error: "
                    + str(score_error))
            else:
                try_mlflow_log(mlflow.log_metric, "training_score",
                               training_score)

        try_mlflow_log(log_model, self, artifact_path="model")

        if owns_run:
            try_mlflow_log(mlflow.end_run)

        return fit_result
Ejemplo n.º 9
0
 def from_proto(cls, proto):
     """Build an instance from its protobuf representation."""
     # The proto2 default scalar value of zero indicates that the run's end
     # time is absent; an absent end time is represented with None here.
     end_time = proto.end_time or None
     return cls(
         run_uuid=proto.run_uuid,
         run_id=proto.run_id,
         experiment_id=proto.experiment_id,
         user_id=proto.user_id,
         status=RunStatus.to_string(proto.status),
         start_time=proto.start_time,
         end_time=end_time,
         lifecycle_stage=proto.lifecycle_stage,
         artifact_uri=proto.artifact_uri,
     )
Ejemplo n.º 10
0
            def _patch_implementation(self, original, *args, **kwargs):
                """Delegate to the parent patch implementation, creating a
                managed MLflow run first when none is active and, when this
                patch owns the run, ending it as FINISHED on success.
                """
                # Start a run only if the user hasn't already opened one; the
                # created run is stored on self so we know to close it below.
                if not mlflow.active_run():
                    self.managed_run = try_mlflow_log(create_managed_run)

                result = super(PatchWithManagedRun, self)._patch_implementation(
                    original, *args, **kwargs
                )

                # Only a run created by this patch is terminated here; the
                # failure path is presumably handled elsewhere (e.g. an
                # _on_exception hook) — TODO confirm against the base class.
                if self.managed_run:
                    try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FINISHED))

                return result
Ejemplo n.º 11
0
    def fit_mlflow(self, func_name, *args, **kwargs):
        """Run the original ``func_name`` training method with MLflow
        pre-/post-training metadata logging around it, managing the run
        lifecycle when no run is already active.
        """
        owns_run = mlflow.active_run() is None
        if owns_run:
            try_mlflow_log(mlflow.start_run)

        _log_pretraining_metadata(self, *args, **kwargs)

        unpatched_fit = gorilla.get_original_attribute(self, func_name)
        try:
            fit_result = unpatched_fit(*args, **kwargs)
        except Exception as fit_error:
            # A run we opened is closed as FAILED before propagating the error.
            if owns_run:
                try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
            raise fit_error

        _log_posttraining_metadata(self, *args, **kwargs)

        if owns_run:
            try_mlflow_log(mlflow.end_run)

        return fit_result
Ejemplo n.º 12
0
 def _on_exception(self, e):
     """End a run this patch created with FAILED status, then defer to the
     parent class's exception handling.
     """
     # Only terminate runs started by this patch; a user-opened run is left
     # for the user to close.
     if self.managed_run:
         try_mlflow_log(mlflow.end_run, RunStatus.to_string(RunStatus.FAILED))
     super(PatchWithManagedRun, self)._on_exception(e)
Ejemplo n.º 13
0
 def get_status(self):
     """Return the run's status as a human-readable string."""
     raw_status = self._get_status()
     return RunStatus.to_string(raw_status)