Code Example #1
File: __init__.py Project: TheVinhLuong102/mlflow
    def _log_posttraining_metadata(estimator, spark_model, params):

        if _is_parameter_search_estimator(estimator):
            try:
                # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                # information is consistent with the parent run
                child_tags = context_registry.resolve_tags()
                child_tags.update(
                    {MLFLOW_AUTOLOGGING: AUTOLOGGING_INTEGRATION_NAME})
                _create_child_runs_for_parameter_search(
                    parent_estimator=estimator,
                    parent_model=spark_model,
                    parent_run=mlflow.active_run(),
                    child_tags=child_tags,
                )
            except Exception:
                import traceback

                msg = (
                    "Encountered exception during creation of child runs for parameter search."
                    " Child runs may be missing. Exception: {}".format(
                        traceback.format_exc()))
                _logger.warning(msg)

            estimator_param_maps = _get_tuning_param_maps(
                estimator,
                estimator._autologging_metadata.uid_to_indexed_name_map)

            metrics_dict, best_index = _get_param_search_metrics_and_best_index(
                estimator, spark_model)
            _log_parameter_search_results_as_artifact(
                estimator_param_maps, metrics_dict,
                mlflow.active_run().info.run_id)

            # Log best_param_map as JSON artifact
            best_param_map = estimator_param_maps[best_index]
            mlflow.log_dict(best_param_map,
                            artifact_file="best_parameters.json")

            # Log best_param_map as autologging parameters as well
            _log_estimator_params({
                f"best_{param_name}": param_value
                for param_name, param_value in best_param_map.items()
            })

        if log_models:
            if _should_log_model(spark_model):
                # TODO: support model signature
                mlflow.spark.log_model(
                    spark_model,
                    artifact_path="model",
                )
                if _is_parameter_search_model(spark_model):
                    mlflow.spark.log_model(
                        spark_model.bestModel,
                        artifact_path="best_model",
                    )
            else:
                _logger.warning(
                    _get_warning_msg_for_skip_log_model(spark_model))
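
A minimal usage sketch that would drive the parameter-search branch above: with pyspark.ml autologging enabled, fitting a CrossValidator produces the child runs, the best_parameters.json artifact, and the "best_model" artifact. `train_df` is an assumed, already-prepared Spark DataFrame.

import mlflow
import mlflow.pyspark.ml
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

mlflow.pyspark.ml.autolog()

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator())

with mlflow.start_run():
    cv_model = cv.fit(train_df)  # train_df: assumed prepared Spark DataFrame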
Code Example #2
    def experiment(self) -> MlflowClient:
        if self._experiment_id is None:
            expt = self._mlflow_client.get_experiment_by_name(
                self._experiment_name)

            if expt is not None:
                self._experiment_id = expt.experiment_id
            else:
                if self._run_id is not None:
                    raise ValueError(
                        f'Experiment with name {self._experiment_name} not found'
                    )

                logger.warning(
                    f'Experiment with name {self._experiment_name} not found. Creating it.'
                )
                self._experiment_id = self._mlflow_client.create_experiment(
                    name=self._experiment_name)

        if self._run_id is None:
            run = self._mlflow_client.create_run(
                experiment_id=self._experiment_id,
                tags=resolve_tags(self.tags))
            self._run_id = run.info.run_id

        return self._mlflow_client
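
The accessor above implements a lazy get-or-create contract against MlflowClient; a standalone sketch of the same pattern (assuming a local file-based tracking store):

from mlflow.tracking import MlflowClient
from mlflow.tracking.context.registry import resolve_tags

client = MlflowClient(tracking_uri="file:./mlruns")
expt = client.get_experiment_by_name("demo")
experiment_id = expt.experiment_id if expt else client.create_experiment("demo")

# Tag the new run with environment-resolved tags, as the snippet above does
run = client.create_run(experiment_id=experiment_id, tags=resolve_tags())
client.log_metric(run.info.run_id, "loss", 0.42)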
Code Example #3
def test_resolve_tags_no_arg(mock_run_context_providers):
    assert resolve_tags() == {
        "one": "override",
        "two": "two-val",
        "three": "three-val",
        "new": "new-val"
    }
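
A plausible shape for the `mock_run_context_providers` fixture this test assumes (hypothetical; the real fixture lives in MLflow's test suite). Two providers are registered in order, and the later one overrides the key "one":

import pytest
from unittest import mock

@pytest.fixture
def mock_run_context_providers():
    base = mock.Mock()
    base.in_context.return_value = True
    base.tags.return_value = {"one": "one-val", "two": "two-val", "three": "three-val"}

    override = mock.Mock()
    override.in_context.return_value = True
    override.tags.return_value = {"one": "override", "new": "new-val"}

    # Hypothetical patch target: resolve_tags() iterates the registry's
    # providers in registration order.
    with mock.patch("mlflow.tracking.context.registry._run_context_provider_registry",
                    [base, override]):
        yield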
Code Example #4
    def log_model(self, model, artifact_path: str):
        mlflow.set_tracking_uri(self._tracking_uri)
        mlflow.set_experiment(self._experiment_name)

        with mlflow.start_run(run_id=self.run_id,
                              tags=resolve_tags(self.tags)):
            mlflow.sklearn.log_model(model, artifact_path)
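
A hedged usage sketch for the wrapper above; the class name and constructor arguments are assumptions for illustration, not part of the snippet itself.

from sklearn.linear_model import LinearRegression

tracker = SklearnTracker(  # hypothetical class owning the log_model method above
    tracking_uri="file:./mlruns", experiment_name="demo", tags={"stage": "dev"})
model = LinearRegression().fit([[0.0], [1.0]], [0.0, 1.0])
tracker.log_model(model, artifact_path="model")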
Code Example #5
def test_resolve_tags(mock_run_context_providers):
    tags_arg = {"two": "arg-override", "arg": "arg-val"}
    assert resolve_tags(tags_arg) == {
        "one": "override",
        "two": "arg-override",
        "three": "three-val",
        "new": "new-val",
        "arg": "arg-val"
    }
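
Taken together, the two tests pin down the merge order that resolve_tags implements; a minimal sketch of those semantics (not MLflow's actual implementation):

def resolve_tags_sketch(providers, tags=None):
    merged = {}
    for provider in providers:  # providers are consulted in registration order
        if provider.in_context():
            merged.update(provider.tags())  # later providers override earlier ones
    merged.update(tags or {})  # caller-supplied tags always win
    return merged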
Code Example #6
def test_run_context_provider_registry_with_installed_plugin(tmp_wkdir):
    """This test requires the package in tests/resources/mlflow-test-plugin to be installed"""

    reload(mlflow.tracking.context.registry)

    from mlflow_test_plugin import PluginRunContextProvider
    assert PluginRunContextProvider in _currently_registered_run_context_provider_classes()

    # The test plugin's context provider always returns False from in_context
    # to avoid polluting tags in developers' environments. The following mock overrides this to
    # perform the integration test.
    with mock.patch.object(PluginRunContextProvider, "in_context", return_value=True):
        assert resolve_tags()["test"] == "tag"
Code Example #7
    def _log_posttraining_metadata(estimator, *args, **kwargs):
        """
        Records metadata for a scikit-learn estimator after training has completed.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        if hasattr(estimator, "score"):
            try:
                score_args = _get_args_for_score(estimator.score,
                                                 estimator.fit, args, kwargs)
                training_score = estimator.score(*score_args)
            except Exception as e:
                msg = (
                    estimator.score.__qualname__ +
                    " failed. The 'training_score' metric will not be recorded. Scoring error: "
                    + str(e))
                _logger.warning(msg)
            else:
                try_mlflow_log(mlflow.log_metric, "training_score",
                               training_score)

        # log common metrics and artifacts for estimators (classifier, regressor)
        _log_specialized_estimator_content(estimator,
                                           mlflow.active_run().info.run_id,
                                           args, kwargs)

        def get_input_example():
            # Fetch an input example using the first several rows of the array-like
            # training data supplied to the training routine (e.g., `fit()`)
            fit_arg_names = _get_arg_names(estimator.fit)
            X_var_name, y_var_name = fit_arg_names[:2]
            input_example = _get_Xy(args, kwargs, X_var_name,
                                    y_var_name)[0][:INPUT_EXAMPLE_SAMPLE_ROWS]
            return input_example

        def infer_model_signature(input_example):
            if not hasattr(estimator, "predict"):
                raise Exception(
                    "the trained model does not specify a `predict` function, "
                    + "which is required in order to infer the signature")

            return infer_signature(input_example,
                                   estimator.predict(input_example))

        if log_models:
            # Will only resolve `input_example` and `signature` if `log_models` is `True`.
            input_example, signature = resolve_input_example_and_signature(
                get_input_example,
                infer_model_signature,
                log_input_examples,
                log_model_signatures,
                _logger,
            )

            try_mlflow_log(
                log_model,
                estimator,
                artifact_path="model",
                signature=signature,
                input_example=input_example,
            )

        if _is_parameter_search_estimator(estimator):
            if hasattr(estimator, "best_estimator_") and log_models:
                try_mlflow_log(
                    log_model,
                    estimator.best_estimator_,
                    artifact_path="best_estimator",
                    signature=signature,
                    input_example=input_example,
                )

            if hasattr(estimator, "best_score_"):
                try_mlflow_log(mlflow.log_metric, "best_cv_score",
                               estimator.best_score_)

            if hasattr(estimator, "best_params_"):
                best_params = {
                    f"best_{param_name}": param_value
                    for param_name, param_value in estimator.best_params_.items()
                }
                try_mlflow_log(mlflow.log_params, best_params)

            if hasattr(estimator, "cv_results_"):
                try:
                    # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                    # information is consistent with the parent run
                    child_tags = context_registry.resolve_tags()
                    child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME})
                    _create_child_runs_for_parameter_search(
                        cv_estimator=estimator,
                        parent_run=mlflow.active_run(),
                        child_tags=child_tags,
                    )
                except Exception as e:
                    msg = (
                        "Encountered exception during creation of child runs for parameter search."
                        " Child runs may be missing. Exception: {}".format(
                            str(e)))
                    _logger.warning(msg)

                try:
                    cv_results_df = pd.DataFrame.from_dict(
                        estimator.cv_results_)
                    _log_parameter_search_results_as_artifact(
                        cv_results_df,
                        mlflow.active_run().info.run_id)
                except Exception as e:
                    msg = (
                        "Failed to log parameter search results as an artifact."
                        " Exception: {}".format(str(e)))
                    _logger.warning(msg)
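
A minimal usage sketch that exercises the parameter-search branch above: enable sklearn autologging, then fit a GridSearchCV inside a run.

import mlflow
import mlflow.sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

mlflow.sklearn.autolog()

X, y = load_iris(return_X_y=True)
search = GridSearchCV(SVC(), {"C": [0.1, 1.0], "kernel": ["linear", "rbf"]})

with mlflow.start_run():
    search.fit(X, y)  # logs best_cv_score, best_* params, and child runs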
Code Example #8
def start_run(run_id=None, experiment_id=None, run_name=None, nested=False):
    """
    Start a new MLflow run, setting it as the active run under which metrics and parameters
    will be logged. The return value can be used as a context manager within a ``with`` block;
    otherwise, you must call ``end_run()`` to terminate the current run.

    If you pass a ``run_id`` or the ``MLFLOW_RUN_ID`` environment variable is set,
    ``start_run`` attempts to resume a run with the specified run ID and
    other parameters are ignored. ``run_id`` takes precedence over ``MLFLOW_RUN_ID``.

    MLflow sets a variety of default tags on the run, as defined in
    :ref:`MLflow system tags <system_tags>`.

    :param run_id: If specified, get the run with the specified UUID and log parameters
                     and metrics under that run. The run's end time is unset and its status
                     is set to running, but the run's other attributes (``source_version``,
                     ``source_type``, etc.) are not changed.
    :param experiment_id: ID of the experiment under which to create the current run (applicable
                          only when ``run_id`` is not specified). If the ``experiment_id``
                          argument is unspecified, MLflow will look for a valid experiment in
                          the following order:
                          activated using ``set_experiment``, ``MLFLOW_EXPERIMENT_NAME``
                          environment variable, ``MLFLOW_EXPERIMENT_ID`` environment variable,
                          or the default experiment as defined by the tracking server.
    :param run_name: Name of new run (stored as a ``mlflow.runName`` tag).
                     Used only when ``run_id`` is unspecified.
    :param nested: Controls whether the run is nested in a parent run. ``True`` creates a nested run.
    :return: :py:class:`mlflow.ActiveRun` object that acts as a context manager wrapping
             the run's state.
    """
    global _active_run_stack
    # back compat for int experiment_id
    experiment_id = str(experiment_id) if isinstance(experiment_id,
                                                     int) else experiment_id
    if len(_active_run_stack) > 0 and not nested:
        raise Exception(
            ("Run with UUID {} is already active. To start a nested " +
             "run, call start_run with nested=True").format(
                 _active_run_stack[0].info.run_id))
    if run_id:
        existing_run_id = run_id
    elif _RUN_ID_ENV_VAR in os.environ:
        existing_run_id = os.environ[_RUN_ID_ENV_VAR]
        del os.environ[_RUN_ID_ENV_VAR]
    else:
        existing_run_id = None
    if existing_run_id:
        _validate_run_id(existing_run_id)
        active_run_obj = MlflowClient().get_run(existing_run_id)
        if active_run_obj.info.lifecycle_stage == LifecycleStage.DELETED:
            raise MlflowException(
                "Cannot start run with ID {} because it is in the "
                "deleted state.".format(existing_run_id))
    else:
        if len(_active_run_stack) > 0:
            parent_run_id = _active_run_stack[-1].info.run_id
        else:
            parent_run_id = None

        exp_id_for_run = experiment_id if experiment_id is not None else _get_experiment_id()

        user_specified_tags = {}
        if parent_run_id is not None:
            user_specified_tags[MLFLOW_PARENT_RUN_ID] = parent_run_id
        if run_name is not None:
            user_specified_tags[MLFLOW_RUN_NAME] = run_name

        tags = context_registry.resolve_tags(user_specified_tags)

        active_run_obj = MlflowClient().create_run(
            experiment_id=exp_id_for_run, tags=tags)

    _active_run_stack.append(ActiveRun(active_run_obj))
    return _active_run_stack[-1]
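
A short sketch of the resume path described in the docstring: passing run_id (or setting MLFLOW_RUN_ID) reactivates an existing run instead of creating a new one.

import mlflow

with mlflow.start_run() as run:
    mlflow.log_param("alpha", 0.5)
first_run_id = run.info.run_id

# Later, possibly from another process: resume and keep logging under the same run
with mlflow.start_run(run_id=first_run_id):
    mlflow.log_metric("rmse", 0.23)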
Code Example #9
    def _log_posttraining_metadata(estimator, *args, **kwargs):
        """
        Records metadata for a scikit-learn estimator after training has completed.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        if hasattr(estimator, "score"):
            try:
                score_args = _get_args_for_score(estimator.score,
                                                 estimator.fit, args, kwargs)
                training_score = estimator.score(*score_args)
            except Exception as e:  # pylint: disable=broad-except
                msg = (
                    estimator.score.__qualname__ +
                    " failed. The 'training_score' metric will not be recorded. Scoring error: "
                    + str(e))
                _logger.warning(msg)
            else:
                try_mlflow_log(mlflow.log_metric, "training_score",
                               training_score)

        # log common metrics and artifacts for estimators (classifier, regressor)
        _log_specialized_estimator_content(estimator,
                                           mlflow.active_run().info.run_id,
                                           args, kwargs)

        input_example = None
        signature = None
        if hasattr(estimator, "predict"):
            try:
                # Fetch an input example using the first several rows of the array-like
                # training data supplied to the training routine (e.g., `fit()`)
                SAMPLE_ROWS = 5
                fit_arg_names = _get_arg_names(estimator.fit)
                X_var_name, y_var_name = fit_arg_names[:2]
                input_example = _get_Xy(args, kwargs, X_var_name,
                                        y_var_name)[0][:SAMPLE_ROWS]

                model_output = estimator.predict(input_example)
                signature = infer_signature(input_example, model_output)
            except Exception as e:  # pylint: disable=broad-except
                input_example = None
                msg = "Failed to infer an input example and model signature: " + str(
                    e)
                _logger.warning(msg)

        try_mlflow_log(
            log_model,
            estimator,
            artifact_path="model",
            signature=signature,
            input_example=input_example,
        )

        if _is_parameter_search_estimator(estimator):
            if hasattr(estimator, "best_estimator_"):
                try_mlflow_log(
                    log_model,
                    estimator.best_estimator_,
                    artifact_path="best_estimator",
                    signature=signature,
                    input_example=input_example,
                )

            if hasattr(estimator, "best_params_"):
                best_params = {
                    f"best_{param_name}": param_value
                    for param_name, param_value in estimator.best_params_.items()
                }
                try_mlflow_log(mlflow.log_params, best_params)

            if hasattr(estimator, "cv_results_"):
                try:
                    # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                    # information is consistent with the parent run
                    environment_tags = context_registry.resolve_tags()
                    _create_child_runs_for_parameter_search(
                        cv_estimator=estimator,
                        parent_run=mlflow.active_run(),
                        child_tags=environment_tags,
                    )
                except Exception as e:  # pylint: disable=broad-except
                    msg = (
                        "Encountered exception during creation of child runs for parameter search."
                        " Child runs may be missing. Exception: {}".format(
                            str(e)))
                    _logger.warning(msg)

                try:
                    cv_results_df = pd.DataFrame.from_dict(
                        estimator.cv_results_)
                    _log_parameter_search_results_as_artifact(
                        cv_results_df,
                        mlflow.active_run().info.run_id)
                except Exception as e:  # pylint: disable=broad-except
                    msg = (
                        "Failed to log parameter search results as an artifact."
                        " Exception: {}".format(str(e)))
                    _logger.warning(msg)
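
This variant inlines signature inference rather than deferring it behind log_models; a standalone sketch of the same infer_signature call on a toy sklearn model:

import pandas as pd
from mlflow.models.signature import infer_signature
from sklearn.linear_model import LinearRegression

X = pd.DataFrame({"x1": [1.0, 2.0, 3.0], "x2": [0.1, 0.2, 0.3]})
y = [1.0, 2.0, 3.0]
model = LinearRegression().fit(X, y)

# As above: the first rows serve as the input example, predictions as the output
signature = infer_signature(X.head(5), model.predict(X.head(5)))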
Code Example #10
def start_run(run_id=None,
              experiment_id=None,
              run_name=None,
              nested=False,
              tags=None):
    """
    Start a new MLflow run, setting it as the active run under which metrics and parameters
    will be logged. The return value can be used as a context manager within a ``with`` block;
    otherwise, you must call ``end_run()`` to terminate the current run.

    If you pass a ``run_id`` or the ``MLFLOW_RUN_ID`` environment variable is set,
    ``start_run`` attempts to resume a run with the specified run ID and
    other parameters are ignored. ``run_id`` takes precedence over ``MLFLOW_RUN_ID``.

    If resuming an existing run, the run status is set to ``RunStatus.RUNNING``.

    MLflow sets a variety of default tags on the run, as defined in
    :ref:`MLflow system tags <system_tags>`.

    :param run_id: If specified, get the run with the specified UUID and log parameters
                     and metrics under that run. The run's end time is unset and its status
                     is set to running, but the run's other attributes (``source_version``,
                     ``source_type``, etc.) are not changed.
    :param experiment_id: ID of the experiment under which to create the current run (applicable
                          only when ``run_id`` is not specified). If the ``experiment_id``
                          argument is unspecified, MLflow will look for a valid experiment in
                          the following order:
                          activated using ``set_experiment``, ``MLFLOW_EXPERIMENT_NAME``
                          environment variable, ``MLFLOW_EXPERIMENT_ID`` environment variable,
                          or the default experiment as defined by the tracking server.
    :param run_name: Name of new run (stored as a ``mlflow.runName`` tag).
                     Used only when ``run_id`` is unspecified.
    :param nested: Controls whether the run is nested in a parent run. ``True`` creates a nested run.
    :param tags: An optional dictionary of string keys and values to set as tags on the new run.
    :return: :py:class:`mlflow.ActiveRun` object that acts as a context manager wrapping
             the run's state.

    .. code-block:: python
        :caption: Example

        import mlflow

        # Create nested runs
        with mlflow.start_run(run_name='PARENT_RUN') as parent_run:
            mlflow.log_param("parent", "yes")
            with mlflow.start_run(run_name='CHILD_RUN', nested=True) as child_run:
                mlflow.log_param("child", "yes")

        print("parent run_id: {}".format(parent_run.info.run_id))
        print("child run_id : {}".format(child_run.info.run_id))
        print("--")

        # Search all child runs with a parent id
        query = "tags.mlflow.parentRunId = '{}'".format(parent_run.info.run_id)
        results = mlflow.search_runs(filter_string=query)
        print(results[["run_id", "params.child", "tags.mlflow.runName"]])

    .. code-block:: text
        :caption: Output

        parent run_id: 5ec0e7ae18f54c2694ffb48c2fccf25c
        child run_id : 78b3b0d264b44cd29e8dc389749bb4be
        --
                                     run_id params.child tags.mlflow.runName
        0  78b3b0d264b44cd29e8dc389749bb4be          yes           CHILD_RUN
    """
    global _active_run_stack
    # back compat for int experiment_id
    experiment_id = str(experiment_id) if isinstance(experiment_id,
                                                     int) else experiment_id
    if len(_active_run_stack) > 0 and not nested:
        raise Exception((
            "Run with UUID {} is already active. To start a new run, first end the "
            + "current run with mlflow.end_run(). To start a nested " +
            "run, call start_run with nested=True").format(
                _active_run_stack[0].info.run_id))
    if run_id:
        existing_run_id = run_id
    elif _RUN_ID_ENV_VAR in os.environ:
        existing_run_id = os.environ[_RUN_ID_ENV_VAR]
        del os.environ[_RUN_ID_ENV_VAR]
    else:
        existing_run_id = None
    if existing_run_id:
        _validate_run_id(existing_run_id)
        active_run_obj = MlflowClient().get_run(existing_run_id)
        # Check to see if experiment_id from environment matches experiment_id from set_experiment()
        if (_active_experiment_id is not None and
                _active_experiment_id != active_run_obj.info.experiment_id):
            raise MlflowException(
                "Cannot start run with ID {} because its experiment ID does not "
                "match the active experiment ID. Make sure --experiment-name "
                "or --experiment-id matches the experiment set with "
                "set_experiment(), or just use command-line "
                "arguments".format(existing_run_id))
        # Check to see if current run isn't deleted
        if active_run_obj.info.lifecycle_stage == LifecycleStage.DELETED:
            raise MlflowException(
                "Cannot start run with ID {} because it is in the "
                "deleted state.".format(existing_run_id))
        # Use previous end_time because a value is required for update_run_info
        end_time = active_run_obj.info.end_time
        _get_store().update_run_info(existing_run_id,
                                     run_status=RunStatus.RUNNING,
                                     end_time=end_time)
        active_run_obj = MlflowClient().get_run(existing_run_id)
    else:
        if len(_active_run_stack) > 0:
            parent_run_id = _active_run_stack[-1].info.run_id
        else:
            parent_run_id = None

        exp_id_for_run = experiment_id if experiment_id is not None else _get_experiment_id()

        user_specified_tags = tags or {}
        if parent_run_id is not None:
            user_specified_tags[MLFLOW_PARENT_RUN_ID] = parent_run_id
        if run_name is not None:
            user_specified_tags[MLFLOW_RUN_NAME] = run_name

        tags = context_registry.resolve_tags(user_specified_tags)

        active_run_obj = MlflowClient().create_run(
            experiment_id=exp_id_for_run, tags=tags)

    _active_run_stack.append(ActiveRun(active_run_obj))
    return _active_run_stack[-1]
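
A short sketch of the `tags` parameter this variant adds: caller tags are merged with context-resolved tags (plus the parent-run and run-name tags that start_run sets itself).

import mlflow

with mlflow.start_run(run_name="tagged-run", tags={"team": "forecasting"}):
    mlflow.log_param("alpha", 0.5)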
Code Example #11
File: __init__.py Project: wwjiang007/mlflow
    def _log_posttraining_metadata(autologging_client, estimator, *args,
                                   **kwargs):
        """
        Records metadata for a scikit-learn estimator after training has completed.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param autologging_client: An instance of `MlflowAutologgingQueueingClient` used for
                                   efficiently logging run data to MLflow Tracking.
        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        def infer_model_signature(input_example):
            if not hasattr(estimator, "predict"):
                raise Exception(
                    "the trained model does not specify a `predict` function, "
                    + "which is required in order to infer the signature")

            return infer_signature(input_example,
                                   estimator.predict(input_example))

        (X, y_true,
         sample_weight) = _get_args_for_metrics(estimator.fit, args, kwargs)

        # log common metrics and artifacts for estimators (classifier, regressor)
        logged_metrics = _log_estimator_content(
            autologging_client=autologging_client,
            estimator=estimator,
            prefix=_TRAINING_PREFIX,
            run_id=mlflow.active_run().info.run_id,
            X=X,
            y_true=y_true,
            sample_weight=sample_weight,
        )
        if y_true is None and not logged_metrics:
            _logger.warning(
                "Training metrics will not be recorded because training labels were not specified."
                " To automatically record training metrics, provide training labels as inputs to"
                " the model training function.")

        def get_input_example():
            # Fetch an input example using the first several rows of the array-like
            # training data supplied to the training routine (e.g., `fit()`)
            input_example = X[:INPUT_EXAMPLE_SAMPLE_ROWS]
            return input_example

        if log_models:
            # Will only resolve `input_example` and `signature` if `log_models` is `True`.
            input_example, signature = resolve_input_example_and_signature(
                get_input_example,
                infer_model_signature,
                log_input_examples,
                log_model_signatures,
                _logger,
            )

            log_model(
                estimator,
                artifact_path="model",
                signature=signature,
                input_example=input_example,
            )

        if _is_parameter_search_estimator(estimator):
            if hasattr(estimator, "best_estimator_") and log_models:
                log_model(
                    estimator.best_estimator_,
                    artifact_path="best_estimator",
                    signature=signature,
                    input_example=input_example,
                )

            if hasattr(estimator, "best_score_"):
                autologging_client.log_metrics(
                    run_id=mlflow.active_run().info.run_id,
                    metrics={"best_cv_score": estimator.best_score_},
                )

            if hasattr(estimator, "best_params_"):
                best_params = {
                    f"best_{param_name}": param_value
                    for param_name, param_value in estimator.best_params_.items()
                }
                autologging_client.log_params(
                    run_id=mlflow.active_run().info.run_id,
                    params=best_params,
                )

            if hasattr(estimator, "cv_results_"):
                try:
                    # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                    # information is consistent with the parent run
                    child_tags = context_registry.resolve_tags()
                    child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME})
                    _create_child_runs_for_parameter_search(
                        autologging_client=autologging_client,
                        cv_estimator=estimator,
                        parent_run=mlflow.active_run(),
                        max_tuning_runs=max_tuning_runs,
                        child_tags=child_tags,
                    )
                except Exception as e:
                    msg = (
                        "Encountered exception during creation of child runs for parameter search."
                        " Child runs may be missing. Exception: {}".format(
                            str(e)))
                    _logger.warning(msg)

                try:
                    cv_results_df = pd.DataFrame.from_dict(
                        estimator.cv_results_)
                    _log_parameter_search_results_as_artifact(
                        cv_results_df,
                        mlflow.active_run().info.run_id)
                except Exception as e:
                    msg = (
                        "Failed to log parameter search results as an artifact."
                        " Exception: {}".format(str(e)))
                    _logger.warning(msg)
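
A hedged sketch of the knob this variant threads through: `max_tuning_runs` caps how many child runs a parameter search creates (a parameter of mlflow.sklearn.autolog in recent releases).

import mlflow.sklearn

# Keep child runs only for the best-scoring parameter maps
mlflow.sklearn.autolog(max_tuning_runs=5)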
Code Example #12
File: __init__.py Project: bkbonde/mlflow
    def _log_posttraining_metadata(estimator, spark_model, params, input_df):

        if _is_parameter_search_estimator(estimator):
            try:
                # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                # information is consistent with the parent run
                child_tags = context_registry.resolve_tags()
                child_tags.update(
                    {MLFLOW_AUTOLOGGING: AUTOLOGGING_INTEGRATION_NAME})
                _create_child_runs_for_parameter_search(
                    parent_estimator=estimator,
                    parent_model=spark_model,
                    parent_run=mlflow.active_run(),
                    child_tags=child_tags,
                )
            except Exception:
                import traceback

                msg = (
                    "Encountered exception during creation of child runs for parameter search."
                    " Child runs may be missing. Exception: {}".format(
                        traceback.format_exc()))
                _logger.warning(msg)

            estimator_param_maps = _get_tuning_param_maps(
                estimator,
                estimator._autologging_metadata.uid_to_indexed_name_map)

            metrics_dict, best_index = _get_param_search_metrics_and_best_index(
                estimator, spark_model)
            _log_parameter_search_results_as_artifact(
                estimator_param_maps, metrics_dict,
                mlflow.active_run().info.run_id)

            # Log best_param_map as JSON artifact
            best_param_map = estimator_param_maps[best_index]
            mlflow.log_dict(best_param_map,
                            artifact_file="best_parameters.json")

            # Log best_param_map as autologging parameters as well
            _log_estimator_params({
                f"best_{param_name}": param_value
                for param_name, param_value in best_param_map.items()
            })

        if log_models:
            if _should_log_model(spark_model):
                from mlflow.models import infer_signature
                from mlflow.pyspark.ml._autolog import (
                    cast_spark_df_with_vector_to_array,
                    get_feature_cols,
                )
                from mlflow.spark import _find_and_set_features_col_as_vector_if_needed
                from pyspark.sql import SparkSession

                spark = SparkSession.builder.getOrCreate()

                def _get_input_example_as_pd_df():
                    feature_cols = list(
                        get_feature_cols(input_df.schema, spark_model))
                    limited_input_df = input_df.select(feature_cols).limit(
                        INPUT_EXAMPLE_SAMPLE_ROWS)
                    return cast_spark_df_with_vector_to_array(
                        limited_input_df).toPandas()

                def _infer_model_signature(input_example_slice):
                    input_slice_df = _find_and_set_features_col_as_vector_if_needed(
                        spark.createDataFrame(input_example_slice),
                        spark_model)
                    model_output = spark_model.transform(input_slice_df).drop(
                        *input_slice_df.columns)
                    return infer_signature(input_example_slice,
                                           model_output.toPandas())

                input_example, signature = resolve_input_example_and_signature(
                    _get_input_example_as_pd_df,
                    _infer_model_signature,
                    log_input_examples,
                    log_model_signatures,
                    _logger,
                )

                mlflow.spark.log_model(
                    spark_model,
                    artifact_path="model",
                    registered_model_name=registered_model_name,
                    input_example=input_example,
                    signature=signature,
                )
                if _is_parameter_search_model(spark_model):
                    mlflow.spark.log_model(
                        spark_model.bestModel,
                        artifact_path="best_model",
                    )
            else:
                _logger.warning(
                    _get_warning_msg_for_skip_log_model(spark_model))
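
A hedged usage sketch for the model-logging branch above: request input examples and model signatures when enabling pyspark.ml autologging (both flags exist on recent versions of mlflow.pyspark.ml.autolog; verify on older releases).

import mlflow.pyspark.ml

mlflow.pyspark.ml.autolog(log_input_examples=True, log_model_signatures=True)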