Example 1

def test_pmdarima_signature_and_examples_saved_correctly(
        auto_arima_model, test_data):

    # NB: Signature inference will only work on the first element of the tuple return
    prediction = auto_arima_model.predict(n_periods=20,
                                          return_conf_int=True,
                                          alpha=0.05)
    signature_ = infer_signature(test_data, prediction[0])
    example_ = test_data[0:5].copy(deep=False)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.pmdarima.save_model(auto_arima_model,
                                           path=path,
                                           signature=signature,
                                           input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    r_example = _read_example(mlflow_model,
                                              path).copy(deep=False)
                    np.testing.assert_array_equal(r_example, example)
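
As the NB comment notes, predict(..., return_conf_int=True) returns a (forecast, conf_int) tuple, and only the forecast array is a supported output type for signature inference. A minimal standalone sketch (toy stand-in data, no pmdarima fitting):

import numpy as np
import pandas as pd
from mlflow.models.signature import infer_signature

test_data = pd.DataFrame({"orders": np.random.rand(100)})
# stand-ins for auto_arima_model.predict(n_periods=20, return_conf_int=True)
forecast, conf_int = np.random.rand(20), np.random.rand(20, 2)

signature = infer_signature(test_data, forecast)  # works: forecast alone
print(signature)
# infer_signature(test_data, (forecast, conf_int)) would raise, since a tuple
# is not a supported model-output type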
Example 2
def export_model(run, pipe, used_columns, X_val, val_pred, export_artifact):

    # Infer the model signature, using only the columns the pipeline
    # actually consumes
    signature = infer_signature(X_val[used_columns], val_pred)

    with tempfile.TemporaryDirectory() as temp_dir:

        export_path = os.path.join(temp_dir, "model_export")

        mlflow.sklearn.save_model(
            pipe,
            export_path,
            serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
            signature=signature,
            input_example=X_val.iloc[:2],
        )

        artifact = wandb.Artifact(
            export_artifact,
            type="model_export",
            description="Random Forest pipeline export",
        )
        artifact.add_dir(export_path)

        run.log_artifact(artifact)

        # Make sure the artifact is uploaded before the temp dir
        # gets deleted
        artifact.wait()
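
A hedged, self-contained sketch of how export_model might be invoked; the wandb project and artifact names are illustrative, not taken from the original repository:

import wandb
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

X, y = load_iris(return_X_y=True, as_frame=True)
used_columns = list(X.columns)
pipe = Pipeline([("rf", RandomForestClassifier(n_estimators=10))])
pipe.fit(X[used_columns], y)

run = wandb.init(project="demo", job_type="export")
export_model(run, pipe, used_columns, X, pipe.predict(X[used_columns]), "rf_export")
run.finish()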
Example 3
def test_signature_and_examples_are_saved_correctly(sklearn_knn_model,
                                                    iris_data):
    data = iris_data
    signature_ = infer_signature(*data)
    example_ = data[0][:3, ]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                with open(tmp.path("skmodel"), "wb") as f:
                    pickle.dump(sklearn_knn_model, f)
                path = tmp.path("model")
                mlflow.pyfunc.save_model(
                    path=path,
                    data_path=tmp.path("skmodel"),
                    loader_module=os.path.basename(__file__)[:-3],
                    code_path=[__file__],
                    signature=signature,
                    input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model,
                                              path) == example).all())
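
The loader_module argument points the saved model back at the test module itself, so that module must expose mlflow's pyfunc loader hook. A minimal sketch of what such a module provides:

import pickle

def _load_pyfunc(data_path):
    # mlflow.pyfunc imports the loader module and calls this hook with the
    # saved data_path (here, the pickled "skmodel" file)
    with open(data_path, "rb") as f:
        return pickle.load(f)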
Example 4
def test_pmdarima_signature_and_example_for_confidence_interval_mode(
        auto_arima_model, model_path, test_data, use_signature, use_example):
    model_path_primary = model_path.joinpath("primary")
    model_path_secondary = model_path.joinpath("secondary")
    mlflow.pmdarima.save_model(pmdarima_model=auto_arima_model,
                               path=model_path_primary)
    loaded_pyfunc = mlflow.pyfunc.load_model(model_uri=model_path_primary)
    predict_conf = pd.DataFrame([{
        "n_periods": 10,
        "return_conf_int": True,
        "alpha": 0.2
    }])
    forecast = loaded_pyfunc.predict(predict_conf)
    signature = infer_signature(test_data["orders"],
                                forecast) if use_signature else None
    example = test_data[0:10].copy(deep=False) if use_example else None
    mlflow.pmdarima.save_model(auto_arima_model,
                               path=model_path_secondary,
                               signature=signature,
                               input_example=example)
    mlflow_model = Model.load(model_path_secondary)
    assert signature == mlflow_model.signature
    if example is None:
        assert mlflow_model.saved_input_example_info is None
    else:
        r_example = _read_example(mlflow_model,
                                  model_path_secondary).copy(deep=False)
        np.testing.assert_array_equal(r_example, example)
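
The single-row predict_conf DataFrame is the pmdarima pyfunc flavor's convention for passing predict keyword arguments through the generic pyfunc interface; a sketch of the shape involved (values illustrative):

import pandas as pd

# one row; column names map onto pmdarima predict() keyword arguments
predict_conf = pd.DataFrame([{"n_periods": 10, "return_conf_int": True, "alpha": 0.2}])

# forecast = loaded_pyfunc.predict(predict_conf)
# with return_conf_int=True the flavor returns a DataFrame containing the
# forecast together with its lower and upper confidence bounds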
Example 5
def test_signature_and_examples_saved_correctly(prophet_model):
    data = prophet_model.data
    model = prophet_model.model
    horizon_df = future_horizon_df(model, FORECAST_HORIZON)
    signature_ = infer_signature(data, model.predict(horizon_df))
    example_ = data[0:5].copy(deep=False)
    example_["y"] = pd.to_numeric(
        example_["y"])  # cast to appropriate precision
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.prophet.save_model(model,
                                          path=path,
                                          signature=signature,
                                          input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    r_example = _read_example(mlflow_model,
                                              path).copy(deep=False)
                    r_example["ds"] = pd.to_datetime(r_example["ds"],
                                                     format=DS_FORMAT)
                    np.testing.assert_array_equal(r_example, example)
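
The re-parse of "ds" compensates for input examples being stored as JSON: datetime values come back as strings. A self-contained illustration of the round trip, assuming DS_FORMAT is an ISO-style format string:

import pandas as pd

example = pd.DataFrame({"ds": pd.to_datetime(["2023-01-01", "2023-01-02"]),
                        "y": [1.0, 2.0]})

# storing the example as JSON stringifies the datetime column ...
stringified = example.assign(ds=example["ds"].dt.strftime("%Y-%m-%dT%H:%M:%S"))
# ... so it must be re-parsed before comparing against the original
restored = stringified.assign(ds=pd.to_datetime(stringified["ds"],
                                                format="%Y-%m-%dT%H:%M:%S"))
pd.testing.assert_frame_equal(restored, example)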
Example 6
def test_signature_and_examples_are_saved_correctly(onnx_model, data,
                                                    onnx_custom_env):
    import mlflow.onnx

    model = onnx_model
    signature_ = infer_signature(*data)
    example_ = data[0].head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.onnx.save_model(
                    model,
                    path=path,
                    conda_env=onnx_custom_env,
                    signature=signature,
                    input_example=example,
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model,
                                              path) == example).all())
Example 7

def test_signature_and_examples_are_saved_correctly(iris_data, main_scoped_model_class):
    def test_predict(sk_model, model_input):
        return sk_model.predict(model_input) * 2

    data = iris_data
    signature_ = infer_signature(*data)
    example_ = data[0][
        :3,
    ]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.pyfunc.save_model(
                    path=path,
                    artifacts={},
                    python_model=main_scoped_model_class(test_predict),
                    signature=signature,
                    input_example=example,
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert np.array_equal(_read_example(mlflow_model, path), example)
Example 8

def test_pmdarima_signature_and_example_for_confidence_interval_mode(
        auto_arima_model, model_path, test_data):

    mlflow.pmdarima.save_model(pmdarima_model=auto_arima_model,
                               path=model_path)
    loaded_pyfunc = mlflow.pyfunc.load_model(model_uri=model_path)

    predict_conf = pd.DataFrame([{
        "n_periods": 10,
        "return_conf_int": True,
        "alpha": 0.2
    }])
    forecast = loaded_pyfunc.predict(predict_conf)

    signature_ = infer_signature(test_data["orders"], forecast)
    example_ = test_data[0:10].copy(deep=False)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.pmdarima.save_model(auto_arima_model,
                                           path=path,
                                           signature=signature,
                                           input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    r_example = _read_example(mlflow_model,
                                              path).copy(deep=False)
                    np.testing.assert_array_equal(r_example, example)
Example 9
    def after_pipeline_run(
        self,
        run_params: Dict[str, Any],
        pipeline: Pipeline,
        catalog: DataCatalog,
    ) -> None:
        """Hook to be invoked after a pipeline runs.
        Args:
            run_params: The params needed for the given run.
                Should be identical to the data logged by Journal.
                # @fixme: this needs to be modelled explicitly as code, instead of comment
                Schema: {
                    "run_id": str,
                    "project_path": str,
                    "env": str,
                    "kedro_version": str,
                    "tags": Optional[List[str]],
                    "from_nodes": Optional[List[str]],
                    "to_nodes": Optional[List[str]],
                    "node_names": Optional[List[str]],
                    "from_inputs": Optional[List[str]],
                    "load_versions": Optional[List[str]],
                    "pipeline_name": str,
                    "extra_params": Optional[Dict[str, Any]],
                }
            pipeline: The ``Pipeline`` that was run.
            catalog: The ``DataCatalog`` used during the run.
        """
        if self._is_mlflow_enabled:
            if isinstance(pipeline, PipelineML):
                with TemporaryDirectory() as tmp_dir:
                    # This will be removed at the end of the context manager,
                    # but we need to log to mlflow before removing the folder
                    pipeline_catalog = pipeline._extract_pipeline_catalog(catalog)
                    artifacts = pipeline.extract_pipeline_artifacts(
                        pipeline_catalog, temp_folder=Path(tmp_dir)
                    )

                    if pipeline.model_signature == "auto":
                        input_data = pipeline_catalog.load(pipeline.input_name)
                        model_signature = infer_signature(model_input=input_data)
                    else:
                        model_signature = pipeline.model_signature

                    mlflow.pyfunc.log_model(
                        artifact_path=pipeline.model_name,
                        python_model=KedroPipelineModel(
                            pipeline_ml=pipeline,
                            catalog=pipeline_catalog,
                            **pipeline.kwargs,
                        ),
                        artifacts=artifacts,
                        conda_env=_format_conda_env(pipeline.conda_env),
                        signature=model_signature,
                    )
            # Close the mlflow active run at the end of the pipeline to avoid interactions with further runs
            mlflow.end_run()
        else:
            switch_catalog_logging(catalog, True)
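
The @fixme note in the docstring asks for the run_params schema to be modelled as code rather than as a comment; a hedged sketch doing that with a TypedDict (illustrative, not part of kedro or kedro-mlflow):

from typing import Any, Dict, List, Optional, TypedDict

class RunParams(TypedDict):
    run_id: str
    project_path: str
    env: str
    kedro_version: str
    tags: Optional[List[str]]
    from_nodes: Optional[List[str]]
    to_nodes: Optional[List[str]]
    node_names: Optional[List[str]]
    from_inputs: Optional[List[str]]
    load_versions: Optional[List[str]]
    pipeline_name: str
    extra_params: Optional[Dict[str, Any]]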
Example 10
        def infer_model_signature(input_example):
            if not hasattr(estimator, "predict"):
                raise Exception(
                    "the trained model does not specify a `predict` function, "
                    + "which is required in order to infer the signature")

            return infer_signature(input_example,
                                   estimator.predict(input_example))
Example 11
def _infer_model_signature(input_example_slice):
    input_slice_df = _find_and_set_features_col_as_vector_if_needed(
        spark.createDataFrame(input_example_slice), spark_model)
    model_output = spark_model.transform(input_slice_df).drop(
        *input_slice_df.columns)
    return infer_signature(input_example_slice, model_output.toPandas())
Example 12
def test_parse_with_schema(pandas_df_with_all_types):
    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    json_str = json.dumps(df.to_dict(orient="split"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str,
                                                orient="split",
                                                schema=schema)
    json_str = json.dumps(df.to_dict(orient="records"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str,
                                                orient="records",
                                                schema=schema)
    assert schema == infer_signature(df[schema.input_names()]).inputs

    # The current behavior of pandas JSON parsing with type hints is odd. In some
    # cases, the types are forced, ignoring overflow and loss of precision:

    bad_df = """{
      "columns":["bad_integer", "bad_float", "bad_string", "bad_boolean"],
      "data":[
        [9007199254740991.0, 1.1,                1, 1.5],
        [9007199254740992.0, 9007199254740992.0, 2, 0],
        [9007199254740994.0, 3.3,                3, "some arbitrary string"]
      ]
    }"""
    schema = Schema([
        ColSpec("integer", "bad_integer"),
        ColSpec("float", "bad_float"),
        ColSpec("float", "good_float"),
        ColSpec("string", "bad_string"),
        ColSpec("boolean", "bad_boolean"),
    ])
    df = pyfunc_scoring_server.parse_json_input(bad_df,
                                                orient="split",
                                                schema=schema)
    # Unfortunately, the current behavior of pandas parse is to force numbers to int32 even if
    # they don't fit:
    assert df["bad_integer"].dtype == np.int32
    assert all(df["bad_integer"] == [-2147483648, -2147483648, -2147483648])

    # The same goes for floats:
    assert df["bad_float"].dtype == np.float32
    assert all(df["bad_float"] == np.array([1.1, 9007199254740992, 3.3],
                                           dtype=np.float32))
    # However bad string is recognized as int64:
    assert all(df["bad_string"] == np.array([1, 2, 3], dtype=np.object))

    # Boolean is forced - zero and empty string is false, everything else is true:
    assert df["bad_boolean"].dtype == np.bool
    assert all(df["bad_boolean"] == [True, False, True])
Example 13
def test_signature_and_examples_are_saved_correctly(xgb_model):
    model = xgb_model.model
    for signature in (None, infer_signature(xgb_model.inference_dataframe)):
        for example in (None, xgb_model.inference_dataframe.head(3)):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.xgboost.save_model(
                    xgb_model=model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
Example 14
def test_signature_and_examples_are_saved_correctly(h2o_iris_model):
    model = h2o_iris_model.model
    signature_ = infer_signature(h2o_iris_model.inference_data.as_data_frame())
    example_ = h2o_iris_model.inference_data.as_data_frame().head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.h2o.save_model(model, path=path, signature=signature, input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
Example 15
def test_signature_and_examples_are_saved_correctly(model, data):
    signature_ = infer_signature(*data)
    example_ = data[0].head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.keras.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
Example 16
def test_model_export_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                sparkm.save_model(
                    spark_model_iris.model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
Example 17
def test_signature_and_examples_are_saved_correctly():
    model, _, X = ols_model()
    signature_ = infer_signature(X)
    example_ = X[0:3, :]

    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.statsmodels.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert np.array_equal(_read_example(mlflow_model, path), example)
Example 18
def test_model_export_with_schema_and_examples(spacy_model_with_data):
    spacy_model = spacy_model_with_data.model
    signature_ = infer_signature(spacy_model_with_data.inference_data)
    example_ = spacy_model_with_data.inference_data.head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            print(signature is None, example is None)
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.spacy.save_model(
                    spacy_model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
Example 19
    def generate_report(
        self,
        model: OpenstfRegressor,
    ) -> Report:
        """Generate a report on a given model

        Args:
            model (OpenstfRegressor): the model to create a report on

        Returns:
            Report: reporter object containing info about the model
        """
        # Get training (input_data_list[0]) and validation (input_data_list[1]) set
        train_x, train_y = (
            self.input_data_list[0].iloc[:, 1:-1],
            self.input_data_list[0].iloc[:, 0],
        )
        valid_x, valid_y = (
            self.input_data_list[1].iloc[:, 1:-1],
            self.input_data_list[1].iloc[:, 0],
        )

        data_series_figures = self._make_data_series_figures(model)

        # feature_importance_dataframe must be a DataFrame to create the figure;
        # it can be None if the model exposes no feature importances
        if isinstance(model.feature_importance_dataframe, pd.DataFrame):
            feature_importance_figure = figure.plot_feature_importance(
                model.feature_importance_dataframe)
        # If it isn't a DataFrame, set feature_importance_figure to None so that
        # no figure is created
        else:
            feature_importance_figure = None

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            report = Report(
                data_series_figures=data_series_figures,
                feature_importance_figure=feature_importance_figure,
                metrics=self.get_metrics(model.predict(valid_x), valid_y),
                signature=infer_signature(train_x, train_y),
            )

        return report
Example 20

def test_signature_and_examples_are_saved_correctly(gluon_model, model_data):
    model = gluon_model
    signature_ = infer_signature(model_data[0].asnumpy())
    example_ = model_data[0].asnumpy()[:3, ]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.gluon.save_model(model,
                                        path=path,
                                        signature=signature,
                                        input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model,
                                              path) == example).all())
Example 21
def test_diviner_signature_and_examples_saved_correctly(
    grouped_prophet, diviner_data, model_path, use_signature, use_example
):

    prediction = grouped_prophet.forecast(horizon=20, frequency="D")
    signature = infer_signature(diviner_data.df, prediction) if use_signature else None
    example = diviner_data.df[0:5].copy(deep=False) if use_example else None
    mlflow.diviner.save_model(
        grouped_prophet, path=model_path, signature=signature, input_example=example
    )
    mlflow_model = Model.load(model_path)
    assert signature == mlflow_model.signature
    if example is None:
        assert mlflow_model.saved_input_example_info is None
    else:
        r_example = _read_example(mlflow_model, model_path).copy(deep=False)
        # NB: datetime values are implicitly cast, so this needs to be reverted.
        r_example["ds"] = pd.to_datetime(r_example["ds"], format=DS_FORMAT)
        np.testing.assert_array_equal(r_example, example)
Example 22
def test_signature_and_examples_are_saved_correctly(sklearn_knn_model):
    data = sklearn_knn_model.inference_data
    model = sklearn_knn_model.model
    signature_ = infer_signature(data)
    example_ = data[:3, ]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.sklearn.save_model(model,
                                          path=path,
                                          signature=signature,
                                          input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model,
                                              path) == example).all())
Example 23
def test_pmdarima_signature_and_examples_saved_correctly(
        auto_arima_model, test_data, model_path, use_signature, use_example):

    # NB: Signature inference will only work on the first element of the tuple return
    prediction = auto_arima_model.predict(n_periods=20,
                                          return_conf_int=True,
                                          alpha=0.05)
    signature = infer_signature(test_data,
                                prediction[0]) if use_signature else None
    example = test_data[0:5].copy(deep=False) if use_example else None
    mlflow.pmdarima.save_model(auto_arima_model,
                               path=model_path,
                               signature=signature,
                               input_example=example)
    mlflow_model = Model.load(model_path)
    assert signature == mlflow_model.signature
    if example is None:
        assert mlflow_model.saved_input_example_info is None
    else:
        r_example = _read_example(mlflow_model, model_path).copy(deep=False)
        np.testing.assert_array_equal(r_example, example)
Example 24
def test_log_model_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
    artifact_path = "model"
    for signature in (None, signature_):
        for example in (None, example_):
            with mlflow.start_run():
                sparkm.log_model(spark_model_iris.model,
                                 artifact_path=artifact_path,
                                 signature=signature,
                                 input_example=example)
                artifact_uri = mlflow.get_artifact_uri()
                model_path = os.path.join(artifact_uri, artifact_path)
                mlflow_model = Model.load(model_path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model,
                                              model_path) == example).all())
Example 25

def test_signature_and_examples_are_saved_correctly(ols_model):
    model = ols_model.model
    X = ols_model.inference_dataframe
    signature_ = infer_signature(X)
    example_ = X[0:3, :]

    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.statsmodels.save_model(model,
                                              path=path,
                                              signature=signature,
                                              input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model,
                                              path) == example).all())
Example 26

def test_schema_and_examples_are_save_correctly(saved_tf_iris_model):
    train_x, train_y = iris_data_utils.load_data()[0]
    X = pd.DataFrame(train_x)
    y = pd.Series(train_y)
    for signature in (None, infer_signature(X, y)):
        for example in (None, X.head(3)):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.tensorflow.save_model(
                    tf_saved_model_dir=saved_tf_iris_model.path,
                    tf_meta_graph_tags=saved_tf_iris_model.meta_graph_tags,
                    tf_signature_def_key=saved_tf_iris_model.signature_def_key,
                    path=path,
                    signature=signature,
                    input_example=example,
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
Example 27
def infer_model_signature(input_example):
    model_output = model.predict(input_example)
    model_signature = infer_signature(input_example, model_output)
    return model_signature
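
A hedged usage sketch for the helper above: it reads a module-level model variable, so any fitted estimator with a predict method will do (assumes the helper and infer_signature live in the same module):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True, as_frame=True)
model = LogisticRegression(max_iter=1000).fit(X, y)  # the name the helper reads

print(infer_model_signature(X.head(5)))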
Example 28
    def train(*args, **kwargs):
        def record_eval_results(eval_results):
            """
            Create a callback function that records evaluation results.
            """
            def callback(env):
                eval_results.append(dict(env.evaluation_result_list))

            return callback

        if not mlflow.active_run():
            try_mlflow_log(mlflow.start_run)
            auto_end_run = True
        else:
            auto_end_run = False

        def log_feature_importance_plot(features, importance, importance_type):
            """
            Log feature importance plot.
            """
            import matplotlib.pyplot as plt

            features = np.array(features)
            importance = np.array(importance)
            indices = np.argsort(importance)
            features = features[indices]
            importance = importance[indices]
            num_features = len(features)

            # If num_features > 10, increase the figure height to prevent the plot
            # from being too dense.
            w, h = [6.4, 4.8]  # matplotlib's default figure size
            h = h + 0.1 * num_features if num_features > 10 else h
            fig, ax = plt.subplots(figsize=(w, h))

            yloc = np.arange(num_features)
            ax.barh(yloc, importance, align="center", height=0.5)
            ax.set_yticks(yloc)
            ax.set_yticklabels(features)
            ax.set_xlabel("Importance")
            ax.set_title("Feature Importance ({})".format(importance_type))
            fig.tight_layout()

            tmpdir = tempfile.mkdtemp()
            try:
                # use the function's own parameter, not the enclosing loop variable
                filepath = os.path.join(
                    tmpdir, "feature_importance_{}.png".format(importance_type))
                fig.savefig(filepath)
                try_mlflow_log(mlflow.log_artifact, filepath)
            finally:
                plt.close(fig)
                shutil.rmtree(tmpdir)

        original = gorilla.get_original_attribute(xgboost, "train")

        # logging booster params separately via mlflow.log_params to extract key/value pairs
        # and make it easier to compare them across runs.
        params = args[0] if len(args) > 0 else kwargs["params"]
        try_mlflow_log(mlflow.log_params, params)

        unlogged_params = [
            "params",
            "dtrain",
            "evals",
            "obj",
            "feval",
            "evals_result",
            "xgb_model",
            "callbacks",
            "learning_rates",
        ]
        log_fn_args_as_params(original, args, kwargs, unlogged_params)

        all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505
        num_pos_args = len(args)

        # adding a callback that records evaluation results.
        eval_results = []
        callbacks_index = all_arg_names.index("callbacks")
        callback = record_eval_results(eval_results)
        if num_pos_args >= callbacks_index + 1:
            tmp_list = list(args)
            tmp_list[callbacks_index] += [callback]
            args = tuple(tmp_list)
        elif "callbacks" in kwargs and kwargs["callbacks"] is not None:
            kwargs["callbacks"] += [callback]
        else:
            kwargs["callbacks"] = [callback]

        # training model
        model = original(*args, **kwargs)

        # logging metrics on each iteration.
        for idx, metrics in enumerate(eval_results):
            try_mlflow_log(mlflow.log_metrics, metrics, step=idx)

        # If early_stopping_rounds is present, logging metrics at the best iteration
        # as extra metrics with the max step + 1.
        early_stopping_index = all_arg_names.index("early_stopping_rounds")
        early_stopping = (num_pos_args >= early_stopping_index + 1
                          or "early_stopping_rounds" in kwargs)
        if early_stopping:
            extra_step = len(eval_results)
            try_mlflow_log(mlflow.log_metric, "stopped_iteration",
                           len(eval_results) - 1)
            try_mlflow_log(mlflow.log_metric, "best_iteration",
                           model.best_iteration)
            try_mlflow_log(mlflow.log_metrics,
                           eval_results[model.best_iteration],
                           step=extra_step)

        # logging feature importance as artifacts.
        for imp_type in importance_types:
            imp = None
            try:
                imp = model.get_score(importance_type=imp_type)
                features, importance = zip(*imp.items())
                log_feature_importance_plot(features, importance, imp_type)
            except Exception:  # pylint: disable=broad-except
                _logger.exception(
                    "Failed to log feature importance plot. XGBoost autologging "
                    "will ignore the failure and continue. Exception: ")

            if imp is not None:
                tmpdir = tempfile.mkdtemp()
                try:
                    filepath = os.path.join(
                        tmpdir, "feature_importance_{}.json".format(imp_type))
                    with open(filepath, "w") as f:
                        json.dump(imp, f)
                    try_mlflow_log(mlflow.log_artifact, filepath)
                finally:
                    shutil.rmtree(tmpdir)

        # dtrain must exist as the original train function already ran successfully
        dtrain = args[1] if len(args) > 1 else kwargs.get("dtrain")

        input_example = None
        signature = None
        try:
            # it is possible that the dataset was constructed before the patched
            #   constructor was applied, so we cannot assume the input_example_info exists
            input_example_info = getattr(dtrain, "input_example_info", None)

            if input_example_info is None:
                raise Exception("please ensure that autologging is " +
                                "enabled before constructing the dataset.")

            input_example = input_example_info.input_example
            if input_example is None:
                # input example collection failed
                raise Exception(input_example_info.error_msg)

            model_output = model.predict(xgboost.DMatrix(input_example))
            signature = infer_signature(input_example, model_output)
        except Exception as e:  # pylint: disable=broad-except
            input_example = None
            msg = "Failed to gather example input and model signature: " + str(
                e)
            _logger.warning(msg)

        try_mlflow_log(
            log_model,
            model,
            artifact_path="model",
            signature=signature,
            input_example=input_example,
        )

        if auto_end_run:
            try_mlflow_log(mlflow.end_run)
        return model
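
A standalone sketch of the record_eval_results callback pattern used above, written against the old-style function callbacks this autologging code targets (pre-TrainingCallback xgboost, roughly < 1.3; newer versions require callback objects):

import numpy as np
import xgboost as xgb

eval_results = []

def callback(env):
    # env.evaluation_result_list holds (name, value) pairs for each eval set
    eval_results.append(dict(env.evaluation_result_list))

X = np.random.rand(100, 4)
y = (X.sum(axis=1) > 2).astype(int)
dtrain = xgb.DMatrix(X, label=y)

xgb.train({"objective": "binary:logistic"}, dtrain, num_boost_round=5,
          evals=[(dtrain, "train")], callbacks=[callback])
print(eval_results)  # one metrics dict per boosting iteration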
Example 29

        model_uri = mlflow.get_artifact_uri(artifact_path)

    flavor_conf = Model.load(model_uri).flavors["catboost"]
    assert "save_format" in flavor_conf
    assert flavor_conf["save_format"] == save_format

    if save_format in SUPPORTS_DESERIALIZATION:
        mlflow.catboost.load_model(model_uri)
    else:
        with pytest.raises(cb.CatBoostError,
                           match="deserialization not supported or missing"):
            mlflow.catboost.load_model(model_uri)


@pytest.mark.large
@pytest.mark.parametrize("signature", [None, infer_signature(get_iris()[0])])
@pytest.mark.parametrize("input_example", [None, get_iris()[0].head(3)])
def test_signature_and_examples_are_saved_correctly(reg_model, model_path,
                                                    signature, input_example):
    mlflow.catboost.save_model(reg_model.model,
                               model_path,
                               signature=signature,
                               input_example=input_example)
    mlflow_model = Model.load(model_path)
    assert signature == mlflow_model.signature
    if input_example is None:
        assert mlflow_model.saved_input_example_info is None
    else:
        pd.testing.assert_frame_equal(_read_example(mlflow_model, model_path),
                                      input_example)
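
get_iris() is not shown in this excerpt; a plausible hedged definition, in the style of mlflow's catboost tests (which use only the first two iris feature columns):

import pandas as pd
from sklearn import datasets

def get_iris():
    iris = datasets.load_iris()
    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
    y = pd.Series(iris.target)
    return X, y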
Example 30
    def _log_posttraining_metadata(estimator, *args, **kwargs):
        """
        Records metadata for a scikit-learn estimator after training has completed.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        if hasattr(estimator, "score"):
            try:
                score_args = _get_args_for_score(estimator.score,
                                                 estimator.fit, args, kwargs)
                training_score = estimator.score(*score_args)
            except Exception as e:  # pylint: disable=broad-except
                msg = (
                    estimator.score.__qualname__ +
                    " failed. The 'training_score' metric will not be recorded. Scoring error: "
                    + str(e))
                _logger.warning(msg)
            else:
                try_mlflow_log(mlflow.log_metric, "training_score",
                               training_score)

        # log common metrics and artifacts for estimators (classifier, regressor)
        _log_specialized_estimator_content(estimator,
                                           mlflow.active_run().info.run_id,
                                           args, kwargs)

        input_example = None
        signature = None
        if hasattr(estimator, "predict"):
            try:
                # Fetch an input example using the first several rows of the array-like
                # training data supplied to the training routine (e.g., `fit()`)
                SAMPLE_ROWS = 5
                fit_arg_names = _get_arg_names(estimator.fit)
                X_var_name, y_var_name = fit_arg_names[:2]
                input_example = _get_Xy(args, kwargs, X_var_name,
                                        y_var_name)[0][:SAMPLE_ROWS]

                model_output = estimator.predict(input_example)
                signature = infer_signature(input_example, model_output)
            except Exception as e:  # pylint: disable=broad-except
                input_example = None
                msg = "Failed to infer an input example and model signature: " + str(
                    e)
                _logger.warning(msg)

        try_mlflow_log(
            log_model,
            estimator,
            artifact_path="model",
            signature=signature,
            input_example=input_example,
        )

        if _is_parameter_search_estimator(estimator):
            if hasattr(estimator, "best_estimator_"):
                try_mlflow_log(
                    log_model,
                    estimator.best_estimator_,
                    artifact_path="best_estimator",
                    signature=signature,
                    input_example=input_example,
                )

            if hasattr(estimator, "best_params_"):
                best_params = {
                    "best_{param_name}".format(param_name=param_name):
                    param_value
                    for param_name, param_value in
                    estimator.best_params_.items()
                }
                try_mlflow_log(mlflow.log_params, best_params)

            if hasattr(estimator, "cv_results_"):
                try:
                    # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                    # information is consistent with the parent run
                    environment_tags = context_registry.resolve_tags()
                    _create_child_runs_for_parameter_search(
                        cv_estimator=estimator,
                        parent_run=mlflow.active_run(),
                        child_tags=environment_tags,
                    )
                except Exception as e:  # pylint: disable=broad-except

                    msg = (
                        "Encountered exception during creation of child runs for parameter search."
                        " Child runs may be missing. Exception: {}".format(
                            str(e)))
                    _logger.warning(msg)

                try:
                    cv_results_df = pd.DataFrame.from_dict(
                        estimator.cv_results_)
                    _log_parameter_search_results_as_artifact(
                        cv_results_df,
                        mlflow.active_run().info.run_id)
                except Exception as e:  # pylint: disable=broad-except

                    msg = (
                        "Failed to log parameter search results as an artifact."
                        " Exception: {}".format(str(e)))
                    _logger.warning(msg)