def test_pmdarima_signature_and_examples_saved_correctly(auto_arima_model, test_data):
    # NB: Signature inference will only work on the first element of the tuple return
    prediction = auto_arima_model.predict(n_periods=20, return_conf_int=True, alpha=0.05)
    signature_ = infer_signature(test_data, prediction[0])
    example_ = test_data[0:5].copy(deep=False)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.pmdarima.save_model(
                    auto_arima_model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    r_example = _read_example(mlflow_model, path).copy(deep=False)
                    np.testing.assert_array_equal(r_example, example)
def export_model(run, pipe, used_columns, X_val, val_pred, export_artifact):
    # Infer the signature of the model
    # Get the columns that we are really using from the pipeline
    signature = infer_signature(X_val[used_columns], val_pred)

    with tempfile.TemporaryDirectory() as temp_dir:
        export_path = os.path.join(temp_dir, "model_export")

        mlflow.sklearn.save_model(
            pipe,
            export_path,
            serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
            signature=signature,
            input_example=X_val.iloc[:2],
        )

        artifact = wandb.Artifact(
            export_artifact,
            type="model_export",
            description="Random Forest pipeline export",
        )
        artifact.add_dir(export_path)
        run.log_artifact(artifact)

        # Make sure the artifact is uploaded before the temp dir gets deleted
        artifact.wait()
def test_signature_and_examples_are_saved_correctly(sklearn_knn_model, iris_data):
    data = iris_data
    signature_ = infer_signature(*data)
    example_ = data[0][:3, ]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                with open(tmp.path("skmodel"), "wb") as f:
                    pickle.dump(sklearn_knn_model, f)
                path = tmp.path("model")
                mlflow.pyfunc.save_model(
                    path=path,
                    data_path=tmp.path("skmodel"),
                    loader_module=os.path.basename(__file__)[:-3],
                    code_path=[__file__],
                    signature=signature,
                    input_example=example,
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_pmdarima_signature_and_example_for_confidence_interval_mode(
    auto_arima_model, model_path, test_data, use_signature, use_example
):
    model_path_primary = model_path.joinpath("primary")
    model_path_secondary = model_path.joinpath("secondary")
    mlflow.pmdarima.save_model(pmdarima_model=auto_arima_model, path=model_path_primary)
    loaded_pyfunc = mlflow.pyfunc.load_model(model_uri=model_path_primary)
    predict_conf = pd.DataFrame([{"n_periods": 10, "return_conf_int": True, "alpha": 0.2}])
    forecast = loaded_pyfunc.predict(predict_conf)
    signature = infer_signature(test_data["orders"], forecast) if use_signature else None
    example = test_data[0:10].copy(deep=False) if use_example else None
    mlflow.pmdarima.save_model(
        auto_arima_model, path=model_path_secondary, signature=signature, input_example=example
    )
    mlflow_model = Model.load(model_path_secondary)
    assert signature == mlflow_model.signature
    if example is None:
        assert mlflow_model.saved_input_example_info is None
    else:
        r_example = _read_example(mlflow_model, model_path_secondary).copy(deep=False)
        np.testing.assert_array_equal(r_example, example)
def test_signature_and_examples_saved_correctly(prophet_model):
    data = prophet_model.data
    model = prophet_model.model
    horizon_df = future_horizon_df(model, FORECAST_HORIZON)
    signature_ = infer_signature(data, model.predict(horizon_df))
    example_ = data[0:5].copy(deep=False)
    example_["y"] = pd.to_numeric(example_["y"])  # cast to appropriate precision
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.prophet.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    r_example = _read_example(mlflow_model, path).copy(deep=False)
                    r_example["ds"] = pd.to_datetime(r_example["ds"], format=DS_FORMAT)
                    np.testing.assert_array_equal(r_example, example)
def test_signature_and_examples_are_saved_correctly(onnx_model, data, onnx_custom_env):
    import mlflow.onnx

    model = onnx_model
    signature_ = infer_signature(*data)
    example_ = data[0].head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.onnx.save_model(
                    model,
                    path=path,
                    conda_env=onnx_custom_env,
                    signature=signature,
                    input_example=example,
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_signature_and_examples_are_saved_correctly(iris_data, main_scoped_model_class):
    def test_predict(sk_model, model_input):
        return sk_model.predict(model_input) * 2

    data = iris_data
    signature_ = infer_signature(*data)
    example_ = data[0][:3, ]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.pyfunc.save_model(
                    path=path,
                    artifacts={},
                    python_model=main_scoped_model_class(test_predict),
                    signature=signature,
                    input_example=example,
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert np.array_equal(_read_example(mlflow_model, path), example)
def test_pmdarima_signature_and_example_for_confidence_interval_mode(
    auto_arima_model, model_path, test_data
):
    mlflow.pmdarima.save_model(pmdarima_model=auto_arima_model, path=model_path)
    loaded_pyfunc = mlflow.pyfunc.load_model(model_uri=model_path)
    predict_conf = pd.DataFrame([{"n_periods": 10, "return_conf_int": True, "alpha": 0.2}])
    forecast = loaded_pyfunc.predict(predict_conf)
    signature_ = infer_signature(test_data["orders"], forecast)
    example_ = test_data[0:10].copy(deep=False)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.pmdarima.save_model(
                    auto_arima_model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    r_example = _read_example(mlflow_model, path).copy(deep=False)
                    np.testing.assert_array_equal(r_example, example)
def after_pipeline_run(
    self,
    run_params: Dict[str, Any],
    pipeline: Pipeline,
    catalog: DataCatalog,
) -> None:
    """Hook to be invoked after a pipeline runs.

    Args:
        run_params: The params needed for the given run. Should be identical
            to the data logged by Journal.
            # @fixme: this needs to be modelled explicitly as code, instead of comment
            Schema: {
                "run_id": str,
                "project_path": str,
                "env": str,
                "kedro_version": str,
                "tags": Optional[List[str]],
                "from_nodes": Optional[List[str]],
                "to_nodes": Optional[List[str]],
                "node_names": Optional[List[str]],
                "from_inputs": Optional[List[str]],
                "load_versions": Optional[List[str]],
                "pipeline_name": str,
                "extra_params": Optional[Dict[str, Any]],
            }
        pipeline: The ``Pipeline`` that was run.
        catalog: The ``DataCatalog`` used during the run.
    """
    if self._is_mlflow_enabled:
        if isinstance(pipeline, PipelineML):
            with TemporaryDirectory() as tmp_dir:
                # This folder is removed at the end of the context manager,
                # so we need to log to mlflow before removing the folder.
                pipeline_catalog = pipeline._extract_pipeline_catalog(catalog)
                artifacts = pipeline.extract_pipeline_artifacts(
                    pipeline_catalog, temp_folder=Path(tmp_dir)
                )
                if pipeline.model_signature == "auto":
                    input_data = pipeline_catalog.load(pipeline.input_name)
                    model_signature = infer_signature(model_input=input_data)
                else:
                    model_signature = pipeline.model_signature
                mlflow.pyfunc.log_model(
                    artifact_path=pipeline.model_name,
                    python_model=KedroPipelineModel(
                        pipeline_ml=pipeline,
                        catalog=pipeline_catalog,
                        **pipeline.kwargs,
                    ),
                    artifacts=artifacts,
                    conda_env=_format_conda_env(pipeline.conda_env),
                    signature=model_signature,
                )
        # Close the mlflow active run at the end of the pipeline to avoid
        # interactions with further runs
        mlflow.end_run()
    else:
        switch_catalog_logging(catalog, True)
def infer_model_signature(input_example):
    if not hasattr(estimator, "predict"):
        raise Exception(
            "the trained model does not specify a `predict` function, "
            + "which is required in order to infer the signature"
        )
    return infer_signature(input_example, estimator.predict(input_example))
def _infer_model_signature(input_example_slice):
    input_slice_df = _find_and_set_features_col_as_vector_if_needed(
        spark.createDataFrame(input_example_slice), spark_model
    )
    model_output = spark_model.transform(input_slice_df).drop(*input_slice_df.columns)
    return infer_signature(input_example_slice, model_output.toPandas())
def test_parse_with_schema(pandas_df_with_all_types):
    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    json_str = json.dumps(df.to_dict(orient="split"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="split", schema=schema)
    json_str = json.dumps(df.to_dict(orient="records"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="records", schema=schema)
    assert schema == infer_signature(df[schema.input_names()]).inputs

    # The current behavior of pandas JSON parsing with type hints is surprising. In some
    # cases the types are forced, ignoring overflow and loss of precision:
    bad_df = """{
      "columns":["bad_integer", "bad_float", "bad_string", "bad_boolean"],
      "data":[
        [9007199254740991.0, 1.1, 1, 1.5],
        [9007199254740992.0, 9007199254740992.0, 2, 0],
        [9007199254740994.0, 3.3, 3, "some arbitrary string"]
      ]
    }"""
    schema = Schema([
        ColSpec("integer", "bad_integer"),
        ColSpec("float", "bad_float"),
        ColSpec("float", "good_float"),
        ColSpec("string", "bad_string"),
        ColSpec("boolean", "bad_boolean"),
    ])
    df = pyfunc_scoring_server.parse_json_input(bad_df, orient="split", schema=schema)

    # Unfortunately, the current behavior of the pandas parse is to force numbers to
    # int32 even if they don't fit:
    assert df["bad_integer"].dtype == np.int32
    assert all(df["bad_integer"] == [-2147483648, -2147483648, -2147483648])

    # The same goes for floats:
    assert df["bad_float"].dtype == np.float32
    assert all(df["bad_float"] == np.array([1.1, 9007199254740992, 3.3], dtype=np.float32))

    # However, bad_string is recognized as int64:
    assert all(df["bad_string"] == np.array([1, 2, 3], dtype=np.object))

    # Boolean is forced - zero and the empty string are false, everything else is true:
    assert df["bad_boolean"].dtype == np.bool
    assert all(df["bad_boolean"] == [True, False, True])
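# A minimal, standalone illustration (not part of the test above; the DataFrame and its
# column names are made up) of the property the first assertion relies on: calling
# infer_signature on a plain pandas DataFrame yields a Schema of named ColSpecs that can
# be compared directly against a hand-built Schema.
import pandas as pd
from mlflow.models.signature import infer_signature

df = pd.DataFrame({"a": [1, 2], "b": [0.5, 1.5], "c": ["x", "y"]})
sig = infer_signature(df)
print(sig.inputs)  # a Schema of ColSpecs, roughly: ['a': long, 'b': double, 'c': string]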
def test_signature_and_examples_are_saved_correctly(xgb_model):
    model = xgb_model.model
    for signature in (None, infer_signature(xgb_model.inference_dataframe)):
        for example in (None, xgb_model.inference_dataframe.head(3)):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.xgboost.save_model(
                    xgb_model=model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_signature_and_examples_are_saved_correctly(h2o_iris_model):
    model = h2o_iris_model.model
    signature_ = infer_signature(h2o_iris_model.inference_data.as_data_frame())
    example_ = h2o_iris_model.inference_data.as_data_frame().head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.h2o.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_signature_and_examples_are_saved_correctly(model, data):
    signature_ = infer_signature(*data)
    example_ = data[0].head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.keras.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_model_export_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                sparkm.save_model(
                    spark_model_iris.model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_signature_and_examples_are_saved_correctly():
    model, _, X = ols_model()
    signature_ = infer_signature(X)
    example_ = X[0:3, :]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.statsmodels.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert np.array_equal(_read_example(mlflow_model, path), example)
def test_model_export_with_schema_and_examples(spacy_model_with_data):
    spacy_model = spacy_model_with_data.model
    signature_ = infer_signature(spacy_model_with_data.inference_data)
    example_ = spacy_model_with_data.inference_data.head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            print(signature is None, example is None)
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.spacy.save_model(
                    spacy_model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def generate_report(
    self,
    model: OpenstfRegressor,
) -> Report:
    """Generate a report on a given model.

    Args:
        model (OpenstfRegressor): the model to create a report on

    Returns:
        Report: reporter object containing info about the model
    """
    # Get the training (input_data_list[0]) and validation (input_data_list[1]) sets
    train_x, train_y = (
        self.input_data_list[0].iloc[:, 1:-1],
        self.input_data_list[0].iloc[:, 0],
    )
    valid_x, valid_y = (
        self.input_data_list[1].iloc[:, 1:-1],
        self.input_data_list[1].iloc[:, 0],
    )

    data_series_figures = self._make_data_series_figures(model)

    # feature_importance_dataframe must be a dataframe in order to create a figure;
    # it can be None if the model has no feature importance.
    if isinstance(model.feature_importance_dataframe, pd.DataFrame):
        feature_importance_figure = figure.plot_feature_importance(
            model.feature_importance_dataframe
        )
    # If it isn't a dataframe, set feature_importance_figure to None so no figure is created.
    else:
        feature_importance_figure = None

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        report = Report(
            data_series_figures=data_series_figures,
            feature_importance_figure=feature_importance_figure,
            metrics=self.get_metrics(model.predict(valid_x), valid_y),
            signature=infer_signature(train_x, train_y),
        )

    return report
def test_signature_and_examples_are_saved_correctly(gluon_model, model_data):
    model = gluon_model
    signature_ = infer_signature(model_data[0].asnumpy())
    example_ = model_data[0].asnumpy()[:3, ]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.gluon.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_diviner_signature_and_examples_saved_correctly(
    grouped_prophet, diviner_data, model_path, use_signature, use_example
):
    prediction = grouped_prophet.forecast(horizon=20, frequency="D")
    signature = infer_signature(diviner_data.df, prediction) if use_signature else None
    example = diviner_data.df[0:5].copy(deep=False) if use_example else None
    mlflow.diviner.save_model(
        grouped_prophet, path=model_path, signature=signature, input_example=example
    )
    mlflow_model = Model.load(model_path)
    assert signature == mlflow_model.signature
    if example is None:
        assert mlflow_model.saved_input_example_info is None
    else:
        r_example = _read_example(mlflow_model, model_path).copy(deep=False)
        # NB: datetime values are implicitly cast, so this needs to be reverted.
        r_example["ds"] = pd.to_datetime(r_example["ds"], format=DS_FORMAT)
        np.testing.assert_array_equal(r_example, example)
def test_signature_and_examples_are_saved_correctly(sklearn_knn_model):
    data = sklearn_knn_model.inference_data
    model = sklearn_knn_model.model
    signature_ = infer_signature(data)
    example_ = data[:3, ]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.sklearn.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_pmdarima_signature_and_examples_saved_correctly(
    auto_arima_model, test_data, model_path, use_signature, use_example
):
    # NB: Signature inference will only work on the first element of the tuple return
    prediction = auto_arima_model.predict(n_periods=20, return_conf_int=True, alpha=0.05)
    signature = infer_signature(test_data, prediction[0]) if use_signature else None
    example = test_data[0:5].copy(deep=False) if use_example else None
    mlflow.pmdarima.save_model(
        auto_arima_model, path=model_path, signature=signature, input_example=example
    )
    mlflow_model = Model.load(model_path)
    assert signature == mlflow_model.signature
    if example is None:
        assert mlflow_model.saved_input_example_info is None
    else:
        r_example = _read_example(mlflow_model, model_path).copy(deep=False)
        np.testing.assert_array_equal(r_example, example)
def test_log_model_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
    artifact_path = "model"
    for signature in (None, signature_):
        for example in (None, example_):
            with mlflow.start_run():
                sparkm.log_model(
                    spark_model_iris.model,
                    artifact_path=artifact_path,
                    signature=signature,
                    input_example=example,
                )
                artifact_uri = mlflow.get_artifact_uri()
                model_path = os.path.join(artifact_uri, artifact_path)
                mlflow_model = Model.load(model_path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, model_path) == example).all())
def test_signature_and_examples_are_saved_correctly(ols_model):
    model = ols_model.model
    X = ols_model.inference_dataframe
    signature_ = infer_signature(X)
    example_ = X[0:3, :]
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.statsmodels.save_model(
                    model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test_schema_and_examples_are_save_correctly(saved_tf_iris_model):
    train_x, train_y = iris_data_utils.load_data()[0]
    X = pd.DataFrame(train_x)
    y = pd.Series(train_y)
    for signature in (None, infer_signature(X, y)):
        for example in (None, X.head(3)):
            with TempDir() as tmp:
                path = tmp.path("model")
                mlflow.tensorflow.save_model(
                    tf_saved_model_dir=saved_tf_iris_model.path,
                    tf_meta_graph_tags=saved_tf_iris_model.meta_graph_tags,
                    tf_signature_def_key=saved_tf_iris_model.signature_def_key,
                    path=path,
                    signature=signature,
                    input_example=example,
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def infer_model_signature(input_example):
    model_output = model.predict(input_example)
    model_signature = infer_signature(input_example, model_output)
    return model_signature
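# A minimal, self-contained sketch (not taken from any snippet above; the dataset, model,
# and save path are assumptions) of the pattern the two small helpers above feed into:
# predict on a small sample, infer the signature from input and output, and pass both the
# signature and the input example to a flavor's save call.
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True, as_frame=True)
model = LogisticRegression(max_iter=200).fit(X, y)

# Use a small slice of the training data as the input example.
input_example = X.head(5)
signature = infer_signature(input_example, model.predict(input_example))

# Save the model together with its signature and input example (path is illustrative).
mlflow.sklearn.save_model(
    model, path="model_dir", signature=signature, input_example=input_example
)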
def train(*args, **kwargs):
    def record_eval_results(eval_results):
        """
        Create a callback function that records evaluation results.
        """

        def callback(env):
            eval_results.append(dict(env.evaluation_result_list))

        return callback

    if not mlflow.active_run():
        try_mlflow_log(mlflow.start_run)
        auto_end_run = True
    else:
        auto_end_run = False

    def log_feature_importance_plot(features, importance, importance_type):
        """
        Log feature importance plot.
        """
        import matplotlib.pyplot as plt

        features = np.array(features)
        importance = np.array(importance)
        indices = np.argsort(importance)
        features = features[indices]
        importance = importance[indices]
        num_features = len(features)

        # If num_features > 10, increase the figure height to prevent the plot
        # from being too dense.
        w, h = [6.4, 4.8]  # matplotlib's default figure size
        h = h + 0.1 * num_features if num_features > 10 else h
        fig, ax = plt.subplots(figsize=(w, h))

        yloc = np.arange(num_features)
        ax.barh(yloc, importance, align="center", height=0.5)
        ax.set_yticks(yloc)
        ax.set_yticklabels(features)
        ax.set_xlabel("Importance")
        ax.set_title("Feature Importance ({})".format(importance_type))
        fig.tight_layout()

        tmpdir = tempfile.mkdtemp()
        try:
            # pylint: disable=undefined-loop-variable
            filepath = os.path.join(tmpdir, "feature_importance_{}.png".format(imp_type))
            fig.savefig(filepath)
            try_mlflow_log(mlflow.log_artifact, filepath)
        finally:
            plt.close(fig)
            shutil.rmtree(tmpdir)

    original = gorilla.get_original_attribute(xgboost, "train")

    # logging booster params separately via mlflow.log_params to extract key/value pairs
    # and make it easier to compare them across runs.
    params = args[0] if len(args) > 0 else kwargs["params"]
    try_mlflow_log(mlflow.log_params, params)

    unlogged_params = [
        "params",
        "dtrain",
        "evals",
        "obj",
        "feval",
        "evals_result",
        "xgb_model",
        "callbacks",
        "learning_rates",
    ]
    log_fn_args_as_params(original, args, kwargs, unlogged_params)

    all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505
    num_pos_args = len(args)

    # adding a callback that records evaluation results.
    eval_results = []
    callbacks_index = all_arg_names.index("callbacks")
    callback = record_eval_results(eval_results)
    if num_pos_args >= callbacks_index + 1:
        tmp_list = list(args)
        tmp_list[callbacks_index] += [callback]
        args = tuple(tmp_list)
    elif "callbacks" in kwargs and kwargs["callbacks"] is not None:
        kwargs["callbacks"] += [callback]
    else:
        kwargs["callbacks"] = [callback]

    # training model
    model = original(*args, **kwargs)

    # logging metrics on each iteration.
    for idx, metrics in enumerate(eval_results):
        try_mlflow_log(mlflow.log_metrics, metrics, step=idx)

    # If early_stopping_rounds is present, logging metrics at the best iteration
    # as extra metrics with the max step + 1.
    early_stopping_index = all_arg_names.index("early_stopping_rounds")
    early_stopping = (
        num_pos_args >= early_stopping_index + 1 or "early_stopping_rounds" in kwargs
    )
    if early_stopping:
        extra_step = len(eval_results)
        try_mlflow_log(mlflow.log_metric, "stopped_iteration", len(eval_results) - 1)
        try_mlflow_log(mlflow.log_metric, "best_iteration", model.best_iteration)
        try_mlflow_log(mlflow.log_metrics, eval_results[model.best_iteration], step=extra_step)

    # logging feature importance as artifacts.
    for imp_type in importance_types:
        imp = None
        try:
            imp = model.get_score(importance_type=imp_type)
            features, importance = zip(*imp.items())
            log_feature_importance_plot(features, importance, imp_type)
        except Exception:  # pylint: disable=broad-except
            _logger.exception(
                "Failed to log feature importance plot. XGBoost autologging "
                "will ignore the failure and continue. Exception: "
            )

        if imp is not None:
            tmpdir = tempfile.mkdtemp()
            try:
                filepath = os.path.join(tmpdir, "feature_importance_{}.json".format(imp_type))
                with open(filepath, "w") as f:
                    json.dump(imp, f)
                try_mlflow_log(mlflow.log_artifact, filepath)
            finally:
                shutil.rmtree(tmpdir)

    # dtrain must exist as the original train function already ran successfully
    dtrain = args[1] if len(args) > 1 else kwargs.get("dtrain")

    input_example = None
    signature = None
    try:
        # it is possible that the dataset was constructed before the patched
        # constructor was applied, so we cannot assume the input_example_info exists
        input_example_info = getattr(dtrain, "input_example_info", None)

        if input_example_info is None:
            raise Exception(
                "please ensure that autologging is "
                + "enabled before constructing the dataset."
            )

        input_example = input_example_info.input_example
        if input_example is None:
            # input example collection failed
            raise Exception(input_example_info.error_msg)

        model_output = model.predict(xgboost.DMatrix(input_example))
        signature = infer_signature(input_example, model_output)
    except Exception as e:  # pylint: disable=broad-except
        input_example = None
        msg = "Failed to gather example input and model signature: " + str(e)
        _logger.warning(msg)

    try_mlflow_log(
        log_model,
        model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
    )

    if auto_end_run:
        try_mlflow_log(mlflow.end_run)
    return model
model_uri = mlflow.get_artifact_uri(artifact_path)
flavor_conf = Model.load(model_uri).flavors["catboost"]
assert "save_format" in flavor_conf
assert flavor_conf["save_format"] == save_format

if save_format in SUPPORTS_DESERIALIZATION:
    mlflow.catboost.load_model(model_uri)
else:
    with pytest.raises(cb.CatBoostError, match="deserialization not supported or missing"):
        mlflow.catboost.load_model(model_uri)


@pytest.mark.large
@pytest.mark.parametrize("signature", [None, infer_signature(get_iris()[0])])
@pytest.mark.parametrize("input_example", [None, get_iris()[0].head(3)])
def test_signature_and_examples_are_saved_correctly(reg_model, model_path, signature, input_example):
    mlflow.catboost.save_model(
        reg_model.model, model_path, signature=signature, input_example=input_example
    )
    mlflow_model = Model.load(model_path)
    assert signature == mlflow_model.signature
    if input_example is None:
        assert mlflow_model.saved_input_example_info is None
    else:
        pd.testing.assert_frame_equal(_read_example(mlflow_model, model_path), input_example)
def _log_posttraining_metadata(estimator, *args, **kwargs):
    """
    Records metadata for a scikit-learn estimator after training has completed.
    This is intended to be invoked within a patched scikit-learn training routine
    (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
    MLflow run that can be referenced via the fluent Tracking API.

    :param estimator: The scikit-learn estimator for which to log metadata.
    :param args: The arguments passed to the scikit-learn training routine
                 (e.g., `fit()`, `fit_transform()`, ...).
    :param kwargs: The keyword arguments passed to the scikit-learn training routine.
    """
    if hasattr(estimator, "score"):
        try:
            score_args = _get_args_for_score(estimator.score, estimator.fit, args, kwargs)
            training_score = estimator.score(*score_args)
        except Exception as e:  # pylint: disable=broad-except
            msg = (
                estimator.score.__qualname__
                + " failed. The 'training_score' metric will not be recorded. Scoring error: "
                + str(e)
            )
            _logger.warning(msg)
        else:
            try_mlflow_log(mlflow.log_metric, "training_score", training_score)

    # log common metrics and artifacts for estimators (classifier, regressor)
    _log_specialized_estimator_content(estimator, mlflow.active_run().info.run_id, args, kwargs)

    input_example = None
    signature = None
    if hasattr(estimator, "predict"):
        try:
            # Fetch an input example using the first several rows of the array-like
            # training data supplied to the training routine (e.g., `fit()`)
            SAMPLE_ROWS = 5
            fit_arg_names = _get_arg_names(estimator.fit)
            X_var_name, y_var_name = fit_arg_names[:2]
            input_example = _get_Xy(args, kwargs, X_var_name, y_var_name)[0][:SAMPLE_ROWS]

            model_output = estimator.predict(input_example)
            signature = infer_signature(input_example, model_output)
        except Exception as e:  # pylint: disable=broad-except
            input_example = None
            msg = "Failed to infer an input example and model signature: " + str(e)
            _logger.warning(msg)

    try_mlflow_log(
        log_model,
        estimator,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
    )

    if _is_parameter_search_estimator(estimator):
        if hasattr(estimator, "best_estimator_"):
            try_mlflow_log(
                log_model,
                estimator.best_estimator_,
                artifact_path="best_estimator",
                signature=signature,
                input_example=input_example,
            )

        if hasattr(estimator, "best_params_"):
            best_params = {
                "best_{param_name}".format(param_name=param_name): param_value
                for param_name, param_value in estimator.best_params_.items()
            }
            try_mlflow_log(mlflow.log_params, best_params)

        if hasattr(estimator, "cv_results_"):
            try:
                # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                # information is consistent with the parent run
                environment_tags = context_registry.resolve_tags()
                _create_child_runs_for_parameter_search(
                    cv_estimator=estimator,
                    parent_run=mlflow.active_run(),
                    child_tags=environment_tags,
                )
            except Exception as e:  # pylint: disable=broad-except
                msg = (
                    "Encountered exception during creation of child runs for parameter search."
                    " Child runs may be missing. Exception: {}".format(str(e))
                )
                _logger.warning(msg)

            try:
                cv_results_df = pd.DataFrame.from_dict(estimator.cv_results_)
                _log_parameter_search_results_as_artifact(
                    cv_results_df, mlflow.active_run().info.run_id
                )
            except Exception as e:  # pylint: disable=broad-except
                msg = (
                    "Failed to log parameter search results as an artifact."
                    " Exception: {}".format(str(e))
                )
                _logger.warning(msg)