def test_signature_inference_infers_input_and_output_as_expected():
    sig0 = infer_signature(np.array([1]))
    assert sig0.inputs is not None
    assert sig0.outputs is None
    sig1 = infer_signature(np.array([1]), np.array([1]))
    assert sig1.inputs == sig0.inputs
    assert sig1.outputs == sig0.inputs
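# For orientation (a sketch, not part of the test above): what infer_signature
# returns for a bare 1-D ndarray. The exact repr varies by MLflow version --
# older releases infer an unnamed column schema, newer ones a tensor spec.
import numpy as np
from mlflow.models.signature import infer_signature

sig = infer_signature(np.array([1]))
print(sig.inputs)   # input schema inferred from the array dtype
print(sig.outputs)  # None -- no outputs were passed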
def test_input_examples(pandas_df_with_all_types):
    sig = infer_signature(pandas_df_with_all_types)
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()
        # the frame read without schema should match except for the binary values
        assert (
            parsed_df.drop(columns=["binary"])
            == _dataframe_from_json(tmp.path(filename)).drop(columns=["binary"])
        ).all().all()

    # pass the input as dictionary instead
    with TempDir() as tmp:
        d = {name: pandas_df_with_all_types[name].values
             for name in pandas_df_with_all_types.columns}
        example = _Example(d)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename), sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()

    # input passed as numpy array
    sig = infer_signature(pandas_df_with_all_types.values)
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types.values)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("data",))
        parsed_ary = _dataframe_from_json(tmp.path(filename), schema=sig.inputs).values
        assert (pandas_df_with_all_types.values == parsed_ary).all().all()

    # pass multidimensional array
    with TempDir() as tmp:
        example = np.array([[[1, 2, 3]]])
        with pytest.raises(TensorsNotSupportedException):
            _Example(example)

    # pass multidimensional array
    with TempDir() as tmp:
        example = np.array([[1, 2, 3]])
        with pytest.raises(TensorsNotSupportedException):
            _Example({"x": example, "y": example})

    # pass dict with scalars
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename))
        assert example == parsed_df.to_dict(orient="records")[0]
def train_model(params):
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
    mlflow.xgboost.autolog()
    with mlflow.start_run(nested=True):
        train = xgb.DMatrix(data=X_train, label=y_train)
        test = xgb.DMatrix(data=X_test, label=y_test)
        # Pass in the test set so xgb can track an evaluation metric. XGBoost terminates
        # training when the evaluation metric is no longer improving.
        booster = xgb.train(params=params, dtrain=train, num_boost_round=1000,
                            evals=[(test, "test")], early_stopping_rounds=50)
        predictions_test = booster.predict(test)
        auc_score = roc_auc_score(y_test, predictions_test)
        mlflow.log_metric('auc', auc_score)

        signature = infer_signature(X_train, booster.predict(train))
        mlflow.xgboost.log_model(booster, "model", signature=signature)

        # log_artifact expects a local file path, so persist the DMatrices first
        # rather than passing the objects directly
        train.save_binary("train.dmatrix")
        test.save_binary("test.dmatrix")
        mlflow.log_artifact("train.dmatrix")
        mlflow.log_artifact("test.dmatrix")

        # Set the loss to -1*auc_score so fmin maximizes the auc_score
        return {'status': STATUS_OK, 'loss': -1 * auc_score, 'booster': booster.attributes()}
def train_model(params):
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
    mlflow.xgboost.autolog()
    with mlflow.start_run(nested=True):
        train = xgb.DMatrix(data=X_train, label=y_train)
        test = xgb.DMatrix(data=X_test, label=y_test)
        # Train
        booster = xgb.train(params=params, dtrain=train, num_boost_round=20,
                            evals=[(test, "test")], early_stopping_rounds=10)
        # Evaluate on test set
        predictions_test = booster.predict(test)
        # Calculate AUC, F1 & log values
        auc_score = roc_auc_score(y_test, predictions_test)
        mlflow.log_metric('auc', auc_score)
        f1_score_ = f1_eval(predictions_test, test)
        mlflow.log_metric('F1', f1_score_)
        # Log model signature (for traceability)
        signature = infer_signature(X_train, booster.predict(train))
        mlflow.xgboost.log_model(booster, "model", signature=signature)
        # Add tag for searchability
        mlflow.set_tag("model", model_name)
        # Set the loss to -1*auc_score so fmin maximizes the auc_score
        return {'status': STATUS_OK, 'loss': -1 * auc_score, 'booster': booster.attributes()}
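# A minimal driver sketch for the train_model objective above, assuming the
# hyperopt package (implied by STATUS_OK and the fmin comment). The search-space
# parameter names and ranges here are illustrative assumptions, not taken from
# the original snippet. fmin minimizes the returned 'loss', which is why
# train_model returns -1 * auc_score.
from hyperopt import fmin, tpe, hp, Trials

search_space = {
    'max_depth': hp.quniform('max_depth', 2, 10, 1),          # assumed range
    'learning_rate': hp.loguniform('learning_rate', -5, 0),   # assumed range
}
trials = Trials()
best_params = fmin(fn=train_model, space=search_space,
                   algo=tpe.suggest, max_evals=16, trials=trials)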
def model_signature(dataset: tuple) -> ModelSignature:
    X, y = dataset
    signature = infer_signature(X, y)
    # NB: reaches into the private ColSpec attribute to rename the inferred columns
    signature.inputs.inputs[0]._name = "foo"
    signature.outputs.inputs[0]._name = "bar"
    return signature
def convert_sklearn_mlflow(clf, x_sample):
    signature = infer_signature(x_sample, clf.predict(x_sample))
    # Build an input example from the first row of the sample frame
    input_example = {}
    for i in x_sample.columns:
        input_example[i] = x_sample[i][0]
    mlflow.sklearn.save_model(clf, "best_model", signature=signature,
                              input_example=input_example)
def _get_signature_and_example(self, model_name, inputs, model):
    model_config = self.config.models[model_name]
    if model_config.input in inputs:
        input_example_el = get_first_element(inputs[model_config.input])
        signature = infer_signature(input_example_el, model.predict(input_example_el))
    else:
        input_example_el = None
        signature = None
    return signature, input_example_el
def test_signature_inference_infers_datetime_types_as_expected():
    col_name = "datetime_col"
    test_datetime = np.datetime64("2021-01-01")
    test_series = pd.Series(pd.to_datetime([test_datetime]))
    test_df = test_series.to_frame(col_name)

    signature = infer_signature(test_series)
    assert signature.inputs == Schema([ColSpec(DataType.datetime)])

    signature = infer_signature(test_df)
    assert signature.inputs == Schema([ColSpec(DataType.datetime, name=col_name)])

    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.range(1).selectExpr(
        "current_timestamp() as timestamp", "current_date() as date"
    )
    signature = infer_signature(spark_df)
    assert signature.inputs == Schema(
        [ColSpec(DataType.datetime, name="timestamp"), ColSpec(DataType.datetime, name="date")]
    )
def test_input_examples_with_nan(df_with_nan, dict_of_ndarrays_with_nans):
    # test setting example with data frame with NaN values in it
    sig = infer_signature(df_with_nan)
    with TempDir() as tmp:
        example = _Example(df_with_nan)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)

        # by definition of NaN, NaN == NaN is False but NaN != NaN is True
        assert (
            ((df_with_nan == parsed_df) | ((df_with_nan != df_with_nan) & (parsed_df != parsed_df)))
            .all()
            .all()
        )

        # the frame read without schema should match except for the binary values
        no_schema_df = _dataframe_from_json(tmp.path(filename))
        a = parsed_df.drop(columns=["binary"])
        b = no_schema_df.drop(columns=["binary"])
        assert ((a == b) | ((a != a) & (b != b))).all().all()

    # pass multidimensional array
    for col in dict_of_ndarrays_with_nans:
        input_example = dict_of_ndarrays_with_nans[col]
        sig = infer_signature(input_example)
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename), schema=sig.inputs)
            assert np.array_equal(parsed_ary, input_example, equal_nan=True)

            # without a schema/dtype specified, the resulting tensor will keep the None type
            no_schema_df = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(
                no_schema_df, np.where(np.isnan(input_example), None, input_example)
            )
def main(name):
    iris = load_iris()
    sk_model = tree.DecisionTreeClassifier()
    sk_model = sk_model.fit(iris.data, iris.target)
    predictions = sk_model.predict(iris.data[0:5])
    signature = infer_signature(iris.data, predictions)

    # Log model params
    mlflow.log_param("criterion", sk_model.criterion)
    mlflow.log_param("splitter", sk_model.splitter)

    # Log model and register the model with signature
    mlflow.sklearn.log_model(sk_model=sk_model, artifact_path="sklearn-cls-model",
                             registered_model_name=name, signature=signature)
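# A minimal follow-up sketch (assumption, not from the original script): once
# registered under `name`, the model can be loaded back by registry name and
# version through the models:/ URI scheme, and the logged signature travels
# with it through the pyfunc flavor.
loaded = mlflow.pyfunc.load_model(f"models:/{name}/1")
print(loaded.predict(iris.data[0:5]))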
def train(trainer, experiment_name, version="1", *args, **kwargs):
    owd = os.getcwd()
    os.chdir(Paths().root_dir)
    mlflow.set_experiment(experiment_name)
    timestamp = time.strftime("%Y%m%d%H%M")
    run_name = f"{experiment_name}_{timestamp}"
    with mlflow.start_run(run_name=run_name):
        run_uuid = mlflow.active_run().info.run_uuid
        logging.info(f"MLflow Run ID: {run_uuid}")
        trainer.fit(*args, **kwargs)
        # Get training params
        params = trainer.get_params()
        # Log parameters
        mlflow.log_params(params)
        # calculate metrics
        metrics = {}
        for metric in trainer.metrics:
            metrics[metric] = trainer.history[metric][-1]
            metrics[f"val_{metric}"] = trainer.history[f"val_{metric}"][-1]
        metrics["loss"] = trainer.history["loss"][-1]
        metrics["val_loss"] = trainer.history["val_loss"][-1]
        # log metrics
        mlflow.log_metrics(metrics)
        # log model
        model = trainer.model.model
        model_name = trainer.model.name
        X_train = trainer.X_train
        y_pred = trainer.model.model.predict(X_train)
        signature = infer_signature(X_train, y_pred)
        mlflow.keras.log_model(model, model_name, signature=signature)

        models_path = Paths().model / "models"
        if not models_path.exists():
            models_path.mkdir()
        model_path = models_path / model_name / version
        model.save(model_path)
        logging.info(f"Model exported at {model_path}.")
    os.chdir(owd)
def test_autolog_logs_signature_and_input_example(data_type):
    mlflow.sklearn.autolog(log_input_example=True, log_model_signature=True)
    X, y = get_iris()
    X = data_type(X)
    y = data_type(y)
    model = sklearn.linear_model.LinearRegression()
    with mlflow.start_run() as run:
        model.fit(X, y)
        model_path = os.path.join(run.info.artifact_uri, MODEL_DIR)

    model_conf = get_model_conf(run.info.artifact_uri)
    input_example = _read_example(model_conf, model_path)
    pyfunc_model = mlflow.pyfunc.load_model(model_path)

    assert model_conf.signature == infer_signature(X, model.predict(X[:5]))
    np.testing.assert_array_equal(pyfunc_model.predict(input_example), model.predict(X[:5]))
def mlflow_register(dfXy, model_dict: dict, stats: dict, mlflow_pars: dict):
    log("#### Using mlflow #########################################################")
    # def register(run_name, params, metrics, signature, model_class, tracking_uri="sqlite:///local.db"):
    from run_mlflow import register
    from mlflow.models.signature import infer_signature

    train_signature = dfXy[model_dict['data_pars']['cols_model']]
    y_signature = dfXy[model_dict['data_pars']['coly']]
    signature = infer_signature(train_signature, y_signature)

    register(run_name=model_dict['global_pars']['config_name'],
             params=model_dict['global_pars'],
             metrics=stats["metrics_test"],
             signature=signature,
             model_class=model_dict['model_pars']["model_class"],
             tracking_uri=mlflow_pars.get('tracking_db', "sqlite:///mlflow_local.db"))
def train(data):
    df = pd.read_csv(data)
    train, test = train_test_split(df, test_size=0.1, shuffle=True, random_state=42)
    X_train = train.text
    X_test = test.text
    xgboost = xgboost_model()

    with mlflow.start_run(run_name="xgboost_experiment") as run:
        tic = time.time()
        model_path = os.path.join('models', run.info.run_id)
        xgboost.fit(X_train, train['label'])
        duration_training = time.time() - tic

        mlflow.pyfunc.save_model(path=model_path, python_model=xgboost)
        loaded_model = mlflow.pyfunc.load_model(model_path)  # load_pyfunc is deprecated

        tic = time.time()
        model_output = loaded_model.predict(X_test)
        # acc = accuracy_score(test['label'], [i['label'] for i in model_output])
        class_report = classification_report(test['label'], model_output['label'],
                                             output_dict=True)
        print(model_path)
        duration_prediction = time.time() - tic

        mlflow.log_metric("Training Time", duration_training)
        mlflow.log_metric("Predict Time", duration_prediction)
        # confusion_matrices = confusion_matrix(test['label'], model_output['label'])
        mlflow.log_metric("accuracy_score", class_report['accuracy'])
        mlflow.log_metric('precision', class_report['weighted avg']['precision'])
        mlflow.log_metric("recall", class_report['weighted avg']['recall'])
        mlflow.log_param('input', data)

        signature = infer_signature(X_train, loaded_model.predict(X_train))
        # The model is a pyfunc wrapper, not an sklearn estimator, so log it
        # with the pyfunc flavor rather than mlflow.sklearn
        mlflow.pyfunc.log_model("model", python_model=xgboost, signature=signature)
def build_model(algorithm, sample_size):
    with mlflow.start_run():
        train_df = pd.read_csv(f'datasets/processed/aws/train_{sample_size}.csv')
        test_df = pd.read_csv(f'datasets/processed/aws/test_{sample_size}.csv')

        mlflow.log_param("algorithm", algorithm)
        mlflow.log_param("use_tokenizer", False)
        mlflow.log_param("remove_stop_words", False)
        mlflow.log_param("sample_size", sample_size)

        model = modelUtils.build_model(algorithm)
        model.fit(train_df['text'], train_df['label'])
        y_pred = model.predict(test_df['text'])

        signature = infer_signature(train_df['text'], y_pred)
        mlflow.sklearn.log_model(model, algorithm, signature=signature)
        mlflow.log_metric("f1_score", f1_score(test_df['label'], y_pred, average='micro'))
def fit_model(model_feature_lookups, n_iter=10):
    with mlflow.start_run():
        training_set = fs.create_training_set(outputDF, model_feature_lookups,
                                              label=label, exclude_columns=key)

        # Convert to pandas DataFrame
        training_pd = training_set.load_df().toPandas()
        X = training_pd.drop(label, axis=1)
        y = training_pd[label]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Add weights given class imbalance
        damage_weight = 1.0 / y_train.sum()
        healthy_weight = 1.0 / (len(y) - y_train.sum())
        # NB: the weight must depend on each row's label, not on damage_weight itself
        sample_weight = y_train.map(lambda damaged: damage_weight if damaged else healthy_weight)

        # Not attempting to tune the model at all for purposes here
        gb_classifier = GradientBoostingClassifier(n_iter_no_change=n_iter)

        # Encode categorical cols (if any)
        # encoders = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), X.columns[X.dtypes == 'object'])])
        pipeline = Pipeline([("gb_classifier", gb_classifier)])
        pipeline_model = pipeline.fit(X_train, y_train, gb_classifier__sample_weight=sample_weight)

        mlflow.log_metric('test_accuracy', pipeline_model.score(X_test, y_test))
        # mlflow.shap.log_explanation(gb_classifier.predict, encoders.transform(X))

        fs.log_model(pipeline_model, "model", flavor=mlflow.sklearn,
                     training_set=training_set, registered_model_name=model_name,
                     input_example=X[:100], signature=infer_signature(X, y))
def test_autolog_logs_signature_and_input_example(data_type):
    mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)
    X, y = get_iris()
    X = data_type(X)
    y = data_type(y)
    model = sklearn.linear_model.LinearRegression()
    with mlflow.start_run() as run:
        model.fit(X, y)
        model_path = os.path.join(run.info.artifact_uri, MODEL_DIR)

    model_conf = get_model_conf(run.info.artifact_uri)
    input_example = _read_example(model_conf, model_path)
    pyfunc_model = mlflow.pyfunc.load_model(model_path)

    assert model_conf.signature == infer_signature(X, model.predict(X[:5]))

    # On GitHub Actions, `pyfunc_model.predict` and `model.predict` sometimes return
    # slightly different results:
    #
    # >>> pyfunc_model.predict(input_example)
    # [[0.171504346208176  ]
    #  [0.34346150441640155] <- diff
    #  [0.06895096846585114] <- diff
    #  [0.05925789882165455]
    #  [0.03424907823290102]]
    #
    # >>> model.predict(X[:5])
    # [[0.171504346208176  ]
    #  [0.3434615044164018 ] <- diff
    #  [0.06895096846585136] <- diff
    #  [0.05925789882165455]
    #  [0.03424907823290102]]
    #
    # As a workaround, use `assert_array_almost_equal` instead of `assert_array_equal`
    np.testing.assert_array_almost_equal(pyfunc_model.predict(input_example), model.predict(X[:5]))
def build_mlflow_model(homedir):
    from sklearn import datasets
    from sklearn.ensemble import RandomForestClassifier
    import mlflow
    import mlflow.sklearn
    from mlflow.models.signature import infer_signature
    import pandas as pd
    from mlflow.tracking._model_registry import fluent

    mlflow.set_tracking_uri('http://localhost:5000')
    with mlflow.start_run() as run:
        iris = datasets.load_iris()
        iris_train = pd.DataFrame(iris.data, columns=iris.feature_names)
        clf = RandomForestClassifier(max_depth=7, random_state=0)
        clf.fit(iris_train, iris.target)
        signature = infer_signature(iris_train, clf.predict(iris_train))
        model_name = "iris_rf"
        mlflow.sklearn.log_model(clf, model_name, signature=signature,
                                 registered_model_name=model_name)
    # NB: logging.info takes %-style args, not print-style positional args
    logging.info("runs: %s", list(os.fwalk(homedir)))
    return fluent.MlflowClient().get_model_version_download_uri(name=model_name, version=1)
def run_train(config_name, config_path="source/config_model.py", n_sample=5000,
              mode="run_preprocess", model_dict=None, return_mode='file', **kw):
    """ Train a model whose configuration lives in the config_model.py file.
    :param config_name:
    :param config_path:
    :param n_sample:
    :return:
    """
    model_dict = model_dict_load(model_dict, config_path, config_name, verbose=True)
    mlflow_pars = model_dict.get('compute_pars', {}).get('mlflow_pars', None)

    m = model_dict['global_pars']
    path_data_train = m['path_data_train']
    path_train_X = m.get('path_train_X', path_data_train + "/features.zip")  # .zip
    path_train_y = m.get('path_train_y', path_data_train + "/target.zip")    # .zip

    path_output = m['path_train_output']
    # path_model = m.get('path_model', path_output + "/model/")
    path_pipeline = m.get('path_pipeline', path_output + "/pipeline/")
    # path_output is used instead of path_data_train because preprocessed files are stored there
    path_features_store = m.get('path_features_store', path_output + '/features_store/')
    path_check_out = m.get('path_check_out', path_output + "/check/")
    log(path_output)

    log("#### load input column family ##################################################")
    try:
        cols_group = model_dict['data_pars']['cols_input_type']  # from the model config file
    except Exception:
        cols_group = json.load(open(path_data_train + "/cols_group.json", mode='r'))
    log(cols_group)

    log("#### Preprocess ################################################################")
    preprocess_pars = model_dict['model_pars']['pre_process_pars']

    if mode == "run_preprocess":
        dfXy, cols = preprocess(path_train_X, path_train_y,
                                path_pipeline,        # path to save preprocessing pipeline
                                cols_group,           # dict of column family
                                n_sample,
                                preprocess_pars,
                                path_features_store)  # store intermediate dataframe
    elif mode == "load_preprocess":
        # Load existing preprocessed data
        dfXy, cols = preprocess_load(path_train_X, path_train_y, path_pipeline,
                                     cols_group, n_sample, preprocess_pars,
                                     path_features_store=path_features_store)

    # Actual column names for label y and input X (colnum, colcat)
    model_dict['data_pars']['coly'] = cols['coly']
    model_dict['data_pars']['cols_model'] = sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])

    # Map column groups to model input types: sparse, continuous, ...
    # (e.g. for a neural network: 'coldense' = ['colnum'], 'colsparse' = ['colcat'])
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = sum(
            [cols[colgroup] for colgroup in colg_list], [])

    log("#### Train model: #############################################################")
    log(str(model_dict)[:1000])
    post_process_fun = model_dict['model_pars']['post_process_fun']
    dfXy, dfXytest, stats = train(model_dict, dfXy, cols, post_process_fun)

    if mlflow_pars is not None:
        log("#### Using mlflow #########################################################")
        # def register(run_name, params, metrics, signature, model_class, tracking_uri="sqlite:///local.db"):
        from run_mlflow import register
        from mlflow.models.signature import infer_signature
        train_signature = dfXy[model_dict['data_pars']['cols_model']]
        y_signature = dfXy[model_dict['data_pars']['coly']]
        signature = infer_signature(train_signature, y_signature)

        register(run_name=model_dict['global_pars']['config_name'],
                 params=model_dict['global_pars'],
                 metrics=stats["metrics_test"],
                 signature=signature,
                 model_class=model_dict['model_pars']["model_class"],
                 tracking_uri=mlflow_pars.get('tracking_db', "sqlite:///mlflow_local.db"))

    if return_mode == 'dict':
        return {'dfXy': dfXy, 'dfXytest': dfXytest, 'stats': stats}

    log("#### Export ##################################################################")
    os.makedirs(path_check_out, exist_ok=True)
    colexport = [cols['colid'], cols['coly'], cols['coly'] + "_pred"]
    dfXy[colexport].reset_index().to_csv(path_check_out + "/pred_check.csv")  # results only
    dfXy.to_parquet(path_check_out + "/dfX.parquet")          # train input data, parquet
    # dfXy.to_csv(path_check_out + "/dfX.csv")                # train input data, csv
    dfXytest.to_parquet(path_check_out + "/dfXtest.parquet")  # test input data, parquet
    # dfXytest.to_csv(path_check_out + "/dfXtest.csv")        # test input data, csv
    log("######### Finish #############################################################")
mlflow.set_tag("features", str(X_train.columns.values.tolist())) # Log tracked parameters only mlflow.log_params(run_parameters) mlflow.log_metrics({ 'RMSE_CV': score_cv.mean(), 'RMSE': score, }) # log training loss for s in model.train_score_: mlflow.log_metric("Train Loss", s) # get model signature signature = infer_signature(model_input=X_train, model_output=model.predict(X_train)) # Save model to artifacts mlflow.sklearn.log_model(model, "model", signature=signature) # log charts mlflow.log_artifacts(model_artifacts_dir) # optional: auto-logging for scikit-learn estimators # mlflow.sklearn.autolog() # optional: log all model parameters # mlflow.log_params(model.get_params()) print(f"Run {run_id}:", "Logging completed")
# Starts runs with different XGBoost parameters
for md in args.max_depth.split(','):
    for lr in args.learning_rate.split(','):
        for ssr in args.subsample.split(','):
            # Creates an execution context for a single run with given parameters (`md`, `lr`, and `ssr`)
            with mlflow.start_run(run_name=args.run_name) as run:
                clf = xgb.XGBClassifier(max_depth=int(md), learning_rate=float(lr),
                                        nthread=-1, subsample=float(ssr))
                clf.fit(X_train, y_train)

                # Computes a metric for the built model
                pred = clf.predict(X_test)
                rmse = np.sqrt(mean_squared_error(y_test, pred))

                # For better tracking, stores the training logs and the built model
                # in the MLflow logging framework
                # TODO: Saves a graphviz image for feature importances in XGBoost
                from mlflow.models.signature import infer_signature
                # NB: assign the inferred signature so it can be attached to the model below
                signature = infer_signature(X_train, y_test)

                # This feature implemented in MLflow v1.12.0
                # mlflow.shap.log_explanation(clf, X_train)

                mlflow.set_tag('training algorithm', 'xgboost')
                mlflow.log_metrics({'RMSE': rmse})
                mlflow.xgboost.log_model(clf, 'model', signature=signature)

                print('XGBoost model (max_depth=%s, learning_rate=%s, subsample=%s):' % (md, lr, ssr))
                print('  RMSE: %f' % rmse)
def test_parameter_search_estimators_produce_expected_outputs(cv_class, search_space, backend):
    mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

    svc = sklearn.svm.SVC()
    cv_model = cv_class(svc, search_space, n_jobs=5, return_train_score=True)
    X, y = get_iris()

    def train_cv_model():
        if backend is None:
            cv_model.fit(X, y)
        else:
            with sklearn.utils.parallel_backend(backend=backend):
                cv_model.fit(X, y)

    with mlflow.start_run() as run:
        train_cv_model()
        run_id = run.info.run_id

    params, metrics, tags, artifacts = get_run_data(run_id)
    expected_cv_params = truncate_dict(stringify_dict_values(cv_model.get_params(deep=False)))
    expected_cv_params.update({
        "best_{}".format(param_name): str(param_value)
        for param_name, param_value in cv_model.best_params_.items()
    })
    assert params == expected_cv_params
    assert {
        TRAINING_SCORE: cv_model.score(X, y),
        "best_cv_score": cv_model.best_score_,
    }.items() <= metrics.items()
    assert tags == get_expected_class_tags(cv_model)
    assert MODEL_DIR in artifacts
    assert "best_estimator" in artifacts
    assert "cv_results.csv" in artifacts

    best_estimator = mlflow.sklearn.load_model("runs:/{}/best_estimator".format(run_id))
    assert isinstance(best_estimator, sklearn.svm.SVC)
    cv_model = mlflow.sklearn.load_model("runs:/{}/{}".format(run_id, MODEL_DIR))
    assert isinstance(cv_model, cv_class)

    # Ensure that a signature and input example are produced for the best estimator
    best_estimator_conf = get_model_conf(run.info.artifact_uri, "best_estimator")
    assert best_estimator_conf.signature == infer_signature(X, best_estimator.predict(X[:5]))

    best_estimator_path = os.path.join(run.info.artifact_uri, "best_estimator")
    input_example = _read_example(best_estimator_conf, best_estimator_path)
    best_estimator.predict(input_example)  # Ensure that input example evaluation succeeds

    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id, "tags.`mlflow.parentRunId` = '{}'".format(run_id)
    )
    cv_results = pd.DataFrame.from_dict(cv_model.cv_results_)
    # We expect to have created a child run for each point in the parameter search space
    assert len(child_runs) == len(cv_results)

    # Verify that each set of parameter search results has a corresponding MLflow run
    # with the expected data
    for _, result in cv_results.iterrows():
        result_params = result.get("params", {})
        params_search_clause = " and ".join(
            ["params.`{}` = '{}'".format(key, value) for key, value in result_params.items()]
        )
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause
        )
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        _, child_metrics, child_tags, _ = get_run_data(child_run.info.run_id)
        assert child_tags == get_expected_class_tags(svc)
        assert "mean_test_score" in child_metrics.keys()
        assert "std_test_score" in child_metrics.keys()
        # Ensure that we do not capture separate metrics for each cross validation split,
        # which would produce very noisy metrics results
        assert len([metric for metric in child_metrics.keys() if metric.startswith("split")]) == 0
def train(
    model: str,
    experiment_name: str = None,
    data_dir=None,
    root_dir=None,
    best_metric="val_accuracy",
    **kwargs,
):
    """Base method to train a model.

    Trains the model selected via `MODEL_DICT` and defines the `experiment_name`
    in the MLflow tracking server.

    Args:
        model (str): the model to train. Only two choices: `model1` or `model2`.
        experiment_name (str, optional): the experiment name to define in the MLflow
            tracking server. Defaults to None; if None, `model` is used.
        best_metric (str, optional): the metric used to evaluate the model and to
            check whether performance has improved over the best previous model.
            Defaults to "val_accuracy".
    """
    _check_input(model)
    if experiment_name is None:
        experiment_name = model
    owd = os.getcwd()
    root_dir = Paths(root_dir=root_dir).root_dir
    os.chdir(root_dir)
    mlflow.set_experiment(experiment_name)
    tracker = MlFlowTracker(root_dir=root_dir)
    print(tracker.root_dir)
    timestamp = time.strftime("%Y%m%d%H%M")
    run_name = f"{experiment_name}_{timestamp}"
    learner = MODEL_DICT.get(model)(data_dir=data_dir)
    print(learner.name)
    version = tracker.get_new_version(experiment_name)
    logging.info(version)

    with mlflow.start_run(run_name=run_name):
        run_uuid = mlflow.active_run().info.run_uuid
        logging.info(f"MLflow Run ID: {run_uuid}")
        learner.train(**kwargs)
        # Get training params
        params = learner.get_params()
        # Log parameters
        mlflow.log_params(params)

        # calculate metrics
        metrics = {}
        for metric in learner.metrics:
            metrics[metric] = learner.history[metric][-1]
            metrics[f"val_{metric}"] = learner.history[f"val_{metric}"][-1]
        metrics["loss"] = learner.history["loss"][-1]
        metrics["val_loss"] = learner.history["val_loss"][-1]
        final_metric = metrics.get(best_metric)
        # log metrics
        mlflow.log_metrics(metrics)

        # log model
        model_name = learner.model.name
        X_train = learner.X_train
        y_pred = learner.predict(X_train)
        signature = infer_signature(X_train, y_pred)
        mlflow.keras.log_model(learner.model.model, model_name,
                               signature=signature, save_format="tf")

        models_path = Paths(root_dir=root_dir).model / "models"
        if not models_path.exists():
            models_path.mkdir()

        final_metric_best = tracker.get_best_model_metric(experiment_name, metric=best_metric)
        if final_metric >= final_metric_best:
            logging.info("Best model found. Saving to model dir to use with Tensorflow Serving")
            model_path = os.path.join(str(models_path), model)
            if not os.path.exists(model_path):
                os.mkdir(model_path)
                logging.info(f"Folder {model_path} created.")
            if model == "model2":
                tfmodel = TFModel(learner.model.model)
                tf.saved_model.save(
                    tfmodel.model,
                    os.path.join(model_path, "0"),
                    signatures={"serving_default": tfmodel.prediction},
                )
                print(tfmodel)
            else:
                learner.model.model.save(os.path.join(model_path, "0"))
            logging.info(f"Model exported at {model_path}.")
        else:
            logging.info(
                f"Model logged but best performance not improved for experiment "
                f"{experiment_name} (current version: {version})."
            )
    os.chdir(owd)
def test_input_examples(pandas_df_with_all_types, dict_of_ndarrays):
    sig = infer_signature(pandas_df_with_all_types)
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()
        # the frame read without schema should match except for the binary values
        assert (
            (parsed_df.drop(columns=["binary"])
             == _dataframe_from_json(tmp.path(filename)).drop(columns=["binary"]))
            .all()
            .all()
        )

    # NB: drop columns that cannot be encoded by the NumpyEncoder in proto_json_utils.py
    new_df = pandas_df_with_all_types.drop(columns=["boolean_ext", "integer_ext", "string_ext"])

    # pass the input as dictionary instead
    with TempDir() as tmp:
        d = {name: new_df[name].values for name in new_df.columns}
        example = _Example(d)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        parsed_dict = _read_tensor_input_from_json(tmp.path(filename))
        assert d.keys() == parsed_dict.keys()
        # Asserting binary will fail since it is converted to base64 encoded strings.
        # The check above suffices that the binary input is stored.
        del d["binary"]
        for key in d:
            assert np.array_equal(d[key], parsed_dict[key])

    # input passed as numpy array
    new_df = pandas_df_with_all_types.drop(columns=["binary"])
    for col in new_df:
        input_example = new_df[col].to_numpy()
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(parsed_ary, input_example)

    # pass multidimensional array
    for col in dict_of_ndarrays:
        input_example = dict_of_ndarrays[col]
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(parsed_ary, input_example)

    # pass multidimensional array as a list
    example = np.array([[1, 2, 3]])
    with pytest.raises(TensorsNotSupportedException):
        _Example([example, example])

    # pass dict with scalars
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename))
        assert example == parsed_df.to_dict(orient="records")[0]
def train(data_conf, model_conf, **kwargs):
    try:
        print()
        print("-----------------------------------")
        print("         Model Training            ")
        print("-----------------------------------")
        print()

        # ==============================
        # 1.0 Data Loading
        # ==============================

        # Loading of dataset
        iris = load_iris()  # The Iris dataset is available through the scikit-learn API
        idx = list(range(len(iris.target)))
        np.random.shuffle(idx)  # We shuffle it (important if we want to split in train and test sets)
        X = iris.data[idx]
        y = iris.target[idx]

        # Load data in Pandas DataFrame
        data_pd = pd.DataFrame(data=np.column_stack((X, y)),
                               columns=['sepal_length', 'sepal_width', 'petal_length',
                                        'petal_width', 'label'])
        data_pd.loc[data_pd['label'] == 0, 'species'] = 'setosa'
        data_pd.loc[data_pd['label'] == 1, 'species'] = 'versicolor'
        data_pd.loc[data_pd['label'] == 2, 'species'] = 'virginica'
        data_pd.head()

        # Feature selection
        feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        target = 'label'
        X = data_pd[feature_cols].values
        y = data_pd[target].values

        # Creation of train and test datasets
        # stratify=y ensures that the same proportion of labels are in both train and test sets!
        x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify=y)

        # Save test dataset
        test_pd = pd.DataFrame(data=np.column_stack((x_test, y_test)),
                               columns=['sepal_length', 'sepal_width', 'petal_length',
                                        'petal_width', 'label'])
        test_pd.loc[data_pd['label'] == 0, 'species'] = 'setosa'
        test_pd.loc[data_pd['label'] == 1, 'species'] = 'versicolor'
        test_pd.loc[data_pd['label'] == 2, 'species'] = 'virginica'
        test_df = spark.createDataFrame(test_pd)
        test_df.write.format("delta").mode("overwrite").save("/mnt/delta/{0}".format('test_data_sklearn_rf'))

        print("Step 1.0 completed: Loaded Iris dataset in Pandas")

    except Exception as e:
        print("Errored on 1.0: data loading")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # 1.1 Model training
        # ========================================
        with mlflow.start_run() as run:
            # Model definition
            max_depth = int(model_conf['hyperparameters']['max_depth'])
            n_estimators = int(model_conf['hyperparameters']['n_estimators'])
            max_features = model_conf['hyperparameters']['max_features']
            criterion = model_conf['hyperparameters']['criterion']
            class_weight = model_conf['hyperparameters']['class_weight']
            bootstrap = bool(model_conf['hyperparameters']['bootstrap'])
            clf = RandomForestClassifier(max_depth=max_depth,
                                         n_estimators=n_estimators,
                                         max_features=max_features,
                                         criterion=criterion,
                                         class_weight=class_weight,
                                         bootstrap=bootstrap,
                                         random_state=21,
                                         n_jobs=-1)

            # Fit of the model on the training set
            model = clf.fit(x_train, y_train)

            # Log the model within the MLflow run
            mlflow.log_param("max_depth", str(max_depth))
            mlflow.log_param("n_estimators", str(n_estimators))
            mlflow.log_param("max_features", str(max_features))
            mlflow.log_param("criterion", str(criterion))
            mlflow.log_param("class_weight", str(class_weight))
            mlflow.log_param("bootstrap", str(bootstrap))
            signature = infer_signature(x_train, clf.predict(x_train))
            mlflow.sklearn.log_model(model, "model",
                                     registered_model_name="sklearn-rf",
                                     signature=signature)

        print("Step 1.1 completed: model training and saved to MLflow")

    except Exception as e:
        print("Errored on step 1.1: model training")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
    print()
def train(self, max_depth, max_leaf_nodes, model_name, output_path):
    with mlflow.start_run(run_name=self.run_origin) as run:  # NOTE: mlflow CLI ignores run_name
        if self.autolog:
            mlflow.sklearn.autolog()
        run_id = run.info.run_uuid
        experiment_id = run.info.experiment_id
        print("MLflow:")
        print("  run_id:", run_id)
        print("  experiment_id:", experiment_id)
        print("  experiment_name:", client.get_experiment(experiment_id).name)

        # MLflow tags
        mlflow.set_tag("autolog", self.autolog)
        mlflow.set_tag("save_signature", self.save_signature)
        mlflow.set_tag("mlflow.runName", self.run_origin)  # mlflow CLI picks this up
        mlflow.set_tag("data_path", self.data_path)
        mlflow.set_tag("run_origin", self.run_origin)
        mlflow.set_tag("version.mlflow", mlflow.__version__)
        mlflow.set_tag("version.sklearn", sklearn.__version__)
        mlflow.set_tag("version.platform", platform.platform())
        mlflow.set_tag("version.python", platform.python_version())
        mlflow.set_tag("model_name", model_name)

        # Create model
        dt = DecisionTreeRegressor(max_depth=max_depth, max_leaf_nodes=max_leaf_nodes)
        print("Model:\n ", dt)

        # Fit and predict
        dt.fit(self.X_train, self.y_train)
        predictions = dt.predict(self.X_test)
        signature = infer_signature(self.X_train, predictions) if self.save_signature else None
        print("signature:", signature)

        # MLflow params
        print("Parameters:")
        print("  max_depth:", max_depth)
        print("  max_leaf_nodes:", max_leaf_nodes)
        if not self.autolog:
            mlflow.log_param("max_depth", max_depth)
            mlflow.log_param("max_leaf_nodes", max_leaf_nodes)

        # MLflow metrics
        rmse = np.sqrt(mean_squared_error(self.y_test, predictions))
        mae = mean_absolute_error(self.y_test, predictions)
        r2 = r2_score(self.y_test, predictions)
        print("Metrics:")
        print("  rmse:", rmse)
        print("  mae:", mae)
        print("  r2:", r2)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        # MLflow log model - autolog creates a model called "model"
        mlflow.sklearn.log_model(dt, "sklearn-model",
                                 registered_model_name=model_name, signature=signature)

        # Convert sklearn model to ONNX and log model
        if self.log_as_onnx:
            from wine_quality import onnx_utils
            onnx_utils.log_model(dt, "onnx-model", model_name, self.X_test)

        # MLflow artifact - plot file
        plot_file = "plot.png"
        plot_utils.create_plot_file(self.y_test, predictions, plot_file)
        mlflow.log_artifact(plot_file)

        # Write run ID to file
        if output_path:
            mlflow.set_tag("output_path", output_path)
            output_path = output_path.replace("dbfs:", "/dbfs")
            with open(output_path, "w") as f:
                f.write(run_id)
    return (experiment_id, run_id)
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

iris = datasets.load_iris()
iris_train = pd.DataFrame(iris.data, columns=iris.feature_names)
clf = RandomForestClassifier(max_depth=7, random_state=0)
clf.fit(iris_train, iris.target)

signature = infer_signature(iris_train, clf.predict(iris_train))
mlflow.sklearn.log_model(clf, "iris_rf", signature=signature)
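# A minimal sketch of loading the model back (an assumption, not part of the
# original example; <run_id> stands for the run that logged "iris_rf" above).
# The pyfunc flavor exposes the saved signature as model metadata.
model_uri = "runs:/<run_id>/iris_rf"
loaded = mlflow.pyfunc.load_model(model_uri)
print(loaded.metadata.signature)        # the ModelSignature saved at log time
print(loaded.predict(iris_train.head()))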
mlflow.log_param("min_impurity_decrease", "0.0") mlflow.log_param("min_impurity_split", "None") mlflow.log_param("min_samples_leaf", "1") mlflow.log_param("min_samples_split", "2") mlflow.log_param("min_weight_fraction_leaf", "0.0") mlflow.log_param("n_jobs", "None") mlflow.log_param("verbose", "0") mlflow.log_param("warm_start", "False") mlflow.set_tag("estimator_class", "sklearn.ensemble._forest.RandomForestRegressor") mlflow.set_tag("estimator_name", "RandomForestRegressor") mlflow.set_tag("sparkDatasourceInfo", "path=dbfs:/mnt/delta/flights/gold,version=4,format=delta") sig = infer_signature(X_train[:100], y_train[:100]) mlflow.sklearn.log_model( rfr, 'model_signature', signature=sig, input_example=X_train.head(10), registered_model_name= '2020-10-27_clemens_mewald@databricks_com_based on clemens_flightdelays_gold' ) import shap shap_values = shap.TreeExplainer(rfr).shap_values(X_train[:10]) shap_plt = shap.summary_plot(shap_values, X_train[:10], plot_type="bar", show=False)
from mlflow.models.signature import infer_signature

plt.close()

with mlflow.start_run() as run:
    best_iteration = int(spark_trials.best_trial['result']['booster']['best_iteration'])
    booster = xgb.train(params=params_to_xgb(best_params),
                        dtrain=xgb.DMatrix(data=X, label=y),
                        num_boost_round=best_iteration)
    mlflow.log_params(best_params)
    mlflow.log_param('best_iteration', best_iteration)
    mlflow.xgboost.log_model(booster, "xgboost", input_example=X.head(),
                             signature=infer_signature(X, y))

    shap_values = shap.TreeExplainer(booster).shap_values(X, y=y)
    shap.summary_plot(shap_values, X, feature_names=display_cols,
                      plot_size=(14, 6), max_display=10, show=False)
    plt.savefig("summary_plot.png", bbox_inches="tight")
    plt.close()
    mlflow.log_artifact("summary_plot.png")

    best_run = run.info

# COMMAND ----------
# COMMAND ----------

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])

# COMMAND ----------

# Split the dataset randomly into 70% for training and 30% for testing.
# Passing a seed gives deterministic behavior.
train, test = silverDepDF_1.randomSplit([0.7, 0.3], seed=0)
print("Departure Delay: There are %d training examples and %d test examples."
      % (train.count(), test.count()))

# COMMAND ----------

from mlflow.models.signature import infer_signature

signature = infer_signature(train.drop("DEP_DELAY"), train.select("DEP_DELAY"))

# COMMAND ----------

signature

# COMMAND ----------

display(silverDepDF_1)

# COMMAND ----------

import mlflow
import mlflow.spark
from mlflow.models.signature import infer_signature

# turn on autologging
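# COMMAND ----------

# A minimal sketch of the next step (an assumption -- the original notebook is
# truncated after "turn on autologging"): fit the CV pipeline on the training
# split and log it with the inferred signature via the spark flavor. The
# artifact path "dep-delay-model" is illustrative, not from the original.
pipelineModel = pipeline.fit(train)
mlflow.spark.log_model(pipelineModel, "dep-delay-model", signature=signature)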