def test_mleap_model_log(spark_model_iris):
    artifact_path = "model"
    register_model_patch = mock.patch("mlflow.register_model")
    with mlflow.start_run(), register_model_patch:
        sparkm.log_model(
            spark_model=spark_model_iris.model,
            sample_input=spark_model_iris.spark_df,
            artifact_path=artifact_path,
            registered_model_name="Model1",
        )
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path)
        mlflow.register_model.assert_called_once_with(
            model_uri, "Model1",
            await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS)

    model_path = _download_artifact_from_uri(artifact_uri=model_uri)
    config_path = os.path.join(model_path, "MLmodel")
    mlflow_model = Model.load(config_path)
    assert sparkm.FLAVOR_NAME in mlflow_model.flavors
    assert mleap.FLAVOR_NAME in mlflow_model.flavors
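# The tests in this section rely on a ``spark_model_iris`` pytest fixture that is not shown here.
# A minimal sketch of what such a fixture could look like follows; the namedtuple name, the
# ``iris_df``/``spark_session`` fixtures it depends on, and the exact pipeline are assumptions
# made for illustration, not the canonical definitions.
import collections

import pytest
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

SparkModelWithData = collections.namedtuple(
    "SparkModelWithData", ["model", "spark_df", "pandas_df", "predictions"])


@pytest.fixture(scope="session")
def spark_model_iris(iris_df, spark_session):
    feature_names, pandas_df, spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit once and cache both the model and its predictions so individual tests can
    # compare reloaded models against a known-good baseline.
    model = pipeline.fit(spark_df)
    preds_df = model.transform(spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model, spark_df=spark_df,
                              pandas_df=pandas_df, predictions=preds)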
def test_log_model_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
    artifact_path = "model"
    for signature in (None, signature_):
        for example in (None, example_):
            with mlflow.start_run():
                sparkm.log_model(
                    spark_model_iris.model,
                    artifact_path=artifact_path,
                    signature=signature,
                    input_example=example,
                )
                artifact_uri = mlflow.get_artifact_uri()
                model_path = os.path.join(artifact_uri, artifact_path)
                mlflow_model = Model.load(model_path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, model_path) == example).all())
def test_sparkml_model_log_persists_specified_conda_env_in_mlflow_model_directory(
        spark_model_iris, model_path, spark_custom_env):
    artifact_path = "model"
    with mlflow.start_run():
        sparkm.log_model(spark_model=spark_model_iris.model,
                         artifact_path=artifact_path,
                         conda_env=spark_custom_env)
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path)

    model_path = _download_artifact_from_uri(artifact_uri=model_uri)
    pyfunc_conf = _get_flavor_configuration(model_path=model_path,
                                            flavor_name=pyfunc.FLAVOR_NAME)
    saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    assert os.path.exists(saved_conda_env_path)
    assert saved_conda_env_path != spark_custom_env

    with open(spark_custom_env, "r") as f:
        spark_custom_env_parsed = yaml.safe_load(f)
    with open(saved_conda_env_path, "r") as f:
        saved_conda_env_parsed = yaml.safe_load(f)
    assert saved_conda_env_parsed == spark_custom_env_parsed
def test_sparkml_model_log(tmpdir, spark_model_iris):
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path,
                                 spark_model=spark_model_iris.model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = mlflow.active_run().info.run_id
                # test reloaded model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df = reloaded_model.transform(spark_model_iris.spark_df)
                preds = [x.prediction for x in preds_df.select("prediction").collect()]
                assert spark_model_iris.predictions == preds
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                x = dfs_tmp_dir or sparkm.DFS_TMP
                shutil.rmtree(x)
                shutil.rmtree(tracking_dir)
def test_sparkml_model_log_invalid_args(spark_model_transformer, model_path):
    # pylint: disable=unused-argument
    with pytest.raises(MlflowException) as e:
        sparkm.log_model(spark_model=spark_model_transformer.model,
                         artifact_path="model0")
    assert "Cannot serialize this model" in e.value.message
def test_sparkml_model_log_invalid_args(spark_model_iris, model_path):
    # pylint: disable=unused-argument
    with pytest.raises(MlflowException) as e:
        sparkm.log_model(spark_model=spark_model_iris.model.stages[0],
                         artifact_path="model0")
    assert "SparkML can only save PipelineModels" in e.value.message
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    # Use string column names so that spark_udf can map pandas columns to features.
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)
    pandas_df['label'] = pd.Series(iris.target)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model and compute reference predictions.
    model = pipeline.fit(spark_df)
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                tracking.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    tracking.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                if dfs_tmp_dir:
                    sparkm.log_model(artifact_path=artifact_path, spark_model=model,
                                     dfs_tmpdir=dfs_tmp_dir)
                else:
                    sparkm.log_model(artifact_path=artifact_path, spark_model=model)
                run_id = tracking.active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # make sure we did not leave any temp files behind
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert not os.listdir(x)
                shutil.rmtree(x)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
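# ``score_model_as_udf`` is referenced above but not defined in this section. A rough sketch of
# such a helper, assuming the older run_id-based ``mlflow.pyfunc.spark_udf`` signature used by
# the surrounding tests, could look like this (the helper name and signature are assumptions):
def score_model_as_udf(artifact_path, run_id, pandas_df):
    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.createDataFrame(pandas_df)
    # Wrap the logged model as a Spark UDF and apply it to every column of the input frame.
    pyfunc_udf = pyfunc.spark_udf(spark, artifact_path, run_id=run_id)
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [row["prediction"] for row in new_df.select("prediction").collect()]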
def model_selection_via_crossvalidation(num_features, reg_param, net_param, cv_num_folds):
    # Hyperparameters for the cross validator: num_features, reg_param, net_param, cv_num_folds.
    # Start a new MLflow run
    with mlflow.start_run():
        tokenizer, remover, counts, lr = build_ml_pipeline()
        pipeline = Pipeline().setStages([tokenizer, remover, counts, lr])
        evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

        paramGrid = ParamGridBuilder() \
            .addGrid(counts.numFeatures, num_features) \
            .addGrid(lr.regParam, reg_param) \
            .addGrid(lr.elasticNetParam, net_param) \
            .build()
        crossval = CrossValidator(estimator=pipeline,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=cv_num_folds)

        # Run cross-validation, and choose the best set of parameters.
        training_df, validate_df, test_df = prepare_data()
        logger.warning("training classifier")
        cv_model = crossval.fit(training_df)
        # cv_model.bestModel is the best PipelineModel found during the cross-validation run.
        cv_best_pipeline_model = cv_model.bestModel

        logger.info("evaluate trained classifier")
        prediction = cv_model.transform(validate_df)
        prediction.show(n=10)
        area_under_ROC = evaluator.evaluate(prediction)
        logger.info("Area under the ROC curve for the best model selected by CV: " +
                    str(area_under_ROC))
        print("\n area_under_ROC: " + str(area_under_ROC))

        accuracy = cv_best_pipeline_model.stages[-1].summary.accuracy
        logger.info("Accuracy of the best model selected by CV: " + str(accuracy))
        print("\n accuracy: " + str(accuracy))

        # Save the trained model to a local directory.
        mlflow_spark.save_model(cv_best_pipeline_model, path="pyfunc-cv-model", conda_env=None)

        # Log the trained model under a DBFS artifact path.
        mlflow_spark.log_model(
            cv_best_pipeline_model,
            artifact_path="/dbfs/tmp/dbconnect-demo/uap/reviews/pyfunc-cv-model",
            conda_env=None)

        # Log the model as a spark flavor.
        logger.info("logging cv_best_pipeline_model as a spark flavor on the hosted mlflow server")
        spark_cv_model_path = "spark-cv-model"
        mlflow_spark.log_model(cv_best_pipeline_model, spark_cv_model_path)

        # Log the model as an mleap flavor.
        # mleap_cv_model_path = "mleap-cv-model"
        # mlflow.mleap.log_model(cv_best_pipeline_model, test_df, mleap_cv_model_path)

        mlflow.log_param("max_iterations",
                         cv_best_pipeline_model.stages[-1]._java_obj.getMaxIter())
        mlflow.log_param("reg_param",
                         cv_best_pipeline_model.stages[-1]._java_obj.getRegParam())
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("area_under_ROC", area_under_ROC)

        cv_runid = mlflow.active_run().info.run_uuid
        cv_artifactUri = mlflow.get_artifact_uri()
        logger.warning("\ncv_runid: " + str(cv_runid))
        logger.warning("\ncv_artifactUri: " + str(cv_artifactUri))
        return cv_runid, cv_artifactUri
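# ``build_ml_pipeline`` and ``prepare_data`` are project helpers that are not shown in this
# section. A minimal sketch of what ``build_ml_pipeline`` could return, assuming a text
# classification setup with ``text`` and ``label`` columns (column names are assumptions), is:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, StopWordsRemover, Tokenizer


def build_ml_pipeline():
    # Tokenize raw text, drop stop words, hash tokens into a feature vector, then classify.
    # HashingTF exposes numFeatures, which matches the param grid built above.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    counts = HashingTF(inputCol="filtered_words", outputCol="features")
    lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=20)
    return tokenizer, remover, counts, lr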
def run(max_depth, max_bins):
    print("max_depth={} max_bins={}".format(max_depth, max_bins))
    spark = SparkSession.builder.appName(
        "DecisionTreeClassificationExample").getOrCreate()

    # Load the data stored in LIBSVM format as a DataFrame.
    spark_home = str(os.environ['SPARK_HOME'])
    path = os.path.join(spark_home, "data/mllib/sample_libsvm_data.txt")
    data = spark.read.format("libsvm").load(path)

    # Index labels, adding metadata to the label column.
    # Fit on the whole dataset to include all labels in the index.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4).fit(data)

    # Split the data into training and test sets.
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                maxDepth=max_depth,
                                maxBins=max_bins)

    # Chain indexers and tree in a Pipeline.
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train the model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    test_error = 1.0 - accuracy
    print("Test Error = {}".format(test_error))
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("test_error", test_error)

    treeModel = model.stages[2]
    print(treeModel)

    mlflow_spark.log_model(model, "spark-model")

    spark.stop()
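# ``run`` reads like a script entry point driven by command-line arguments. A minimal, assumed
# driver for it (argument names and defaults are illustrative, not taken from the original
# script) might be:
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Decision tree example logged to MLflow")
    parser.add_argument("--max_depth", type=int, default=5)
    parser.add_argument("--max_bins", type=int, default=32)
    args = parser.parse_args()
    # Start an explicit run so the params, metrics, and model logged inside run()
    # all land in a single MLflow run.
    with mlflow.start_run():
        run(args.max_depth, args.max_bins)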
def train(inputs_path: str):
    spark = SparkUtils.build_or_get_session('training')
    df_kids = spark.read.parquet(inputs_path)
    label_col = 'final_status'

    mlflow_tracking_ui = 'http://35.246.84.226'
    mlflow_experiment_name = 'kickstarter'
    mlflow.set_tracking_uri(mlflow_tracking_ui)
    mlflow.set_experiment(experiment_name=mlflow_experiment_name)

    numerical_columns = ['days_campaign', 'hours_prepa', 'goal']
    categorical_columns = ['country_clean', 'currency_clean']
    features = numerical_columns + categorical_columns
    df = df_kids.select(features + [label_col])

    max_iter = 15
    model_specs: Pipeline = build_model(
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        label_col=label_col,
        max_iter=max_iter)

    df_train, df_test = df.randomSplit([0.8, 0.2], seed=12345)
    df_train = df_train.cache()

    evaluator = BinaryClassificationEvaluator() \
        .setMetricName('areaUnderROC') \
        .setRawPredictionCol('rawPrediction') \
        .setLabelCol('final_status')

    gbt = model_specs.getStages()[-1]
    params_grid = ParamGridBuilder() \
        .addGrid(gbt.maxDepth, [6]) \
        .addGrid(gbt.maxIter, [15]) \
        .addGrid(gbt.maxBins, [32]) \
        .build()
    cross_val = CrossValidator(estimator=model_specs,
                               estimatorParamMaps=params_grid,
                               evaluator=evaluator,
                               numFolds=2)

    with mlflow.start_run() as active_run:
        logger.info(f'Cross-validating model on {df_train.count()} rows')
        cross_val_model: CrossValidatorModel = cross_val.fit(df_train)
        model = cross_val_model.bestModel

        logger.info('Evaluating model')
        train_metrics = evaluator.evaluate(model.transform(df_train))
        metrics = {'train_auc': train_metrics}
        test_metrics = evaluator.evaluate(model.transform(df_test))
        metrics.update({'test_auc': test_metrics})
        logger.info(f'Model metrics: {metrics}')

        logger.info('Logging to mlflow')
        mlflow_params = {'model_class': 'gbt', 'max_iter': max_iter}
        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(metrics)
        log_model(model, 'model')
        model_uri = mlflow.get_artifact_uri(artifact_path='model')
        logger.info(f'Model successfully trained and saved @ {model_uri}')
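# ``build_model`` and ``SparkUtils`` are project helpers not shown in this section. Since the
# param grid above tunes maxDepth/maxIter/maxBins on the pipeline's last stage, ``build_model``
# presumably returns a Pipeline ending in a GBTClassifier. A rough, assumed sketch:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler


def build_model(numerical_columns, categorical_columns, label_col, max_iter):
    # Index each categorical column, assemble all features into a single vector,
    # and finish with a gradient-boosted tree classifier on the label column.
    indexers = [
        StringIndexer(inputCol=col, outputCol=f'{col}_indexed', handleInvalid='keep')
        for col in categorical_columns
    ]
    assembler = VectorAssembler(
        inputCols=numerical_columns + [f'{col}_indexed' for col in categorical_columns],
        outputCol='features')
    gbt = GBTClassifier(featuresCol='features', labelCol=label_col, maxIter=max_iter)
    return Pipeline(stages=indexers + [assembler, gbt])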