def test_mleap_model_log(spark_model_iris):
    artifact_path = "model"
    register_model_patch = mock.patch("mlflow.register_model")
    with mlflow.start_run(), register_model_patch:
        sparkm.log_model(
            spark_model=spark_model_iris.model,
            sample_input=spark_model_iris.spark_df,
            artifact_path=artifact_path,
            registered_model_name="Model1",
        )
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=mlflow.active_run().info.run_id,
            artifact_path=artifact_path)
        mlflow.register_model.assert_called_once_with(
            model_uri,
            "Model1",
            await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS)

    model_path = _download_artifact_from_uri(artifact_uri=model_uri)
    config_path = os.path.join(model_path, "MLmodel")
    mlflow_model = Model.load(config_path)
    assert sparkm.FLAVOR_NAME in mlflow_model.flavors
    assert mleap.FLAVOR_NAME in mlflow_model.flavors
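
For reference, a model logged this way can be read back through the pyfunc flavor. A minimal sketch, assuming an mlflow version whose `pyfunc.load_model` accepts `runs:/` URIs (the run ID below is a placeholder, not a value from the test):

loaded = pyfunc.load_model("runs:/<run_id>/model")  # placeholder run ID
preds = loaded.predict(spark_model_iris.spark_df.toPandas())
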
def test_log_model_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
    artifact_path = "model"
    for signature in (None, signature_):
        for example in (None, example_):
            with mlflow.start_run():
                sparkm.log_model(
                    spark_model_iris.model,
                    artifact_path=artifact_path,
                    signature=signature,
                    input_example=example,
                )
                artifact_uri = mlflow.get_artifact_uri()
                model_path = os.path.join(artifact_uri, artifact_path)
                mlflow_model = Model.load(model_path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model,
                                              model_path) == example).all())
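
The signature captures the model's input schema; a standalone sketch of what `infer_signature` returns, assuming the same `iris_spark_df` as above:

from mlflow.models.signature import infer_signature

sig = infer_signature(iris_spark_df)  # column names and types inferred from the DataFrame
print(sig.inputs)                     # the inferred input Schema
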
def test_sparkml_model_log_persists_specified_conda_env_in_mlflow_model_directory(
        spark_model_iris, model_path, spark_custom_env):
    artifact_path = "model"
    with mlflow.start_run():
        sparkm.log_model(spark_model=spark_model_iris.model,
                         artifact_path=artifact_path,
                         conda_env=spark_custom_env)
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=mlflow.active_run().info.run_id,
            artifact_path=artifact_path)

    model_path = _download_artifact_from_uri(artifact_uri=model_uri)
    pyfunc_conf = _get_flavor_configuration(model_path=model_path,
                                            flavor_name=pyfunc.FLAVOR_NAME)
    saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    assert os.path.exists(saved_conda_env_path)
    assert saved_conda_env_path != spark_custom_env

    with open(spark_custom_env, "r") as f:
        spark_custom_env_parsed = yaml.safe_load(f)
    with open(saved_conda_env_path, "r") as f:
        saved_conda_env_parsed = yaml.safe_load(f)
    assert saved_conda_env_parsed == spark_custom_env_parsed
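
The `spark_custom_env` fixture is not part of this listing; a plausible sketch, using mlflow's `_mlflow_conda_env` helper (the same helper `test_model_log` uses below), would be:

import pytest
from mlflow.utils.environment import _mlflow_conda_env

@pytest.fixture
def spark_custom_env(tmpdir):
    # Write a custom conda environment file for the test to point at.
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark", "pytest"])
    return conda_env
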
def test_sparkml_model_log(tmpdir, spark_model_iris):
    # Save the current tracking URI so it can be restored after the test.
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =",
                  dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path,
                                 spark_model=spark_model_iris.model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = mlflow.active_run().info.run_id
                # test reloaded model
                reloaded_model = sparkm.load_model(artifact_path,
                                                   run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df = reloaded_model.transform(spark_model_iris.spark_df)
                preds = [
                    x.prediction
                    for x in preds_df.select("prediction").collect()
                ]
                assert spark_model_iris.predictions == preds
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                x = dfs_tmp_dir or sparkm.DFS_TMP
                shutil.rmtree(x)
                shutil.rmtree(tracking_dir)
Example #5
def test_sparkml_model_log_invalid_args(spark_model_transformer, model_path):
    # pylint: disable=unused-argument
    with pytest.raises(MlflowException) as e:
        sparkm.log_model(spark_model=spark_model_transformer.model,
                         artifact_path="model0")
    assert "Cannot serialize this model" in e.value.message
Example #6
def test_sparkml_model_log_invalid_args(spark_model_iris, model_path):
    # pylint: disable=unused-argument
    with pytest.raises(MlflowException) as e:
        sparkm.log_model(spark_model=spark_model_iris.model.stages[0],
                         artifact_path="model0")
    # The assertion must sit outside the `with` block; inside it, the raised
    # exception would prevent it from ever running.
    assert "SparkML can only save PipelineModels" in e.value.message

def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data,
                             columns=feature_names)  # to make spark_udf work
    pandas_df['label'] = pd.Series(iris.target)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Generate predictions on the training data to compare against reloaded models.
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =",
                  dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                tracking.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    tracking.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                if dfs_tmp_dir:
                    sparkm.log_model(artifact_path=artifact_path,
                                     spark_model=model,
                                     dfs_tmpdir=dfs_tmp_dir)
                else:
                    sparkm.log_model(artifact_path=artifact_path,
                                     spark_model=model)
                run_id = tracking.active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path,
                                                   run_id=run_id)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [
                    x.prediction
                    for x in preds_df_1.select("prediction").collect()
                ]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # make sure we did not leave any temp files behind
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert not os.listdir(x)
                shutil.rmtree(x)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
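
`score_model_as_udf` is referenced above but not defined in this listing; a plausible sketch, assuming the same old-style `run_id`-based pyfunc API the rest of the test uses:

def score_model_as_udf(model_path, run_id, pandas_df):
    # Score a pandas DataFrame through the logged model as a Spark UDF.
    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = pyfunc.spark_udf(spark, model_path, run_id=run_id)  # old-style signature assumed
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [row["prediction"] for row in new_df.select("prediction").collect()]
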
Example #8
def model_selection_via_crossvalidation(num_features, reg_param, net_param,
                                        cv_num_folds):
    # num_features, reg_param and net_param are lists of candidate values for the
    # cross-validator's parameter grid; cv_num_folds is the number of CV folds.

    # Start a new MLflow run
    with mlflow.start_run():
        tokenizer, remover, counts, lr = build_ml_pipeline()
        pipeline = Pipeline().setStages([tokenizer, remover, counts, lr])
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="rawPrediction")

        paramGrid = ParamGridBuilder() \
            .addGrid(counts.numFeatures, num_features) \
            .addGrid(lr.regParam, reg_param) \
            .addGrid(lr.elasticNetParam, net_param) \
            .build()

        crossval = CrossValidator(estimator=pipeline,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=cv_num_folds)

        # Run cross-validation, and choose the best set of parameters.
        training_df, validate_df, test_df = prepare_data()
        logger.warn("training classifier")
        cv_model = crossval.fit(training_df)

        # The best PipelineModel found by the cross-validation run
        cv_best_pipeline_model = cv_model.bestModel

        logger.info("evaluate trained classifier")
        prediction = cv_model.transform(validate_df)
        prediction.show(n=10)

        area_under_ROC = evaluator.evaluate(prediction)
        logger.info("Area under ROC for the best model selected by CV: " +
                    str(area_under_ROC))
        print("\n area_under_ROC: " + str(area_under_ROC))

        accuracy = cv_best_pipeline_model.stages[-1].summary.accuracy
        logger.info("Accuracy metric for the best model selected out of CV: " +
                    str(accuracy))
        print("\n accuracy: " + str(accuracy))

        # Save the trained model to a local directory (relative path "pyfunc-cv-model")
        mlflow_spark.save_model(cv_best_pipeline_model,
                                path="pyfunc-cv-model",
                                conda_env=None)

        # Log the trained model under a DBFS artifact path
        mlflow_spark.log_model(
            cv_best_pipeline_model,
            artifact_path="/dbfs/tmp/dbconnect-demo/uap/reviews/pyfunc-cv-model",
            conda_env=None)

        # save model as spark flavor
        logger.info(
            "logging cv_best_pipeline_model as a spark flavor on hosted mlflow server"
        )
        spark_cv_model_path = "spark-cv-model"
        mlflow_spark.log_model(cv_best_pipeline_model, spark_cv_model_path)

        # save model as mleap flavor
        # mleap_cv_model_path = "mleap-cv-model"
        # mlflow.mleap.log_model(cv_best_pipeline_model, test_df, mleap_cv_model_path)

        mlflow.log_param(
            "max_iterations",
            cv_best_pipeline_model.stages[-1]._java_obj.getMaxIter())
        mlflow.log_param(
            "reg_param",
            cv_best_pipeline_model.stages[-1]._java_obj.getRegParam())

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("area_under_ROC", area_under_ROC)

        cv_runid = mlflow.active_run().info.run_uuid
        cv_artifactUri = mlflow.get_artifact_uri()

        logger.warn("\ncv_runid: " + str(cv_runid))
        logger.warn("\ncv_artifactUri: " + str(cv_artifactUri))

        return cv_runid, cv_artifactUri
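
A usage sketch for the function above; the values are illustrative, and each hyperparameter argument is a list because it feeds `ParamGridBuilder.addGrid`:

cv_runid, cv_artifact_uri = model_selection_via_crossvalidation(
    num_features=[1000, 5000],
    reg_param=[0.01, 0.1],
    net_param=[0.0, 0.5],
    cv_num_folds=3)
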
Example #9
def run(max_depth, max_bins):
    print("max_depth={} max_bins={}".format(max_depth, max_bins))
    spark = SparkSession.builder.appName(
        "DecisionTreeClassificationExample").getOrCreate()

    # Load the data stored in LIBSVM format as a DataFrame.
    spark_home = str(os.environ['SPARK_HOME'])
    path = os.path.join(spark_home, "data/mllib/sample_libsvm_data.txt")
    data = spark.read.format("libsvm").load(path)

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4).fit(data)

    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                maxDepth=max_depth,
                                maxBins=max_bins)

    # Chain indexers and tree in a Pipeline.
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    test_error = 1.0 - accuracy
    print("Test Error = {} ".format(test_error))

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("test_error", test_error)

    treeModel = model.stages[2]
    print(treeModel)

    mlflow_spark.log_model(model, "spark-model")

    spark.stop()
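
A minimal invocation sketch (hyperparameter values illustrative); `SPARK_HOME` must point at a Spark distribution that ships `data/mllib/sample_libsvm_data.txt`:

if __name__ == "__main__":
    run(max_depth=5, max_bins=32)
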
Example #10
def train(inputs_path: str):

    spark = SparkUtils.build_or_get_session('training')
    df_kids = spark.read.parquet(inputs_path)
    label_col = 'final_status'

    mlflow_tracking_ui = 'http://35.246.84.226'
    mlflow_experiment_name = 'kickstarter'

    mlflow.set_tracking_uri(mlflow_tracking_ui)
    mlflow.set_experiment(experiment_name=mlflow_experiment_name)

    numerical_columns = ['days_campaign', 'hours_prepa', 'goal']
    categorical_columns = ['country_clean', 'currency_clean']
    features = numerical_columns + categorical_columns

    df = df_kids.select(features + [label_col])

    max_iter = 15
    model_specs: Pipeline = build_model(
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        label_col=label_col,
        max_iter=max_iter)

    df_train, df_test = df.randomSplit([0.8, 0.2], seed=12345)
    df_train = df_train.cache()

    evaluator = BinaryClassificationEvaluator() \
        .setMetricName('areaUnderROC') \
        .setRawPredictionCol('rawPrediction') \
        .setLabelCol('final_status')

    gbt = model_specs.getStages()[-1]

    params_grid = ParamGridBuilder()\
        .addGrid(gbt.maxDepth, [6]) \
        .addGrid(gbt.maxIter, [15]) \
        .addGrid(gbt.maxBins, [32])\
        .build()

    cross_val = CrossValidator(estimator=model_specs,
                               estimatorParamMaps=params_grid,
                               evaluator=evaluator,
                               numFolds=2)

    with mlflow.start_run() as active_run:
        logger.info(f'Cross evaluating model on {df_train.count()} lines')

        cross_val_model: CrossValidatorModel = cross_val.fit(df_train)
        model = cross_val_model.bestModel

        logger.info('Evaluating model')
        train_metrics = evaluator.evaluate(model.transform(df_train))
        metrics = {'train_auc': train_metrics}

        test_metrics = evaluator.evaluate(model.transform(df_test))
        metrics.update({'test_auc': test_metrics})
        logger.info(f'Model metrics: {metrics}')

        logger.info('Logging to mlflow')
        mlflow_params = {'model_class': 'gbt', 'max_iter': max_iter}
        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(metrics)
        log_model(model, 'model')
        model_uri = mlflow.get_artifact_uri(artifact_path='model')
        logger.info(f'Model successfully trained and saved @ {model_uri}')
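
As a follow-up sketch, the logged model can be reloaded for batch scoring with `mlflow.spark.load_model`, reusing `model_uri` and `df_test` from the run above:

loaded_model = mlflow.spark.load_model(model_uri)  # returns the best PipelineModel
scored = loaded_model.transform(df_test)
scored.select('rawPrediction', label_col).show(5)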