def test_custom_model_save_load(custom_model, custom_layer, data, custom_predicted, model_path):
    """Round-trip a Keras model that uses a custom layer and compare predictions.

    The model is saved with ``mlflow.keras.save_model`` (passing the custom
    layer through ``custom_objects``) and then scored four ways: as a reloaded
    Keras model, through ``pyfunc_serve_and_score_model``, as a reloaded pyfunc
    model, and through ``score_model_as_udf``. Every result is compared with
    ``custom_predicted``.
    """
    x, _ = data
    custom_objects = {"MyDense": custom_layer}
    mlflow.keras.save_model(custom_model, model_path, custom_objects=custom_objects)

    # Loading Keras model
    model_loaded = mlflow.keras.load_model(model_path)
    assert all(model_loaded.predict(x) == custom_predicted)

    # pyfunc serve
    scoring_response = pyfunc_serve_and_score_model(
        model_uri=os.path.abspath(model_path),
        data=pd.DataFrame(x),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
    )
    served_preds = pd.read_json(
        scoring_response.content, orient="records", encoding="utf8"
    ).values.astype(np.float32)
    assert np.allclose(served_preds, custom_predicted, rtol=1e-5, atol=1e-9)

    # Loading pyfunc model
    pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
    assert all(pyfunc_loaded.predict(x).values == custom_predicted)

    # test spark udf
    spark_udf_preds = score_model_as_udf(
        model_uri=os.path.abspath(model_path),
        pandas_df=pd.DataFrame(x),
        result_type="float",
    )
    # np.allclose only returns a bool; without the assert this check could
    # never fail the test.
    assert np.allclose(
        np.array(spark_udf_preds), custom_predicted.reshape(len(spark_udf_preds))
    )
def test_model_save_load(model, model_path, data, predicted):
    """Save a Keras model and check that each loading path reproduces ``predicted``.

    Checked in order: the reloaded Keras model, the reloaded pyfunc model, the
    response from ``pyfunc_serve_and_score_model``, and the output of
    ``score_model_as_udf``.
    """
    x, _ = data
    mlflow.keras.save_model(model, model_path)

    # Loading Keras model
    keras_reloaded = mlflow.keras.load_model(model_path)
    keras_preds = keras_reloaded.predict(x)
    assert all(keras_preds == predicted)

    # Loading pyfunc model
    pyfunc_reloaded = mlflow.pyfunc.load_pyfunc(model_path)
    pyfunc_preds = pyfunc_reloaded.predict(x).values
    assert all(pyfunc_preds == predicted)

    # pyfunc serve
    response = pyfunc_serve_and_score_model(
        model_path=os.path.abspath(model_path),
        data=pd.DataFrame(x),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
    )
    served_frame = pd.read_json(response.content, orient="records")
    served_preds = served_frame.values.astype(np.float32)
    assert all(served_preds == predicted)

    # test spark udf
    udf_preds = score_model_as_udf(
        os.path.abspath(model_path),
        run_id=None,
        pandas_df=pd.DataFrame(x),
        result_type="float",
    )
    flat_expected = predicted.reshape(len(udf_preds))
    np.testing.assert_array_almost_equal(np.array(udf_preds), flat_expected, decimal=6)
def test_model_save_load(build_model, model_path, data):
    """Save a freshly built Keras model and compare every loading path with it.

    ``build_model`` is called with ``data`` to create the model. The model is
    stored under a ``"tf"`` or ``"plain"`` subdirectory of ``model_path``,
    depending on whether ``build_model`` is ``tf_keras_model``. Predictions
    from the in-memory model are the reference for the reloaded Keras model,
    the reloaded pyfunc model, the ``pyfunc_serve_and_score_model`` response
    and the ``score_model_as_udf`` output.
    """
    x, _ = data
    keras_model = build_model(data)
    if build_model == tf_keras_model:
        model_path = os.path.join(model_path, "tf")
    else:
        model_path = os.path.join(model_path, "plain")
    expected = keras_model.predict(x)
    mlflow.keras.save_model(keras_model, model_path)

    # Loading Keras model
    model_loaded = mlflow.keras.load_model(model_path)
    # The reloaded object must be exactly the same class, not merely a subclass.
    assert type(keras_model) is type(model_loaded)
    assert all(expected == model_loaded.predict(x))

    # Loading pyfunc model
    pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
    assert all(pyfunc_loaded.predict(x).values == expected)

    # pyfunc serve
    scoring_response = pyfunc_serve_and_score_model(
        model_uri=os.path.abspath(model_path),
        data=pd.DataFrame(x),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
    )
    served_preds = pd.read_json(
        scoring_response.content, orient="records", encoding="utf8"
    ).values.astype(np.float32)
    assert all(served_preds == expected)

    # test spark udf
    spark_udf_preds = score_model_as_udf(
        model_uri=os.path.abspath(model_path),
        pandas_df=pd.DataFrame(x),
        result_type="float",
    )
    # np.allclose only returns a bool; without the assert this check could
    # never fail the test.
    assert np.allclose(np.array(spark_udf_preds), expected.reshape(len(spark_udf_preds)))
def test_model_log(tmpdir):
    """Log a Spark ML pipeline with ``sparkm.log_model`` and score it three ways.

    A ``VectorAssembler`` + ``LogisticRegression`` pipeline is fitted on the
    iris data. For each combination of ``should_start_run`` and ``dfs_tmp_dir``
    the model is logged, then scored as a pyfunc model, as a reloaded Spark
    model and through ``score_model_as_udf``. Each result must equal the
    predictions of the in-memory pipeline.
    """
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])

    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    # Pandas frame (string column names plus a label column) to make spark_udf work.
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)
    pandas_df['label'] = pd.Series(iris.target)

    session_builder = pyspark.sql.SparkSession.builder
    session_builder = session_builder.config(key="spark_session.python.worker.reuse", value=True)
    session_builder = session_builder.master("local-cluster[2, 1, 1024]")
    spark_session = session_builder.getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)

    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model
    model = Pipeline(stages=[assembler, lr]).fit(spark_df)

    # Reference predictions from the in-memory pipeline.
    reference_rows = model.transform(spark_df).select("prediction").collect()
    preds1 = [row.prediction for row in reference_rows]

    old_tracking_uri = mlflow.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    scenarios = [
        (should_start_run, dfs_tmp_dir)
        for should_start_run in [False, True]
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]
    ]
    for index, (should_start_run, dfs_tmp_dir) in enumerate(scenarios):
        print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
        try:
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            mlflow.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                mlflow.start_run()
            artifact_path = "model%d" % index
            sparkm.log_model(
                artifact_path=artifact_path, spark_model=model, dfs_tmpdir=dfs_tmp_dir
            )
            run_id = active_run().info.run_uuid

            # test pyfunc
            pyfunc_model = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
            preds2 = pyfunc_model.predict(pandas_df)
            assert preds1 == preds2

            # test load model
            reloaded_model = sparkm.load_model(
                artifact_path, run_id=run_id, dfs_tmpdir=dfs_tmp_dir
            )
            reloaded_rows = reloaded_model.transform(spark_df).select("prediction").collect()
            preds3 = [row.prediction for row in reloaded_rows]
            assert preds1 == preds3

            # test spark_udf
            preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
            assert preds1 == preds4

            # We expect not to delete the DFS tempdir.
            dfs_dir = dfs_tmp_dir or sparkm.DFS_TMP
            assert os.path.exists(dfs_dir)
            assert os.listdir(dfs_dir)
            shutil.rmtree(dfs_dir)
        finally:
            # Restore global MLflow state and drop this iteration's tracking store.
            mlflow.end_run()
            mlflow.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
def test_model_log_with_sparkml_format(tmpdir, spark_model_iris):
    """Log the fixture's Spark model and score it three ways.

    For each combination of ``should_start_run`` and ``dfs_tmp_dir`` the model
    from ``spark_model_iris`` is logged with ``sparkm.log_model``, then scored
    as a pyfunc model, as a reloaded Spark model and through
    ``score_model_as_udf``. Each result must equal the predictions of the
    in-memory model on ``spark_model_iris.training_df``.
    """
    # Reference predictions from the in-memory model.
    reference_rows = (
        spark_model_iris.model.transform(spark_model_iris.training_df)
        .select("prediction")
        .collect()
    )
    preds1 = [row.prediction for row in reference_rows]

    old_tracking_uri = mlflow.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    scenarios = [
        (should_start_run, dfs_tmp_dir)
        for should_start_run in [False, True]
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]
    ]
    for index, (should_start_run, dfs_tmp_dir) in enumerate(scenarios):
        print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
        try:
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            mlflow.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                mlflow.start_run()
            artifact_path = "model%d" % index
            sparkm.log_model(
                artifact_path=artifact_path,
                spark_model=spark_model_iris.model,
                dfs_tmpdir=dfs_tmp_dir,
            )
            run_id = active_run().info.run_uuid

            # test pyfunc
            pyfunc_model = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
            preds2 = pyfunc_model.predict(spark_model_iris.inference_df)
            assert preds1 == preds2

            # test load model
            reloaded_model = sparkm.load_model(
                artifact_path, run_id=run_id, dfs_tmpdir=dfs_tmp_dir
            )
            reloaded_rows = (
                reloaded_model.transform(spark_model_iris.training_df)
                .select("prediction")
                .collect()
            )
            preds3 = [row.prediction for row in reloaded_rows]
            assert preds1 == preds3

            # test spark_udf
            preds4 = score_model_as_udf(artifact_path, run_id, spark_model_iris.inference_df)
            assert preds1 == preds4

            # We expect not to delete the DFS tempdir.
            dfs_dir = dfs_tmp_dir or sparkm.DFS_TMP
            assert os.path.exists(dfs_dir)
            assert os.listdir(dfs_dir)
            shutil.rmtree(dfs_dir)
        finally:
            # Restore global MLflow state and drop this iteration's tracking store.
            mlflow.end_run()
            mlflow.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
def test_model_export(spark_model_iris, model_path, spark_custom_env):
    """Save the fixture's Spark model and compare three scoring paths.

    The model is saved with ``sparkm.save_model`` using ``spark_custom_env`` as
    the conda environment. Predictions from the reloaded Spark model, the
    reloaded pyfunc model and ``score_model_as_udf`` must each equal
    ``spark_model_iris.predictions``. Finally, ``sparkm.DFS_TMP`` must exist.
    """
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_custom_env)

    # 1. score and compare reloaded sparkml model
    spark_reloaded = sparkm.load_model(model_uri=model_path)
    scored_rows = (
        spark_reloaded.transform(spark_model_iris.spark_df).select("prediction").collect()
    )
    spark_preds = [row.prediction for row in scored_rows]
    assert spark_model_iris.predictions == spark_preds

    # 2. score and compare reloaded pyfunc
    pyfunc_model = pyfunc.load_pyfunc(model_path)
    pyfunc_preds = pyfunc_model.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == pyfunc_preds

    # 3. score and compare reloaded pyfunc Spark udf
    udf_preds = score_model_as_udf(model_uri=model_path, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == udf_preds

    assert os.path.exists(sparkm.DFS_TMP)
def test_model_save_load(build_model, save_format, model_path, data):
    """Save a freshly built Keras model and compare every loading path with it.

    ``build_model`` is called with ``data`` to create the model. The model is
    stored under a ``"tf"`` or ``"plain"`` subdirectory of ``model_path``,
    depending on whether ``build_model`` is ``get_tf_keras_model``. A truthy
    ``save_format`` is forwarded to ``mlflow.keras.save_model``; otherwise no
    format argument is passed. Predictions from the in-memory model are the
    reference for the reloaded Keras model, the reloaded pyfunc model, the
    ``pyfunc_serve_and_score_model`` response and the ``score_model_as_udf``
    output.
    """
    x, _ = data
    keras_model = build_model(data)
    if build_model == get_tf_keras_model:
        model_path = os.path.join(model_path, "tf")
    else:
        model_path = os.path.join(model_path, "plain")
    expected = keras_model.predict(x.values)
    kwargs = {"save_format": save_format} if save_format else {}
    mlflow.keras.save_model(keras_model, model_path, **kwargs)

    # Loading Keras model
    model_loaded = mlflow.keras.load_model(model_path)
    # When saving as SavedModel, we actually convert the model
    # to a slightly different format, so we cannot assume it is
    # exactly the same.
    if save_format != "tf":
        assert type(keras_model) is type(model_loaded)
    np.testing.assert_allclose(model_loaded.predict(x.values), expected, rtol=1e-5)

    # Loading pyfunc model
    pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
    np.testing.assert_allclose(pyfunc_loaded.predict(x).values, expected, rtol=1e-5)

    # pyfunc serve
    scoring_response = pyfunc_serve_and_score_model(
        model_uri=os.path.abspath(model_path),
        data=pd.DataFrame(x),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
        extra_args=EXTRA_PYFUNC_SERVING_TEST_ARGS,
    )
    print(scoring_response.content)
    actual_scoring_response = pd.read_json(
        scoring_response.content.decode("utf-8"), orient="records", encoding="utf8"
    ).values.astype(np.float32)
    np.testing.assert_allclose(actual_scoring_response, expected, rtol=1e-5)

    # test spark udf
    spark_udf_preds = score_model_as_udf(
        model_uri=os.path.abspath(model_path),
        pandas_df=pd.DataFrame(x),
        result_type="float",
    )
    # The original bare np.allclose(...) discarded its result, so this check
    # could never fail. rtol/atol below are np.allclose's defaults.
    np.testing.assert_allclose(
        np.array(spark_udf_preds),
        expected.reshape(len(spark_udf_preds)),
        rtol=1e-5,
        atol=1e-8,
    )
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    """Save the fixture's Spark model and compare three scoring paths.

    The model is saved with ``sparkm.save_model`` using ``spark_conda_env`` as
    the conda environment. Predictions from the reloaded Spark model, the
    reloaded pyfunc model and ``score_model_as_udf`` must each equal
    ``spark_model_iris.predictions``. Finally, ``sparkm.DFS_TMP`` must exist
    and be non-empty.
    """
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_conda_env)

    # 1. score and compare reloaded sparkml model
    spark_reloaded = sparkm.load_model(path=model_path)
    scored_rows = (
        spark_reloaded.transform(spark_model_iris.spark_df).select("prediction").collect()
    )
    spark_preds = [row.prediction for row in scored_rows]
    assert spark_model_iris.predictions == spark_preds

    # 2. score and compare reloaded pyfunc
    pyfunc_model = pyfunc.load_pyfunc(model_path)
    pyfunc_preds = pyfunc_model.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == pyfunc_preds

    # 3. score and compare reloaded pyfunc Spark udf
    udf_preds = score_model_as_udf(
        model_path, run_id=None, pandas_df=spark_model_iris.pandas_df
    )
    assert spark_model_iris.predictions == udf_preds

    assert os.path.exists(sparkm.DFS_TMP)
    dfs_contents = os.listdir(sparkm.DFS_TMP)
    print(dfs_contents)
    # We expect not to delete the DFS tempdir.
    assert dfs_contents