def test_model_deployment(spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(spark_model_iris.model, path=model_path,
                      conda_env=spark_custom_env,
                      # Test both spark ml and mleap
                      sample_input=spark_model_iris.spark_df)

    # 1. score and compare pyfunc deployed in Sagemaker docker container
    scoring_response_1 = score_model_in_sagemaker_docker_container(
        model_uri=model_path,
        data=spark_model_iris.pandas_df,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
        flavor=kiwi.pyfunc.FLAVOR_NAME)
    np.testing.assert_array_almost_equal(
        spark_model_iris.predictions,
        np.array(json.loads(scoring_response_1.content)),
        decimal=4)

    # 2. score and compare mleap deployed in Sagemaker docker container
    scoring_response_2 = score_model_in_sagemaker_docker_container(
        model_uri=model_path,
        data=spark_model_iris.pandas_df.to_json(orient="split"),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        flavor=kiwi.mleap.FLAVOR_NAME)
    np.testing.assert_array_almost_equal(
        spark_model_iris.predictions,
        np.array(json.loads(scoring_response_2.content)),
        decimal=4)

def test_mleap_module_model_save_with_invalid_sample_input_type_raises_exception(
        spark_model_iris, model_path):
    with pytest.raises(Exception):
        invalid_input = pd.DataFrame()
        sparkm.save_model(spark_model=spark_model_iris.model,
                          path=model_path,
                          sample_input=invalid_input)

def test_transformer_model_export(spark_model_transformer, model_path, spark_custom_env):
    with pytest.raises(MlflowException) as e:
        sparkm.save_model(
            spark_model_transformer.model,
            path=model_path,
            conda_env=spark_custom_env)
    assert "Cannot serialize this model" in e.value.message

def test_sparkml_model_save_without_specified_conda_env_uses_default_env_with_expected_dependencies(
        spark_model_iris, model_path):
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path, conda_env=None)

    pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME)
    conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    with open(conda_env_path, "r") as f:
        conda_env = yaml.safe_load(f)

    assert conda_env == sparkm.get_default_conda_env()

def test_estimator_model_export(spark_model_estimator, model_path, spark_custom_env):
    sparkm.save_model(spark_model_estimator.model, path=model_path, conda_env=spark_custom_env)

    # 1. score and compare the reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_estimator.spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_estimator.predictions == preds

    # 2. score and compare reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_estimator.spark_df.toPandas())
    assert spark_model_estimator.predictions == preds2

def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
        spark_model_iris, model_path):
    class CustomTransformer(JavaModel):
        def _transform(self, dataset):
            return dataset

    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)

    with pytest.raises(ValueError):
        sparkm.save_model(spark_model=unsupported_model,
                          path=model_path,
                          sample_input=spark_model_iris.spark_df)

def test_sparkml_model_load_from_remote_uri_succeeds(spark_model_iris, model_path, mock_s3_bucket):
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path)

    artifact_root = "s3://{bucket_name}".format(bucket_name=mock_s3_bucket)
    artifact_path = "model"
    artifact_repo = S3ArtifactRepository(artifact_root)
    artifact_repo.log_artifacts(model_path, artifact_path=artifact_path)

    model_uri = artifact_root + "/" + artifact_path
    reloaded_model = sparkm.load_model(model_uri=model_uri)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds

def test_sparkml_model_save_accepts_conda_env_as_dict(spark_model_iris, model_path):
    conda_env = dict(kiwi.spark.get_default_conda_env())
    conda_env["dependencies"].append("pytest")
    sparkm.save_model(spark_model=spark_model_iris.model,
                      path=model_path,
                      conda_env=conda_env)

    pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME)
    saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    assert os.path.exists(saved_conda_env_path)

    with open(saved_conda_env_path, "r") as f:
        saved_conda_env_parsed = yaml.safe_load(f)
    assert saved_conda_env_parsed == conda_env

def test_sagemaker_docker_model_scoring_with_default_conda_env(spark_model_iris, model_path):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=None)

    scoring_response = score_model_in_sagemaker_docker_container(
        model_uri=model_path,
        data=spark_model_iris.pandas_df,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        flavor=kiwi.pyfunc.FLAVOR_NAME)
    deployed_model_preds = np.array(json.loads(scoring_response.content))

    np.testing.assert_array_almost_equal(
        deployed_model_preds,
        spark_model_iris.predictions,
        decimal=4)

def test_spark_module_model_save_with_relative_path_and_valid_sample_input_produces_mleap_flavor(
        spark_model_iris):
    with TempDir(chdr=True) as tmp:
        model_path = os.path.basename(tmp.path("model"))
        mlflow_model = Model()
        sparkm.save_model(spark_model=spark_model_iris.model,
                          path=model_path,
                          sample_input=spark_model_iris.spark_df,
                          mlflow_model=mlflow_model)
        assert mleap.FLAVOR_NAME in mlflow_model.flavors

        config_path = os.path.join(model_path, "MLmodel")
        assert os.path.exists(config_path)
        config = Model.load(config_path)
        assert mleap.FLAVOR_NAME in config.flavors

def test_sparkml_model_save_persists_specified_conda_env_in_mlflow_model_directory(
        spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(spark_model=spark_model_iris.model,
                      path=model_path,
                      conda_env=spark_custom_env)

    pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME)
    saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    assert os.path.exists(saved_conda_env_path)
    assert saved_conda_env_path != spark_custom_env

    with open(spark_custom_env, "r") as f:
        spark_custom_env_parsed = yaml.safe_load(f)
    with open(saved_conda_env_path, "r") as f:
        saved_conda_env_parsed = yaml.safe_load(f)
    assert saved_conda_env_parsed == spark_custom_env_parsed

def test_model_export(spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_custom_env)

    # 1. score and compare reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1

    # 2. score and compare reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2

    # 3. score and compare reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_uri=model_path, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3

    assert os.path.exists(sparkm.DFS_TMP)

def test_model_export_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                sparkm.save_model(spark_model_iris.model,
                                  path=path,
                                  signature=signature,
                                  input_example=example)
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())