def test_spark_udf(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )
    # Loading the model should trigger exactly one dependency-mismatch check.
    with mock.patch(
        "mlflow.pyfunc._warn_dependency_requirement_mismatches"
    ) as mock_check_fn:
        reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)
        mock_check_fn.assert_called_once()

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types.
    type_map = {
        "float": (FloatType(), np.number),
        "int": (IntegerType(), np.int32),
        "double": (DoubleType(), np.number),
        "long": (LongType(), np.int64),  # np.int was removed in NumPy 1.24; use np.int64
        "string": (StringType(), None),
    }

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)
            expected = [
                list(row[1]) if is_array else row[1][0] for row in expected.iterrows()
            ]
            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()["prediction"])
            assert expected == actual
            if not is_array:
                # A string type name should behave the same as the type object.
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()["prediction"])
                assert expected == actual
def _model_udf(self) -> Any:
    from mlflow import pyfunc

    spark = default_session()
    return pyfunc.spark_udf(spark, model_uri=self._model_uri, result_type=self._return_type)
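# Usage sketch for the wrapper above (a hedged illustration: the load_model
# helper and the runs:/ URI placeholder come from the pandas-on-Spark MLflow
# integration, not from the snippet itself):
#
#   from pyspark.pandas.mlflow import load_model
#   model = load_model(model_uri="runs:/<run_id>/model")
#   psdf["prediction"] = model.predict(psdf)  # predict() applies _model_udf() column-wise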
def main():
    parser = argparse.ArgumentParser(description="Deploy and test batch model")
    parser.add_argument("-m", "--model_name", help="Model name", required=True)
    parser.add_argument("-r", "--root_path", help="Prefix path", required=True)
    parser.add_argument("-s", "--stage", help="Stage", default="staging", required=True)
    parser.add_argument("-d", "--db_name", help="Output database name", default="wine", required=False)
    parser.add_argument(
        "-t",
        "--table_name",
        help="Output table name",
        default="mlops_wine_quality_regression",
        required=False,
    )
    # parser.add_argument("-p", "--phase", help="Phase", default="qa", required=True)
    args = parser.parse_args()

    model_name = args.model_name
    home = args.root_path
    stage = args.stage
    # Sanitize the database name so it is a valid metastore identifier.
    db = args.db_name.replace("@", "_").replace(".", "_")
    ml_output_predictions_table = args.table_name
    # phase = args.phase

    temp_data_path = "/dbfs/tmp/mlflow-wine-quality.csv"
    data_uri = (
        "https://raw.githubusercontent.com/mlflow/mlflow/master/examples/"
        "sklearn_elasticnet_wine/wine-quality.csv"
    )
    dbfs_wine_data_path = download_wine_file(data_uri, home, temp_data_path)

    # Load the wine data, drop the label column, and cast all features to float.
    wine_df = (
        spark.read.format("csv")
        .option("header", "true")
        .load(dbfs_wine_data_path)
        .drop("quality")
        .cache()
    )
    wine_df = wine_df.select(
        *(col(column).cast("float").alias(column.replace(" ", "_")) for column in wine_df.columns)
    )
    data_spark = wine_df
    # wine_data_path = dbfs_wine_data_path.replace("dbfs:", "/dbfs")

    # Resolve the latest model version in the requested stage to a runs:/ URI.
    client = mlflow.tracking.MlflowClient()
    latest_model = client.get_latest_versions(name=model_name, stages=[stage])
    print(f"Latest Model: {latest_model}")
    model_uri = "runs:/{}/model".format(latest_model[0].run_id)
    print(f"model_uri: {model_uri}")

    # Score the dataset with the model wrapped as a Spark UDF.
    udf = pyfunc.spark_udf(spark, model_uri)
    # data_spark = spark.read.csv(dbfs_wine_data_path, header=True)
    predictions = data_spark.select(udf(*data_spark.columns).alias("prediction"), "*")

    spark.sql(f"CREATE DATABASE IF NOT EXISTS {db}")
    spark.sql(f"DROP TABLE IF EXISTS {db}.{ml_output_predictions_table}")
    predictions.write.format("delta").mode("overwrite").saveAsTable(
        f"{db}.{ml_output_predictions_table}"
    )

    output = json.dumps({"model_name": model_name, "model_uri": model_uri})
    print(output)
def test_spark_udf(self):
    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = self.spark.createDataFrame(pandas_df)

    # Test all supported return types.
    type_map = {
        "float": (FloatType(), np.number),
        "int": (IntegerType(), np.int32),
        "double": (DoubleType(), np.number),
        "long": (LongType(), np.int64),  # np.int was removed in NumPy 1.24; use np.int64
        "string": (StringType(), None),
    }

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = ConstPyfunc.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)
            expected = [
                list(row[1]) if is_array else row[1][0] for row in expected.iterrows()
            ]
            pyfunc_udf = spark_udf(self.spark, self._model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()["prediction"])
            assert expected == actual
            if not is_array:
                # A string type name should behave the same as the type object.
                pyfunc_udf = spark_udf(self.spark, self._model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()["prediction"])
                assert expected == actual
def score_model_as_udf(model_uri, pandas_df, result_type="double"):
    spark = get_spark_session(pyspark.SparkConf())
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = spark_udf(spark=spark, model_uri=model_uri, result_type=result_type)
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [x["prediction"] for x in new_df.collect()]
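# Example call for the helper above (hedged sketch; the runs:/ URI is a
# placeholder and the column names are illustrative, not from the source):
#
#   preds = score_model_as_udf(
#       model_uri="runs:/<run_id>/model",
#       pandas_df=pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]}),
#       result_type="double",
#   )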
def score_model_as_udf(model_path, run_id, pandas_df, result_type="double"):
    spark = (
        pyspark.sql.SparkSession.builder
        .config(key="spark.python.worker.reuse", value=True)
        .master("local-cluster[2, 1, 1024]")
        .getOrCreate()
    )
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = spark_udf(spark, model_path, run_id, result_type=result_type)
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [x["prediction"] for x in new_df.collect()]
def test_spark_udf_env_manager_predict_sklearn_model(spark, sklearn_model, model_path, env_manager):
    model, inference_data = sklearn_model
    mlflow.sklearn.save_model(model, model_path)
    expected_pred_result = model.predict(inference_data)

    infer_data = pd.DataFrame(inference_data, columns=["a", "b"])
    infer_spark_df = spark.createDataFrame(infer_data)

    pyfunc_udf = spark_udf(spark, model_path, env_manager=env_manager)
    result = (
        infer_spark_df.select(pyfunc_udf("a", "b").alias("predictions"))
        .toPandas()
        .predictions.to_numpy()
    )
    np.testing.assert_allclose(result, expected_pred_result, rtol=1e-5)
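# A self-contained sketch of the same flow outside a test harness: train a tiny
# sklearn model, save it, and score it through spark_udf. The path, data, and
# column names below are illustrative assumptions, not taken from the snippets above.
import numpy as np
import pandas as pd
import pyspark
from sklearn.linear_model import LinearRegression

import mlflow.sklearn
from mlflow.pyfunc import spark_udf

if __name__ == "__main__":
    X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    y = np.array([3.0, 7.0, 11.0])
    model = LinearRegression().fit(X, y)

    model_path = "/tmp/sklearn_spark_udf_model"  # save_model requires a fresh path
    mlflow.sklearn.save_model(model, model_path)

    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    infer_df = spark.createDataFrame(pd.DataFrame(X, columns=["a", "b"]))
    pyfunc_udf = spark_udf(spark, model_path, result_type="double")
    infer_df.withColumn("prediction", pyfunc_udf("a", "b")).show()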
def test_spark_udf(self):
    pandas_df = self._pandas_df
    spark_df = self.spark.createDataFrame(pandas_df)
    pyfunc_udf = spark_udf(self.spark, self._model_path, result_type="integer")
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*self._pandas_df.columns))
    spark_results = new_df.collect()

    # Compare against directly running the model.
    direct_model = load_pyfunc(self._model_path)
    pandas_results = direct_model.predict(pandas_df)
    self.assertEqual(178, len(pandas_results))
    self.assertEqual(178, len(spark_results))
    for i in range(0, len(pandas_results)):  # noqa
        self.assertEqual(self._predict[i], pandas_results[i])
        self.assertEqual(pandas_results[i], spark_results[i]["prediction"])
def _model_udf(self):
    spark = default_session()
    return pyfunc.spark_udf(spark, model_uri=self._model_uri, result_type=self._return_type)
import pyspark
from pyspark.sql.types import StringType

from mlflow.pyfunc import spark_udf

if __name__ == "__main__":
    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.createDataFrame(
        [
            (4, "spark i j k"),
            (5, "l m n"),
            (6, "spark hadoop spark"),
            (7, "apache hadoop"),
        ],
        [str(1), str(2)],
    )
    # Passes a run ID positionally, per the older pre-1.0 spark_udf signature
    # spark_udf(spark, path, run_id, result_type=...).
    pyfunc_udf = spark_udf(
        spark, "spark-model", "f2ccde5b33ce456d973ce9f91de8cadf", result_type=StringType()
    )
    new_df = spark_df.withColumn("prediction", pyfunc_udf(str(1), str(2)))
    new_df.show()
import os

import pandas as pd
import pyspark
from pyspark.sql.types import DoubleType
from sklearn.model_selection import train_test_split

from mlflow.pyfunc import spark_udf

if __name__ == "__main__":
    spark = pyspark.sql.SparkSession.builder.getOrCreate()

    # Read the wine-quality csv file (make sure you're running this from the root of MLflow!)
    wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "wine-quality.csv")
    data = pd.read_csv(wine_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    test_y = test[["quality"]]
    pdf = pd.DataFrame(test_y)
    spark_df = spark.createDataFrame(pdf)

    pyfunc_udf = spark_udf(
        spark, "model", "3774808880c14057abcc89106caa70f9", result_type=DoubleType()
    )
    new_df = spark_df.withColumn("prediction", pyfunc_udf("quality"))
    new_df.show()