def score_data(abt_to_score, modelPath):
    """Score a DataFrame with a pipeline loaded from an MLeap bundle.

    :param abt_to_score: A pyspark DataFrame to score
    :param modelPath: Path of the .zip MLeap bundle; it is prefixed with
        ``jar:file:`` to build the bundle URI
    :return: the result of calling ``transform`` on ``abt_to_score`` with the
        deserialized pipeline
    """
    print('Scoring process starts...')
    bundle_uri = f"jar:file:{modelPath}"
    pipeline = PipelineModel.deserializeFromBundle(bundle_uri)
    return pipeline.transform(abt_to_score)
# MAGIC %md Serialize to bundle and deserialize

# COMMAND ----------

import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer

# Single location of the bundle, shared by the serialize and deserialize cells.
bundle_uri = "jar:file:/dbfs/mnt/nycitibike/spark-model/lr-spark-model.zip"

# Write the fitted model together with the transformed DataFrame to the bundle.
model.serializeToBundle(bundle_uri, sparkTransformed)

# COMMAND ----------

from pyspark.ml import PipelineModel

# Read the pipeline back from the bundle written above.
deserializedPipeline = PipelineModel.deserializeFromBundle(bundle_uri)

# COMMAND ----------

# Keep only the first 10 rows of the test data for a quick check.
test_df = testData.limit(10)

# COMMAND ----------

# Run the round-tripped pipeline on the sample and show the output.
exampleResults = deserializedPipeline.transform(test_df)
display(exampleResults)

# COMMAND ----------

# MAGIC %md Register model
# MAGIC
# MAGIC https://www.mlflow.org/docs/latest/model-registry.html#registering-a-model
# MAGIC rm -rf /tmp/mleap_python_model_export
# MAGIC mkdir /tmp/mleap_python_model_export
# MAGIC rm -rf /FileStore/future.zip

# Locations used below: the native Spark save directory and the MLeap bundle.
saved_model_dir = "/home/sriram/ma_future"
mleap_bundle_uri = "jar:file:/home/sriram/future4.zip"

# Save the model for future use.
model.save(saved_model_dir)

# This saves the pipeline using MLeap. It is not used afterwards; it is here
# only to demonstrate the call.
model.serializeToBundle(mleap_bundle_uri, transformed)
#dbutils.fs.cp("file:/tmp/mleap_python_model_export/future.zip", "file:/home/sriram/future.zip")
#display(dbutils.fs.ls("dbfs:/FileStore"))

# Deserialize the model from the saved bundle (MLeap again), and also load the
# natively saved copy.
deserializedPipeline = PipelineModel.deserializeFromBundle(mleap_bundle_uri)
new_model = PipelineModel.load(saved_model_dir)

# Time to test the model.
# Add your own comment and check.
d = [{"Comment": "this sucks"}]
df2 = spark.createDataFrame(d)
df2.show()

# Score the single-row DataFrame with both copies of the model.
new_predictions = new_model.transform(df2)
predictions = deserializedPipeline.transform(df2)

# Get the prediction of the first (only) row from the natively loaded model.
abc = new_predictions.select("prediction")
first_row = abc.collect()[0]
print(first_row.prediction)
def load_model(run, artifact_path):
    """Deserialize a pipeline model from a bundle under a run's artifact URI.

    :param run: object exposing ``run.info.artifact_uri``
    :param artifact_path: path appended to the artifact URI after a ``/``
    :return: whatever ``PipelineModel.deserializeFromBundle`` returns for the
        composed URI
    """
    artifact_root = run.info.artifact_uri
    bundle_uri = "{}/{}".format(artifact_root, artifact_path)
    # Echo the final URI so a failed load can be traced to the exact location.
    print("bundle_uri:", bundle_uri)
    model = PipelineModel.deserializeFromBundle(bundle_uri)
    return model
def load_model_as_spark_bundle(run, artifact_path):
    """Deserialize a pipeline model from a ``file:`` bundle URI of a run.

    The URI is ``file:<artifact_uri>/<artifact_path>`` with every ``dbfs:``
    occurrence rewritten to ``/dbfs``.

    :param run: object exposing ``run.info.artifact_uri``
    :param artifact_path: string appended after the artifact URI and a ``/``
    :return: whatever ``PipelineModel.deserializeFromBundle`` returns for the
        composed URI
    """
    artifact_root = run.info.artifact_uri
    raw_uri = "file:{}/".format(artifact_root) + artifact_path
    bundle_uri = raw_uri.replace("dbfs:", "/dbfs")
    print("bundle_uri:", bundle_uri)
    model = PipelineModel.deserializeFromBundle(bundle_uri)
    return model
from mleap.pyspark.spark_support import SimpleSparkSerializer

# Serialize the model to a local zip file in JSON format.
# `model_name_export` must already be defined; the assignment below is
# commented out in this script.
#model_name_export = "adult_census_pipeline.zip"
model_name_path = cwd
model_file = os.path.join(model_name_path, model_name_export)

# Remove an old model file, if needed.
if os.path.isfile(model_file):
    os.remove(model_file)

model_file_path = f"jar:file:{model_file}"
train_transformed = model.transform(train)
model.serializeToBundle(model_file_path, train_transformed)

## Import the MLeap model back and check that its string form matches the
## original model's.
model_deserialized = PipelineModel.deserializeFromBundle(model_file_path)
assert str(model_deserialized) == str(model)
print("The deserialized model is ", model_deserialized)
print("The deserialized model stages are", model_deserialized.stages)

##############################################################################
## Export the final model with MLeap.
## Remove the StringIndexer for the label column so it won't be required for
## prediction.
model_final = model.copy()
si_label_index = -3
del model_final.stages[si_label_index]  # si_label

## append an IndexToString transformer to the model pipeline to get the original labels
# COMMAND ----------

import mlflow

# Look up the run whose artifacts hold the MLeap bundle.
client = mlflow.tracking.MlflowClient()
run = client.get_run(run_id)
run.info.artifact_uri

# COMMAND ----------

# Build the bundle location: artifact URI + fixed sub-path, with "dbfs:"
# rewritten to "/dbfs", then prefixed with "file:".
mleap_path = f"{run.info.artifact_uri}/mleap-model/mleap/model".replace("dbfs:", "/dbfs")
bundle_path = f"file:{mleap_path}"
bundle_path

# COMMAND ----------

from pyspark.ml import PipelineModel
from mleap.pyspark.spark_support import SimpleSparkSerializer

# Load the pipeline from the bundle and score `data` with it.
model = PipelineModel.deserializeFromBundle(bundle_path)
predictions = model.transform(data)
selected = predictions.select(colPrediction, colLabel, colFeatures)
display(selected)

# COMMAND ----------

# MAGIC %md #### Return

# COMMAND ----------

dbutils.notebook.exit(run_id)
def read_model_as_spark_bundle(bundle_uri):
    """Deserialize a pipeline model from the given bundle URI.

    :param bundle_uri: URI passed unchanged to
        ``PipelineModel.deserializeFromBundle``
    :return: the object returned by ``PipelineModel.deserializeFromBundle``
    """
    pipeline_model = PipelineModel.deserializeFromBundle(bundle_uri)
    return pipeline_model
def test_profile_sparkml_pipeline(self):
    """Profile one fitted pipeline in Spark, MLeap and the ONNX runtime.

    Fits a logistic-regression pipeline on the adult-census CSV, converts it
    to ONNX and to an MLeap bundle, then scores the same single test record
    20 times with each backend. Per-call latencies (milliseconds) are
    collected and passed to ``gen_plot``. On the first iteration only, the
    MLeap and ONNX outputs are compared against the Spark prediction.

    Raises:
        AssertionError: if the ONNX conversion returns ``None``.
    """
    import inspect
    import os
    import numpy
    import pandas
    import time
    import pathlib
    # NOTE(review): the two mleap imports are not referenced by name below;
    # presumably they are imported for their side effect of making
    # serializeToBundle / deserializeFromBundle available - confirm.
    import mleap.pyspark
    from mleap.pyspark.spark_support import SimpleSparkSerializer
    from pyspark.ml import PipelineModel

    # add additional jar files before creating SparkSession
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv') \
        .options(header='true', inferschema='true').load(input_path)
    training_data, test_data = full_data.randomSplit([0.9, 0.1], seed=1)

    label = "income"
    dtypes = dict(training_data.dtypes)
    dtypes.pop(label)

    # String columns are indexed and one-hot encoded; all other columns are
    # used as features directly.
    si_xvars = []
    ohe_xvars = []
    feature_cols = []
    for key, dtype in dtypes.items():
        if dtype == "string":
            feature_col = "-".join([key, "encoded"])
            feature_cols.append(feature_col)

            tmp_col = "-".join([key, "tmp"])
            si_xvars.append(StringIndexer(inputCol=key, outputCol=tmp_col, handleInvalid="skip"))
            ohe_xvars.append(OneHotEncoderEstimator(inputCols=[tmp_col], outputCols=[feature_col], dropLast=False))
        else:
            feature_cols.append(key)
    si_label = StringIndexer(inputCol=label, outputCol='label')
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    lr = LogisticRegression(regParam=0.001)
    pipeline = Pipeline(stages=si_xvars + ohe_xvars + [si_label, assembler, lr])

    # filter out the records which will cause error
    # use only one record for prediction
    test_data = test_data.limit(1)

    # create Spark and Onnx models
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', buildInitialTypesSimple(test_data))
    # save Onnx model for runtime usage
    if model_onnx is None:
        raise AssertionError("Failed to create the onnx model")
    # NOTE(review): this path is relative to the current working directory,
    # while the MLeap bundle below is placed under this_script_dir - confirm
    # that the difference is intended.
    model_path = os.path.join("tests", "profile_pipeline_model.onnx")
    with open(model_path, "wb") as f:
        f.write(model_onnx.SerializeToString())

    # Create MLeap model
    model_zip_path = os.path.join(this_script_dir, "tests", "mleap-pipeline.zip")
    if os.path.exists(model_zip_path):
        os.remove(model_zip_path)
    model_zip_url = "jar:" + pathlib.Path(model_zip_path).as_uri()
    # save the pipeline also in MLeap format
    # An empty DataFrame carrying the transformed schema is passed to
    # serializeToBundle, so no rows are needed for the export.
    empty_df = self.spark.createDataFrame([], model.transform(test_data).schema)
    model.serializeToBundle(model_zip_url, empty_df)
    mleap_pipeline = PipelineModel.deserializeFromBundle(model_zip_url)

    spark_times = []
    mleap_times = []
    runtime_times = []
    for i in range(20):
        data_np = buildInputDictSimple(test_data)

        # run the model in Spark
        # perf_counter is monotonic and high-resolution, unlike the wall
        # clock, so short intervals are measured reliably.
        start = time.perf_counter()
        spark_prediction = model.transform(test_data)
        end = time.perf_counter()
        spark_times.append(1000 * (end - start))

        # run with MLeap
        start = time.perf_counter()
        mleap_prediction = mleap_pipeline.transform(test_data)
        end = time.perf_counter()
        mleap_times.append(1000 * (end - start))

        if i == 0:  # compare only once
            _compare_mleap_pyspark(mleap_prediction, spark_prediction)

        # run the model in onnx runtime
        start = time.perf_counter()
        output, session = run_with_runtime(data_np, model_path)
        end = time.perf_counter()
        runtime_times.append(1000 * (end - start))

        # compare results
        if i == 0:  # compare only once
            # Convert to pandas a single time; each toPandas() call would
            # otherwise collect the Spark result again.
            spark_pdf = spark_prediction.toPandas()
            expected = [
                spark_pdf.label.values.astype(numpy.float32),
                spark_pdf.prediction.values.astype(numpy.float32),
                spark_pdf.probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
            ]
            _compare_expected(expected, output, session, model_path, decimal=5, onnx_shape=None)

    gen_plot(spark_times, mleap_times, runtime_times)
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

# refer to this post for more details: http://stackoverflow.com/questions/38669206/spark-2-0-relative-path-in-absolute-uri-spark-warehouse
# The builder chain is wrapped in parentheses rather than joined with
# backslashes: a comment line inside a backslash continuation terminates the
# statement and turns the following `.config(...)` line into a syntax error.
spark = (
    SparkSession
    .builder
    .appName("MNIST Classifier")
    # .config('spark.sql.warehouse.dir', 'file:///random/path/as/we/need/to/config/this/but/dont/use/it')
    .config('spark.executor.instances', 10)
    .getOrCreate()
)

# Load the test set from CSV, letting Spark infer the column types.
fileNameTest = './mnist_test.csv'
testData = spark.read.csv(fileNameTest, header=True, inferSchema=True)

# Load the pipeline from the MLeap bundle and score the test data with it.
deserializedPipeline = PipelineModel.deserializeFromBundle("jar:file:/tmp/pipeline-mnist-classifier-json.zip")

result = deserializedPipeline.transform(testData)
print("Result: " + str(result))

#testprediction = bestModel.transform(testData)

#evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="labelIndex", metricName="f1")
#print("Precision: " + str(evaluator.evaluate(testprediction)))