def randomSplitByUser(df, weights, seed=None):
    """Split ``df`` into training and test DataFrames, sampling per ``user_id``.

    Every distinct ``user_id`` is given the same sampling fraction,
    ``weights[0]``, and the training rows are drawn with
    ``DataFrame.sampleBy``. The rows of ``df`` that are absent from the
    training sample make up the test set.

    Args:
        df: Spark DataFrame containing a ``user_id`` column.
        weights: Sequence whose first element is the training fraction.
            Only ``weights[0]`` is read.
        seed: Optional seed forwarded to ``sampleBy``.

    Returns:
        A ``(training, test)`` tuple of Spark DataFrames; ``test`` is rebuilt
        from an RDD using the schema of ``df``.
    """
    train_fraction = weights[0]

    # The distinct user ids are collected to the driver to build the
    # per-stratum fraction mapping that sampleBy expects.
    distinct_users = df.select('user_id').distinct().collect()
    fractions = {}
    for user_row in distinct_users:
        fractions[user_row['user_id']] = train_fraction

    training = df.sampleBy('user_id', fractions, seed)

    # Whatever was not drawn for training becomes the test set.
    remaining_rdd = df.rdd.subtract(training.rdd)
    test = spark.createDataFrame(remaining_rdd, df.schema)
    return training, test
def pred():
    """Score the JSON body of the current request with the stored Spark model.

    The request payload is converted with ``jsntodf``, turned into a Spark
    DataFrame, and run through the model loaded from the hard-coded MLflow
    artifact directory. Only the ``lib1``, ``credit``, ``debit`` and
    ``predictedLabel`` columns are returned.

    Returns:
        The result of ``json.jsonify`` applied to the records-oriented JSON
        string of the selected prediction columns.
    """
    # presumably jsntodf returns something spark.createDataFrame accepts
    # (e.g. a pandas DataFrame) — TODO confirm against its definition.
    request_frame = jsntodf(request.json)
    spark_frame = spark.createDataFrame(request_frame)

    # The model is loaded on every call.
    model_path_dir = "mlruns/0/c1a2ff5b99c246f19ee6e2df4d1406fe/artifacts/spark-model"
    model = mlflow.spark.load_model(model_path_dir)

    scored = model.transform(spark_frame)
    selected = scored.select("lib1", "credit", "debit", "predictedLabel")

    # NOTE(review): to_json already returns a JSON string, so jsonify receives
    # a str rather than a list of records — confirm clients expect that.
    records_json = selected.toPandas().to_json(orient='records')
    return json.jsonify(records_json)
def sur_echant(df, p):
    """Rebalance ``df`` on the ``lib4`` column.

    The most frequent ``lib4`` value is down-sampled with fraction ``p`` via
    ``DataFrame.sampleBy``. Every other value is kept in full and then topped
    up towards ``int(max_count * p)`` rows by sampling its rows with
    replacement on the driver (through pandas) and appending them.

    Args:
        df: Spark DataFrame containing a ``lib4`` column.
        p: Sampling fraction applied to the largest class; it also scales the
            row count the other classes are raised to.

    Returns:
        The rebalanced Spark DataFrame. An input without rows is returned
        unchanged, and classes that already have at least the target number
        of rows are left as they are.
    """
    counts = df.groupBy('lib4').count().collect()
    if not counts:
        # Nothing to rebalance; max() below would fail on an empty list.
        return df

    categories = [row[0] for row in counts]
    values = [row[1] for row in counts]
    max_v = max(values)
    indx = values.index(max_v)

    # Fraction 1 keeps every row of the smaller classes; only the largest
    # class is thinned out.
    fractions = {cat: 1 for i, cat in enumerate(categories) if i != indx}
    fractions[categories[indx]] = p
    df = df.sampleBy("lib4", fractions=fractions)

    target = int(max_v * p)
    for i, cat in enumerate(categories):
        if i == indx:
            continue
        deficit = target - values[i]
        if deficit <= 0:
            # Already at or above the target: a negative sample size makes
            # pandas raise, and a zero-row frame cannot be turned back into a
            # Spark DataFrame by schema inference.
            continue
        # sampleBy with a single key at fraction 1 selects exactly the rows
        # of this class; the extra rows are then drawn with replacement.
        extra_rows = (
            df.sampleBy("lib4", fractions={cat: 1})
            .toPandas()
            .sample(deficit, replace=True)
        )
        df = df.union(spark.createDataFrame(extra_rows))
    return df
model_name = "rfr_{0}_{1}_{2}_{3}".format(
    airport_code,
    training_start_date,
    training_end_date,
    inference_date,
)

from pyspark.sql.types import StringType, DoubleType, StructType, StructField

# One row per registered model version: the run id plus that run's r2 and rmse.
runs_df_schema = StructType(
    [
        StructField("run_id", StringType(), True),
        StructField("r2", DoubleType(), True),
        StructField("rmse", DoubleType(), True),
    ]
)

# Look up the run behind every registered version of the model and record
# its metrics, echoing what was found along the way.
runs_df_data = []
for mv in client.search_model_versions(f"name='{model_name}'"):
    pprint(mv)
    run_id = dict(mv)["run_id"]
    run = client.get_run(run_id)
    run_metrics = run.data.metrics
    print(run_metrics)
    runs_df_data.append((run_id, run_metrics["r2"], run_metrics["rmse"]))
    print("~~~~~~~~~~~~~~~~")
print("$$$$$$$$$$$$$$$$$$$$$$")

runs_df = spark.createDataFrame(data=runs_df_data, schema=runs_df_schema)
display(runs_df)

# COMMAND ----------

# The first row after sorting by rmse, then r2 (both ascending) is the pick.
# https://stackoverflow.com/questions/40661859/getting-the-first-value-from-spark-sql-row
best_run_id = runs_df.orderBy("rmse", "r2").take(1)[0]["run_id"]
best_run_id
def create_spark_dataframes(train_data, test_data):
    """Build the train and test Spark DataFrames restricted to model columns.

    Both inputs are converted with ``.tolist()`` and loaded using the
    module-level ``schema``; each resulting DataFrame is then narrowed to
    ``price`` followed by the module-level ``features`` columns.

    Args:
        train_data: Object exposing ``tolist()`` that yields the training rows.
        test_data: Object exposing ``tolist()`` that yields the test rows.

    Returns:
        A ``(train_df, test_df)`` tuple of Spark DataFrames.
    """
    print('----------Preparing spark dataframes----------')
    # Both frames are created (train first) before any column selection.
    train_df, test_df = (
        spark.createDataFrame(rows.tolist(), schema=schema)
        for rows in (train_data, test_data)
    )
    columns = ('price', *features)
    return train_df.select(*columns), test_df.select(*columns)
# MAGIC ## Evaluate Model

# COMMAND ----------

predictions.createOrReplaceTempView("predictions")

# COMMAND ----------

# MAGIC %sql SELECT avg(CASE WHEN prediction = label THEN 1.0 ELSE 0.0 END) AS accuracy FROM predictions

# COMMAND ----------

# The last pipeline stage holds the tuned model.
bestModel = pipelineTrained.stages[-1].bestModel

# convert numpy.float64 to str for spark.createDataFrame()
weights = ('%.10f' % w for w in bestModel.featureImportances)

# Pair each formatted weight with its feature name, feeding the rows in
# descending feature-name order.
weighted_rows = sorted(
    zip(weights, featureCols),
    key=lambda pair: pair[1],
    reverse=True,
)
weightedFeatures = spark.createDataFrame(weighted_rows).toDF("weight", "feature")

(
    weightedFeatures
    .select("feature", "weight")
    .orderBy("weight", ascending=False)
    .display()
)

# COMMAND ----------

# MAGIC %md ## Saving our model to MLFLow registry

# COMMAND ----------

# DBTITLE 1,Save our new model to the registry as a new version
# get the best model having the best metrics.AUROC from the registry
finished_runs = mlflow.search_runs(
    filter_string='tags.model="turbine_gbt" and attributes.status = "FINISHED" and metrics.AUROC > 0'
)
best_models = finished_runs.sort_values(by=['metrics.AUROC'], ascending=False)
# Report the ranking metrics computed earlier for the train and test sets.
print("AUC train:", auc_train)
print("PR test:", pr_test)
print("AUC test:", auc_test)

# COMMAND ----------

# MAGIC %md ### Confusion Matrix Code-base
# MAGIC Subsequent cells will be using the following code to plot the confusion matrix.

# COMMAND ----------

# Create confusion matrix template
from pyspark.sql.functions import lit, expr, col, column

# Confusion matrix template: one row for each (label, prediction) combination
# of the values 0 and 1, registered as the temporary view "cmt".
cmt = spark.createDataFrame([(1, 0), (0, 0), (1, 1), (0, 1)], ["label", "prediction"])
cmt.createOrReplaceTempView("cmt")

# COMMAND ----------

# Source code for plotting confusion matrix is based on `plot_confusion_matrix`
# via https://runawayhorse001.github.io/LearningApacheSpark/classification.html#decision-tree-classification
import matplotlib.pyplot as plt
import numpy as np
import itertools

def plot_confusion_matrix(cm, title):
    # Clear the current matplotlib figure before anything is drawn.
    plt.gcf().clear()
# MAGIC
# MAGIC Colors correspond to years, which was selected as the feature that most strongly interacts with mortality rate. It's also not surprising that in later years (red), mortality rate is lower and thus life expectancy higher. There is a mild secondary trend here, seen when comparing the curve of blue points (longer ago) to red points (more recent). Predicted life expectancy, it might be said, varies less with this mortality rate recently than in the past.

# COMMAND ----------

# MAGIC %md
# MAGIC The United States stood out as an outlier in the life expectancy plot above. We might instead ask: how is the USA different relative to other countries? SHAP can help explain how features explain predicted life expectancy differently.

# COMMAND ----------

# Difference between the mean SHAP values of the USA rows and of all other rows.
usa_mask = X['Country_USA']
us_delta = shap_values[usa_mask].mean(axis=0) - shap_values[~usa_mask].mean(axis=0)

# Pair each delta (as a plain float) with its display column and keep the ten
# largest by absolute value.
importances = [
    (float(delta), column_name)
    for delta, column_name in zip(us_delta, display_cols)
]
top_importances = sorted(
    importances,
    key=lambda pair: abs(pair[0]),
    reverse=True,
)[:10]
display(spark.createDataFrame(top_importances, ["Mean SHAP delta", "Feature"]))

# COMMAND ----------

# MAGIC %md
# MAGIC Mortality rate due to cardiac disease, diabetes and cancer stands out in the USA. On average, it explains almost a year less life expectancy than in other countries.
# MAGIC
# MAGIC This model can now be moved to Production, for consumption and deployment for inference:

# COMMAND ----------

client.transition_model_version_stage(
    model_name,
    latest_model_detail.version,
    stage="Production",
)

# COMMAND ----------
# MAGIC Open SQL Analytics and query away! Some ideas:
# MAGIC https://e2-demo-west.cloud.databricks.com/sql/dashboards/92d8ccfa-10bb-411c-b410-274b64b25520-turbine-demo-predictions?o=2556758628403379

# COMMAND ----------

# MAGIC %md
# MAGIC ## Model Explainability
# MAGIC Our Spark model comes with a basic feature importance metric we can use to get a first understanding of our model:

# COMMAND ----------

# The last pipeline stage holds the tuned model.
bestModel = pipelineTrained.stages[-1].bestModel

# Convert numpy.float64 to str for spark.createDataFrame()
weights = ('%.10f' % w for w in bestModel.featureImportances)

# Pair each formatted weight with its feature name, feeding the rows in
# descending feature-name order.
weighted_rows = sorted(
    zip(weights, featureCols),
    key=lambda pair: pair[1],
    reverse=True,
)
weightedFeatures = spark.createDataFrame(weighted_rows).toDF("weight", "feature")

ranked_features = weightedFeatures.select("feature", "weight").orderBy("weight", ascending=False)
display(ranked_features)

# COMMAND ----------

# MAGIC %md #### Explaining our model with SHAP
# MAGIC Our model's feature importance is limited (we can't explain a single prediction) and can lead to a surprising result.
# MAGIC
# MAGIC You can have a look at [Scott Lundberg's blog post](https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27) for more details.
# MAGIC
# MAGIC Using `shap`, we can understand how our model is behaving for a specific row. Let's analyze the importance of each feature for the first row of our dataset.

# COMMAND ----------

import shap
import numpy as np
model_name = "rfr_{0}_{1}_{2}_{3}".format(airport_code, training_start_date, training_end_date, train_on) runs_df_schema = StructType([ \ StructField("run_id",StringType(),True), \ StructField("version",StringType(),True), \ StructField("r2",DoubleType(),True), \ StructField("rmse",DoubleType(),True), \ ]) runs_df_data = [] for mv in client.search_model_versions(f"name='{model_name}'"): run_id = dict(mv)["run_id"] run_version = dict(mv)["version"] run = client.get_run(run_id) runs_df_data.append((run_id, run_version, run.data.metrics["r2"], run.data.metrics["rmse"])) runs_df = spark.createDataFrame(data=runs_df_data,schema=runs_df_schema) runs_df = runs_df.sort("rmse", "r2") display(runs_df) # https://stackoverflow.com/questions/40661859/getting-the-first-value-from-spark-sql-row best_run = runs_df.take(1)[0] best_run_version.append(best_run[1]) second_best_run = runs_df.take(2)[1] second_best_run_version.append(second_best_run[1]) # COMMAND ---------- for i, train_on in enumerate(["DEP", "ARR"]): # archive the current production version model_name = "rfr_{0}_{1}_{2}_{3}".format(airport_code, training_start_date, training_end_date, train_on) for mv in client.search_model_versions(f"name='{model_name}'"): if dict(mv)['current_stage'] == 'Production':
import mlflow.spark
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Fits a small text-classification pipeline on an in-memory dataset and logs
# the fitted model plus two parameters to the "v0.8.1-udf" MLflow experiment.
if __name__ == '__main__':
    spark = pyspark.sql.SparkSession.builder.getOrCreate()

    # Columns are named "1" (id), "2" (text) and "label".
    rows = [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "mlflow spark integration", 1.0),
        (5, "try spark udf", 1.0),
    ]
    training = spark.createDataFrame(rows, ["1", "2", "label"])

    # Tokenize the text column, hash the tokens into features, then classify.
    tokenizer = Tokenizer(inputCol="2", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)

    mlflow.set_experiment("v0.8.1-udf")
    with mlflow.start_run():
        mlflow.spark.log_model(model, "spark-model")
        mlflow.log_param("keyword", "spark")
        mlflow.log_param("keys", training.count())
    print("Done Running Experiment v0.8.1-udf")