modelURI

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 2: Load our model and apply predictions
# MAGIC We load the registered model with the MLflow Spark flavor and apply it to the bank marketing training set to generate predictions.

# COMMAND ----------

import mlflow.spark

# COMMAND ----------

# Note: loading the Spark-flavor model can take several minutes.
spark_model = mlflow.spark.load_model(modelURI)

# COMMAND ----------

df = spark.sql("select * from max_db.bank_marketing_train_set")

# COMMAND ----------

resultDF = spark_model.transform(df.drop("label"))

# COMMAND ----------

display(resultDF.drop("features", "rawPrediction"))

# COMMAND ----------
model_stage = "Staging" # move the model to the appropriate stage. client.transition_model_version_stage(name=model_name, version=model_version, stage=model_stage) predicted_inference_DF = pipelineModel.transform(df_inference) # the idea now is to return the predicted delay for each model version and save these things in a table such as the one in notebook 06 RandomForest with Time & Weather. return predicted_inference_DF # COMMAND ---------- inputs = spark.sql(""" SELECT * FROM bronze_air_traffic_cleaned_v3 WHERE ORIGIN IN ("JFK","SEA","BOS","ATL","LAX","SFO","DEN","DFW","ORD","CVG","CLT","DCA","IAH") AND DEST IN ("JFK","SEA","BOS","ATL","LAX","SFO","DEN","DFW","ORD","CVG","CLT","DCA","IAH") """) # COMMAND ---------- display(inputs) # COMMAND ---------- predicted_inference_DF = train_model(inputs, 6, 50) # COMMAND ---------- predicted_inference_DF.select("SCHEDULED_DEP_TIME", "ARR_DELAY", "prediction").display()
maxIter = int(sys.argv[2])
regParam = float(sys.argv[3])
elasticNetParam = float(sys.argv[4])

log_param("modelType", modelType)
log_param("maxIter", maxIter)
log_param("regParam", regParam)
log_param("elasticNetParam", elasticNetParam)

spark = SparkSession \
    .builder \
    .appName("Python Spark MLFlow basic example") \
    .enableHiveSupport() \
    .getOrCreate()

df_102 = spark.sql("SELECT * from default.nyc_trips_final_102").na.drop()
df_102 = df_102.withColumnRenamed("fare_amt", "label")
df_102 = df_102.withColumn("day_of_week_new", df_102.day_of_week.cast("int"))

paymentIndexer = StringIndexer(
    inputCol="payment_type", outputCol="payment_indexed").setHandleInvalid("skip")
vendorIndexer = StringIndexer(
    inputCol="vendor_name", outputCol="vendor_indexed").setHandleInvalid("skip")

assembler = VectorAssembler(inputCols=[
    "passenger_count", "trip_distance", "hour", "day_of_week_new",
    "start_cluster", "payment_indexed", "vendor_indexed"
],
dbutils.fs.ls("/mnt/%s" % mount_name) except: print("bucket isn't mounted, mount the demo bucket under %s" % mount_name) dbutils.fs.mount("s3a://%s" % aws_bucket_name, "/mnt/%s" % mount_name) # COMMAND ---------- current_user = dbutils.notebook.entry_point.getDbutils().notebook().getContext( ).tags().apply('user') dbName = re.sub(r'\W+', '_', current_user) path = "/Users/{}/demo".format(current_user) dbutils.widgets.text("path", path, "path") dbutils.widgets.text("dbName", dbName, "dbName") print("using path {}".format(path)) spark.sql( """create database if not exists {} LOCATION '{}/global_demo/tables' """. format(dbName, path)) spark.sql("""USE {}""".format(dbName)) # COMMAND ---------- tables = [ "turbine_bronze", "turbine_silver", "turbine_gold", "turbine_power", "turbine_schema_evolution" ] reset_all = dbutils.widgets.get("reset_all_data") == "true" or any([ not spark.catalog._jcatalog.tableExists(table) for table in ["turbine_power"] ]) if reset_all: print("resetting data")
# COMMAND ----------

# MAGIC %sql
# MAGIC select * from tempFlightsWeatherB

# COMMAND ----------

dff2 = spark.table('tempFlightsWeatherB')
dff2 = dff2.withColumnRenamed('prcp', 'dest_prcp')
dff2 = dff2.drop('date')
dff2 = dff2.drop('iata')

# COMMAND ----------

display(dff2)

# COMMAND ----------

dff2.write.format("delta").save('/mnt/delta/clemens/airaugmented')
spark.sql(
    "create table clemens.flightdelays_augmented using delta location '/mnt/delta/clemens/airaugmented'"
)

# COMMAND ----------

# MAGIC %sql
# MAGIC select * from clemens.flightdelays_augmented

# COMMAND ----------
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
import mlflow
from mlflow import spark
import mlflow.mleap  # does this work with Spark 3.0?
#import mlflow.pyfunc
import mleap.pyspark

# COMMAND ----------

#df = spark.sql("SELECT trip_duration,start_station_id,birth_year,unknown_gender,male_gender,female_gender,Subscriber,Customer,real_distance,((real_distance / trip_duration)* 3.6) as vitesse, DATE(start_time) as date,HOUR(start_time) as hour FROM CitibikeNY NATURAL JOIN citybike_station_distance")
df = spark.sql(
    "SELECT trip_duration,start_station_id,birth_year,unknown_gender,male_gender,female_gender,Subscriber,Customer,distance_bwn_stations,(((distance_bwn_stations * 1000) / trip_duration)* 3.6) as vitesse, DATE(start_time) as date,HOUR(start_time) as hour FROM CitibikeNY2 NATURAL JOIN citybike_station_distance"
)

# COMMAND ----------

df = df.filter((df.vitesse > 13) & (df.vitesse < 32))

# COMMAND ----------

df = spark.sql("SELECT * FROM tab_nycitibike")

# COMMAND ----------

display(df)

# COMMAND ----------
# The experiment id below needs to be substituted with your own experiment id
mlflow_experiment_id = 0

# Including MLflow
import mlflow
import mlflow.spark
import os

print("MLflow Version: %s" % mlflow.__version__)

# COMMAND ----------

# Create the df DataFrame, which contains our simulated financial fraud detection dataset
df = spark.sql(
    "select step, type, amount, nameOrig, oldbalanceOrg, newbalanceOrig, nameDest, oldbalanceDest, newbalanceDest, isFraud from sim_fin_fraud_detection"
)

# COMMAND ----------

# Review the schema of your data
df.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Calculate Differences between Originating and Destination Balances
# MAGIC With the following PySpark DataFrame query, we will calculate the following columns:
# MAGIC
# MAGIC | New Column | Definition |
# MAGIC | ---------- | ---------- |
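# COMMAND ----------

# MAGIC %md
# MAGIC As a minimal sketch of the balance-difference calculation described above: the new column names `orgDiff` and `destDiff` are assumptions for illustration, not confirmed by this notebook.

# COMMAND ----------

# Hedged sketch (assumed column names): compute the change in the originating
# and destination account balances for each transaction.
df = df.withColumn("orgDiff", df.newbalanceOrig - df.oldbalanceOrg) \
       .withColumn("destDiff", df.newbalanceDest - df.oldbalanceDest)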
# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Upload and Read Sensor Dataset
# MAGIC
# MAGIC For the training dataset, you will need to upload some data to the Databricks File System (DBFS). Go to File > Upload Data and click "Browse" in the middle box to open the file explorer on your local computer. Navigate to the place where you downloaded the artifacts for this workshop, go into the `/Datasets` folder, and choose `sensordata.csv`. Once you see a green checkmark, press **Next** and then **Done** on the next screen.

# COMMAND ----------

# MAGIC %md
# MAGIC Here we will create a database to store some of the tables that we will create during this workshop. The first table will be a Delta Lake table that holds our uploaded sensor data.

# COMMAND ----------

from pyspark.sql.functions import col

MODEL_PROJECT_NAME = dbutils.widgets.get("model_project_name")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {MODEL_PROJECT_NAME}")

username = spark.sql("SELECT current_user()").collect()[0][0]

sensorData = spark.read.csv(
    f"dbfs:/FileStore/shared_uploads/{username}/sensordata.csv",
    header=True,
    inferSchema=True)
sensorData.write.saveAsTable(f"{MODEL_PROJECT_NAME}.sensor",
                             format="delta",
                             mode="overwrite")

dataDf = spark.table(f"{MODEL_PROJECT_NAME}.sensor").where(col('Device') == 'Device001')

# COMMAND ----------

# MAGIC %md
# MAGIC With our sensor data table saved, we can create an MLflow experiment to house the metrics that we log during our training runs.
# MAGIC
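# MAGIC A minimal sketch of that step, assuming the experiment lives under the current user's workspace folder (the experiment path below is an assumption, not part of the original workshop):

# COMMAND ----------

import mlflow

# Assumed experiment path; adjust to wherever your experiment should live.
experiment_path = f"/Users/{username}/sensor_experiment"
mlflow.set_experiment(experiment_path)

# COMMAND ----------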
import matplotlib.pyplot as plt
import pyspark.sql.functions as sqlf
from pyspark.sql.functions import col, to_date

# COMMAND ----------

# Load Departure Dataframe
#silverArrDF_2 = spark.sql("""
#  SELECT *
#  FROM {}.silverarr_delta
#  WHERE FL_DATE
#  """.format(GROUP_DBNAME))

silverArrDF_2 = spark.sql("""
  SELECT *
  FROM {0}.silverdep_delta
  WHERE FL_DATE BETWEEN '{1}' AND '{2}'
  """.format(GROUP_DBNAME, training_start_date, training_end_date))

# Clean Arrival Data
silverArrDF_2 = silverArrDF_2.filter(col('ARR_DELAY').isNotNull())
silverArrDF_2 = silverArrDF_2.filter(col('DEP_DELAY').isNotNull())
silverArrDF_2 = silverArrDF_2.drop('FL_DATE')
silverArrDF_2 = drop_null_columns(silverArrDF_2)

# Transform into Pandas Dataframe
silverArrDF_2 = silverArrDF_2.toPandas()

display(silverArrDF_2.head())

# COMMAND ----------
def getProdModelURI(modelRegistryName):
    models = client.search_model_versions("name='%s'" % modelRegistryName)
    source = [model for model in models if model.current_stage == "Production"][0].source
    return source

modelURI = getProdModelURI(modelRegistryName)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 2: Load our model and apply predictions
# MAGIC We can use the MLflow Spark flavor to load the LightGBM model. This may sound counterintuitive; however, it works because the MMLSpark LightGBM classifier is itself a Spark ML model under the hood.

# COMMAND ----------

import mlflow.lightgbm
import mlflow.spark
from mmlspark import LightGBMClassifier

# COMMAND ----------

LGB_model = mlflow.spark.load_model(modelURI)

# COMMAND ----------

df = spark.sql("select * from global_temp.globalTempTestData")
resultDF = LGB_model.transform(df)

# COMMAND ----------

display(resultDF)
# MAGIC from training_rwd.patient_encounters
# MAGIC group by encounterclass
# MAGIC order by count desc

# COMMAND ----------

encounters = spark.sql('select * from patient_encounters')

# COMMAND ----------

encounters.describe('cost').show()

# COMMAND ----------

# Get the list of patients with the target condition (cases)
condition_patients = spark.sql(
    "SELECT DISTINCT PATIENT FROM training_rwd.patient_encounters WHERE lower(REASONDESCRIPTION) LIKE '%"
    + dbutils.widgets.get('condition') + "%'")

# COMMAND ----------

# DBTITLE 1,List of patients with the condition to model
condition_patients = (encounters
                      .where(lower(encounters.REASONDESCRIPTION).contains(dbutils.widgets.get('condition')))
                      .select('PATIENT').dropDuplicates()
                      )

# COMMAND ----------

condition_patients.count()

# COMMAND ----------
# MAGIC * `mlflow.log_metric` tells MLflow to track a particular variable as a *metric* of the run
# MAGIC * `mlflow.<flavor>.log_model` (optional) tells MLflow to log a model, including its dependencies
# MAGIC * `mlflow.log_artifact` (optional) tells MLflow to log a file from local disk (e.g. an image, config file, dataset, etc.)
# MAGIC
# MAGIC A minimal sketch showing how these calls fit together appears after the training cell below.

# COMMAND ----------

import mlflow
import mlflow.mleap
import mlflow.spark
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Pull our data into a Spark DataFrame
df = spark.sql("select * from sensor_readings")

# Extract the columns that we want in our feature vector
featureColumns = df.drop("timestamp", "Sensor-Predict").columns


def trainLRModel(data, maxIter, regParam, elasticNetParam):

    def evalMetrics(summary):
        rmse = summary.rootMeanSquaredError
        r2 = summary.r2
        return (rmse, r2)

    with mlflow.start_run() as run:
        # Split our dataset into training and testing
        (train, test) = data.randomSplit([0.7, 0.3])
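# COMMAND ----------

# MAGIC %md
# MAGIC The sketch below shows one way the tracking calls listed above typically fit together. The pipeline stages, the label column `Sensor-Predict`, the hyperparameter values, and the metric names are assumptions for illustration, not this notebook's actual training function.

# COMMAND ----------

# Hedged sketch: assemble features, fit a linear regression, and log the
# parameters, metrics, and fitted model to an MLflow run.
train, test = df.randomSplit([0.7, 0.3], seed=42)

assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")
lr = LinearRegression(labelCol="Sensor-Predict", maxIter=10, regParam=0.1, elasticNetParam=0.5)
pipeline = Pipeline(stages=[assembler, lr])

with mlflow.start_run():
    mlflow.log_param("maxIter", 10)
    mlflow.log_param("regParam", 0.1)
    mlflow.log_param("elasticNetParam", 0.5)

    model = pipeline.fit(train)

    # The LinearRegressionModel is the last pipeline stage; its training
    # summary exposes RMSE and R2.
    summary = model.stages[-1].summary
    mlflow.log_metric("rmse", summary.rootMeanSquaredError)
    mlflow.log_metric("r2", summary.r2)

    # Log the fitted pipeline with the Spark flavor.
    mlflow.spark.log_model(model, "model")

# COMMAND ----------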
import pandas as pd
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer, IndexToString, VectorIndexer, VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName('Project').getOrCreate()

dataset = spark.read.csv("reviews.tbl", inferSchema=True, header=True, sep='|')
dataset.createTempView("product_reviews")

# Map star ratings to sentiment labels and keep a subset of the reviews
q = "SELECT CASE pr_rating WHEN 1 THEN '0' WHEN 2 THEN '0' WHEN 3 THEN '1' WHEN 4 THEN '3' WHEN 5 THEN '3' END AS pr_r_rating, pr_content FROM product_reviews WHERE pmod(pr_review_id, 5) IN (1,2,3)"
df = spark.sql(q).toDF("label", "sentence")

# Tokenize the review text and compute TF-IDF features
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(df)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="userFeatures")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

df = rescaledData.select(rescaledData["label"].cast("double"), rescaledData["userFeatures"])

assembler = VectorAssembler(inputCols=["userFeatures"], outputCol="features")
dbutils.fs.mount("s3a://%s" % aws_bucket_name, "/mnt/%s" % mount_name) # COMMAND ---------- # DBTITLE 1,Create User-Specific database current_user = dbutils.notebook.entry_point.getDbutils().notebook().getContext( ).tags().apply('user') print("Created variables:") print("current_user: {}".format(current_user)) dbName = re.sub(r'\W+', '_', current_user) path = "/Users/{}/demo".format(current_user) dbutils.widgets.text("path", path, "path") dbutils.widgets.text("dbName", dbName, "dbName") print("path (default path): {}".format(path)) spark.sql( """create database if not exists {} LOCATION '{}/global_demo/tables' """. format(dbName, path)) spark.sql("""USE {}""".format(dbName)) print("dbName (using database): {}".format(dbName)) # COMMAND ---------- # DBTITLE 1,Reset tables in user's database tables = [ "turbine_bronze", "turbine_silver", "turbine_gold", "turbine_power", "turbine_schema_evolution" ] reset_all = dbutils.widgets.get("reset_all_data") == "true" or any([ not spark.catalog._jcatalog.tableExists(table) for table in ["turbine_power"] ])