def task_7(data_io, train_data, test_data):
    # ---------------------- Your implementation begins------------------------
    # Fit a fixed depth-5 decision tree on the full training set and report
    # its RMSE on the held-out test set.
    regressor = DecisionTreeRegressor(labelCol="overall",
                                      featuresCol="features",
                                      maxDepth=5)
    fitted_model = regressor.fit(train_data)
    scored_test = fitted_model.transform(test_data)
    rmse_evaluator = RegressionEvaluator(labelCol="overall",
                                         predictionCol="prediction",
                                         metricName="rmse")
    test_rmse = rmse_evaluator.evaluate(scored_test)
    # -------------------------------------------------------------------------
    # ---------------------- Put results in res dict --------------------------
    res = {'test_rmse': None}
    # Modify res:
    res['test_rmse'] = test_rmse
    # -------------------------------------------------------------------------
    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_7')
    return res
def build_decision_tree_regression(observation_df, feature_columns):
    """Train a decision-tree regressor for duration_sec and print test metrics.

    Returns the fitted model.
    """
    # Assemble the requested feature columns into a single vector column.
    assembled_df = create_feature_column(observation_df, feature_columns,
                                         ['features', 'duration_sec'])
    training_split, testing_split = assembled_df.randomSplit([0.7, 0.3])
    tree = DecisionTreeRegressor(featuresCol="features",
                                 labelCol="duration_sec")
    fitted_model = tree.fit(training_split)
    predicted_df = fitted_model.transform(testing_split)
    predicted_df.select("prediction", "duration_sec", "features").show(5)
    # Report both RMSE and R2 on the held-out split.
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol="duration_sec",
                                         metricName="rmse")
    print("RMSE on test data = %g" % rmse_evaluator.evaluate(predicted_df))
    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol="duration_sec",
                                       metricName="r2")
    print("R2 on test data = %g" % r2_evaluator.evaluate(predicted_df))
    return fitted_model
def test_decision_tree_regressor(self):
    """Round-trip a Spark decision-tree regressor through ONNX and compare
    the ONNX runtime output against Spark's own predictions."""
    raw_features = numpy.array([[0, 1], [1, 1], [2, 0]], dtype=numpy.float32)
    raw_labels = [100, -10, 50]
    rows = [(raw_labels[i], Vectors.dense(raw_features[i]))
            for i in range(len(raw_labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(rows),
        schema=["label", "features"])
    regressor = DecisionTreeRegressor(labelCol="label", featuresCol="features")
    model = regressor.fit(data)
    feature_count = data.select('features').first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml Decision Tree Regressor',
        [('features', FloatTensorType([None, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model: score with Spark first, then with the exported ONNX model.
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeRegressor")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def predict_price_of_unit_area_by_decision_tree(
        real_estate_dataset_df: DataFrame):
    """
    Predict the price per unit area based on house age, distance to MRT
    (public transportation) and number of convenience stores, using decision
    tree regression.
    :param real_estate_dataset_df:
    :return:
    """
    labeled_df = transform_dataset_to_label_feature_form(
        real_estate_dataset_df)
    train_dataset, test_dataset = labeled_df.randomSplit([0.5, 0.5])
    # setLabelCol, setFeatureCol: Change column name for "label" and "features" columns.
    regressor = DecisionTreeRegressor().setLabelCol('actual_price')
    fitted_model = regressor.fit(train_dataset)
    # Create predictions for testing dataset.
    predictions = (fitted_model.transform(test_dataset)
                   .select('actual_price',
                           func.round(func.col('prediction'), 2)
                           .alias('predicted_price'))
                   .orderBy(func.desc('actual_price'))
                   .cache())
    return predictions
def decision_tree_regression(train_data, test_data):
    """Fit a decision tree predicting MEDV and print test RMSE plus the
    model's feature importances."""
    tree = DecisionTreeRegressor(featuresCol='features', labelCol='MEDV')
    tree_model = tree.fit(train_data)
    scored_test = tree_model.transform(test_data)
    rmse_value = RegressionEvaluator(
        labelCol='MEDV',
        predictionCol='prediction',
        metricName='rmse',
    ).evaluate(scored_test)
    print('Root Mean Squared Error (RMSE) on test data = %g' % rmse_value)
    print(tree_model.featureImportances)
def task_8(data_io, train_data, test_data):
    # ---------------------- Your implementation begins------------------------
    # Hold out 25% of the training data as a validation split for the
    # max-depth search.
    trainingData, testData = train_data.randomSplit([0.75, 0.25])
    # The evaluator is depth-independent, so build it once outside the loop.
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")
    depths = [5, 7, 9, 12]
    all_rmse = []
    # FIX: the original sentinel was the literal 100 — if every validation
    # RMSE exceeded 100, no model would ever be selected and `best_model`
    # would be unbound.  float('inf') always admits the first candidate.
    lowest_rmse = float('inf')
    best_model = None
    for depth in depths:
        dt = DecisionTreeRegressor(labelCol="overall",
                                   featuresCol="features",
                                   maxDepth=depth)
        model = dt.fit(trainingData)
        rmse = evaluator.evaluate(model.transform(testData))
        all_rmse.append(rmse)
        if rmse <= lowest_rmse:
            lowest_rmse = rmse
            best_model = model
    # Score the best validation model on the true test set.
    rmse = evaluator.evaluate(best_model.transform(test_data))
    # -------------------------------------------------------------------------
    # ---------------------- Put results in res dict --------------------------
    res = {
        'test_rmse': None,
        'valid_rmse_depth_5': None,
        'valid_rmse_depth_7': None,
        'valid_rmse_depth_9': None,
        'valid_rmse_depth_12': None,
    }
    # Modify res:
    res['test_rmse'] = rmse
    for depth, valid_rmse in zip(depths, all_rmse):
        res['valid_rmse_depth_%d' % depth] = valid_rmse
    # -------------------------------------------------------------------------
    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_8')
    return res
def decisionTreeRegressor(data, ncolumns, schemaNames):
    """Binarize the feature vector, fit a depth-10 decision tree, and report
    its quality.

    Returns a tuple (top-3 feature names by importance, test RMSE,
    area-under-curve of the raw predictions, training time in minutes).
    """
    # Only the imports actually used remain; the original also pulled in
    # Pipeline, ParamGridBuilder, StringIndexer, VectorIndexer and
    # CrossValidator without using them.
    from pyspark.ml.regression import DecisionTreeRegressor
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import numpy as np
    import time

    binarizer = Binarizer(
        threshold=0.00001,
        inputCol="features",
        outputCol="binarized_features",
    )
    binarizedDataFrame = binarizer.transform(data)
    # 90/10 split with a fixed seed (50) for reproducibility.
    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)
    dtr = DecisionTreeRegressor(labelCol="label",
                                featuresCol="binarized_features",
                                maxDepth=10, maxBins=10,
                                impurity='Variance')
    # Time only the fit call, in minutes.  (The original pre-initialized
    # `timer` to an unused empty string.)
    start = time.time()
    cvModel = dtr.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60
    prediction = cvModel.transform(testData)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)
    # Treat the regression outputs as raw scores for an AUC-style measure.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)
    # Expand the sparse importance vector and pick the three strongest features.
    fi = cvModel.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    idx = (-imp_feat).argsort()[:3]
    feat = [schemaNames[i] for i in idx]
    return feat, rmse, areaUC, timer
def TrainDT(trainingData, testData):
    """Train a DecisionTreeRegressor, print RMSE/R2 on both test and train
    data, and return the fitted model."""

    def _score(predictions, metric):
        # Evaluate `predictions` with the given regression metric; the
        # original repeated this evaluator boilerplate four times.
        return RegressionEvaluator(labelCol="label",
                                   predictionCol="prediction",
                                   metricName=metric).evaluate(predictions)

    # Train a DecisionTree model.
    dt = DecisionTreeRegressor()
    # Train model. This also runs the indexer.
    start = time.time()
    model = dt.fit(trainingData)
    end = time.time()
    print('Training DT model took', end - start)

    # Make predictions on the held-out data and compute test error.
    predictions = model.transform(testData)
    rmse = _score(predictions, "rmse")
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    r2 = _score(predictions, "r2")
    print("R2 on test data = %g" % r2)

    # Same metrics on the training data (to eyeball overfitting).
    predictions = model.transform(trainingData)
    rmse = _score(predictions, "rmse")
    print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)
    r2 = _score(predictions, "r2")
    print("R2 on train data = %g" % r2)
    return model
def decisionTreeRegression(df, arguments):
    """Fit a DecisionTreeRegressor on ``df`` with optional CLI overrides.

    ``arguments`` supplies optional overrides for maxDepth,
    minInstancesPerNode and impurity; unset values fall back to defaults.
    Returns the fitted model.
    """
    from pyspark.ml.regression import DecisionTreeRegressor

    maxDepth = 5
    minInstancesPerNode = 1
    impurity = "variance"

    # PEP 8: compare to None with `is not`, not `!=`.
    # maxDepth and minInstancesPerNode are integer hyper-parameters; the
    # original coerced them with float().  int(float(...)) still accepts
    # inputs like "7.0" while handing Spark a proper int.
    if arguments.maxDepth is not None:
        maxDepth = int(float(arguments.maxDepth))
    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = int(float(arguments.minInstancesPerNode))
    if arguments.impurity is not None:
        impurity = arguments.impurity

    dt = DecisionTreeRegressor(maxDepth=maxDepth,
                               minInstancesPerNode=minInstancesPerNode,
                               impurity=impurity)
    model = dt.fit(df)

    return model
def decision_tree_regressor():
    """Walk through the DecisionTreeRegressor API: fit a tiny model, inspect
    it, predict, then save and reload both estimator and model.

    Expected values from the original doctest-style session are kept as
    trailing comments.
    """
    spark = (SparkSession
             .builder
             .appName("Python Spark SQL basic example")
             .config("spark.some.config.option", "some-value")
             .getOrCreate())
    toy_df = spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"])
    estimator = DecisionTreeRegressor(maxDepth=2, varianceCol="variance")
    tree_model = estimator.fit(toy_df)
    # Inspect the fitted tree.
    tree_model.depth  # 1
    tree_model.numNodes  # 3
    tree_model.featureImportances  # SparseVector(1, {0: 1.0})
    tree_model.numFeatures  # 1
    # Predict on two single-row frames.
    neg_df = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    tree_model.transform(neg_df).head().prediction  # 0.0
    pos_df = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                                   ["features"])
    tree_model.transform(pos_df).head().prediction  # 1.0
    # Persist the (unfitted) estimator and reload it.
    temp_path = "./"
    dtr_path = temp_path + "/dtr"
    estimator.save(dtr_path)
    reloaded_estimator = DecisionTreeRegressor.load(dtr_path)
    reloaded_estimator.getMaxDepth()  # 2
    # Persist the fitted model and reload it.
    model_path = temp_path + "/dtr_model"
    tree_model.save(model_path)
    reloaded_model = DecisionTreeRegressionModel.load(model_path)
    tree_model.numNodes == reloaded_model.numNodes  # True
    tree_model.depth == reloaded_model.depth  # True
    tree_model.transform(pos_df).head().variance
elasticNetParam=0.8) lr_model = lr.fit(train_df) print("Coefficients: " + str(lr_model.coefficients)) print("Intercept: " + str(lr_model.intercept)) trainingSummary = lr_model.summary print("RMSE: %f" % trainingSummary.rootMeanSquaredError) print("r2: %f" % trainingSummary.r2) train_df.describe().show() lr_predictions = lr_model.transform(test_df) lr_predictions.select("prediction", "PE", "features").show(5) lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="PE", metricName="r2") print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions)) ## DecisionTreeRegressor portion from pyspark.ml.regression import DecisionTreeRegressor dt = DecisionTreeRegressor(featuresCol='features', labelCol='PE') dt_model = dt.fit(train_df) dt_predictions = dt_model.transform(test_df) dt_evaluator = RegressionEvaluator(labelCol="PE", predictionCol="prediction", metricName="rmse") rmse = dt_evaluator.evaluate(dt_predictions) print( "DecisionTreeRegressor Root Mean Squared Error (RMSE) on test data = %g" % rmse)
# R2 for the linear-regression baseline on the held-out data.
lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="MPG",
                                   metricName="r2")
print("R Squared (R2) for Linear Regression on test data = %g" %
      lr_evaluator.evaluate(lr_predictions))

# RMSE on test data
test_result = lr_model.evaluate(test_df)
print(
    "Root Mean Squared Error (RMSE) for Linear Regression on test data = %g\n"
    % test_result.rootMeanSquaredError)

#############################---DECISION TREE REGRESSION---##################################
dt = DecisionTreeRegressor(featuresCol='features', labelCol='MPG')
decisionTree_model = dt.fit(train_df)
decisionTree_model_predictions = decisionTree_model.transform(test_df)
decisionTree_model_evaluator = RegressionEvaluator(
    labelCol="MPG", predictionCol="prediction", metricName="rmse")
rmse = decisionTree_model_evaluator.evaluate(
    decisionTree_model_predictions)
print(
    "Root Mean Squared Error (RMSE) for Decision Tree on test data = %g" %
    rmse)
# FIX: the original read `r2_dt = ecisionTree_model_evaluator = ...` — a
# chained assignment that also bound the misspelled stray name
# `ecisionTree_model_evaluator`.  Only `r2_dt` is used afterwards.
r2_dt = RegressionEvaluator(
    labelCol="MPG", predictionCol="prediction", metricName="r2")
print("R Squared (R2) for Decision Tree on test data = %g" %
      r2_dt.evaluate(decisionTree_model_predictions))

############################---RANDOM FOREST REGRESSION---##################################
def main():
    """Train city-wide demand models (decision trees of depth 3 and 5, plus
    linear regression) and score each one per cluster.

    Returns six parallel lists of per-cluster errors:
    (errorsRMSE_LR, errorsR2_LR, errorsRMSE_DT, errorsR2_DT,
     errorsRMSE_DT5, errorsR2_DT5).
    """
    # Per-cluster error accumulators, filled by the scoring loop at the end.
    errorsRMSE_LR = []
    errorsR2_LR = []
    errorsR2_DT = []
    errorsR2_DT5 = []
    errorsRMSE_DT = []
    errorsRMSE_DT5 = []
    rows_training = []
    # One held-out bucket per cluster so models can be scored cluster-by-cluster.
    rows_testing = [[] for i in range(N_OF_CLUSTERS)]
    # Walk every (week, weekday, time-slot, cluster) cell of the demand cache.
    for week_nb in range(FIRST_WEEK, LAST_WEEK + 1):
        print('week nb : ', week_nb)
        for day_of_week in range(DAY_IN_WEEK):
            for time_of_day_code in range(TIME_SLOTS_WITHIN_DAY):
                for cid in range(N_OF_CLUSTERS):
                    curFeature = demandCache.get_demand(
                        week_nb, day_of_week, time_of_day_code, cid)
                    if curFeature != []:
                        # NOTE(review): this unpacking rebinds the loop
                        # variables `time_of_day_code` and `day_of_week`, so
                        # the remaining `cid` iterations of this slot call
                        # get_demand with the extracted values rather than
                        # the original loop indices — confirm intended.
                        time_of_day_code, origin, day_of_week, day, week, hour, minute, is_manhattan, is_airport, amount = extract_feature(
                            curFeature)
                        # Weeks before WEEK_NB_TEST form the training set.
                        if (week_nb < WEEK_NB_TEST):
                            rows_training.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, is_manhattan, is_airport,
                                 amount))
                        else:
                            rows_testing[cid].append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, is_manhattan, is_airport,
                                 amount))
    df_training = spark.createDataFrame(rows_training, [
        "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
        "minute", "is_manhattan", "is_airport", "amount"
    ])
    # Every column except the target `amount` goes into the feature vector.
    assembler = VectorAssembler(inputCols=[
        "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
        "minute", "is_manhattan", "is_airport"
    ],
                                outputCol='features')
    output_training = assembler.transform(df_training)
    final_data_training = output_training.select('features', 'amount')
    # Two decision trees (depth 3 and depth 5) trained on all training rows.
    decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
    dt_model = decisionTree.fit(final_data_training)
    #print(dt_model.toDebugString)
    decisionTree5 = DecisionTreeRegressor(labelCol='amount', maxDepth=5)
    dt_model5 = decisionTree5.fit(final_data_training)
    #print(dt_model5.toDebugString)
    # Dump both tree structures to a text file for offline inspection.
    file = open("DT_final_features_one_model_INFO.txt", "w")
    file.write("DT maxDepth 3 : \n" + dt_model.toDebugString)
    file.write("DT maxDepth 5 : \n" + dt_model5.toDebugString)
    file.close()
    linearRegression = LinearRegression(labelCol='amount')
    lr_model = linearRegression.fit(final_data_training)
    # Score all three models on each cluster's held-out rows.
    for cid in range(N_OF_CLUSTERS):
        print('cluster: ', cid)
        df_testing = spark.createDataFrame(rows_testing[cid], [
            "time_of_day_code", "origin", "day_of_week", "day", "week",
            "hour", "minute", "is_manhattan", "is_airport", "amount"
        ])
        #df_testing.show()
        output_testing = assembler.transform(df_testing)
        final_data_testing = output_testing.select('features', 'amount')
        predictionsDT = dt_model.transform(final_data_testing)
        predictionsDT5 = dt_model5.transform(final_data_testing)
        # The LR model's evaluate() result carries its own metrics
        # (rootMeanSquaredError, r2), read directly below.
        predictionsLR = lr_model.evaluate(final_data_testing)
        """ Evaluation rmse : """
        rmse = predictionsLR.rootMeanSquaredError
        errorsRMSE_LR.append(rmse)
        #print("Root Mean Squared Error (RMSE) for LR on test data = %g" % rmse)
        r2 = predictionsLR.r2
        errorsR2_LR.append(r2)
        #print("R Squared Error (R2) for LR on test data = %g" % r2)
        """ Evaluation rmse : """
        # The decision-tree predictions are scored with explicit evaluators.
        evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                            predictionCol="prediction",
                                            metricName="rmse")
        rmse = evaluatorRMSE.evaluate(predictionsDT)
        rmse5 = evaluatorRMSE.evaluate(predictionsDT5)
        errorsRMSE_DT.append(rmse)
        errorsRMSE_DT5.append(rmse5)
        #print("Root Mean Squared Error (RMSE) for DT on test data = %g" % rmse)
        evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                          predictionCol="prediction",
                                          metricName="r2")
        r2 = evaluatorR2.evaluate(predictionsDT)
        r25 = evaluatorR2.evaluate(predictionsDT5)
        errorsR2_DT.append(r2)
        errorsR2_DT5.append(r25)
        #print("R Squared Error (R2) for DT on test data = %g" % r2)
    return errorsRMSE_LR, errorsR2_LR, errorsRMSE_DT, errorsR2_DT, errorsRMSE_DT5, errorsR2_DT5
#testLFDF.take(10) # COMMAND ---------- #Creating an evaluator measuring our label vs our prediction using RMSE evaluation. evaluator = RegressionEvaluator(metricName="rmse")\ .setLabelCol("price_doc")\ .setPredictionCol("prediction") # COMMAND ---------- #Decision tree regression, testing on both train and test dataset. dt = DecisionTreeRegressor(labelCol='price_doc') #This builds the dt model using the train dataset model = dt.fit(trainLFDF) #This predicts dt model outcomes on train and test dataset trainPredictions = model.transform(trainLFDF) testPredictions = model.transform(testLFDF) trainscore = evaluator.evaluate(trainPredictions) testscore = evaluator.evaluate(testPredictions) print(trainscore, testscore) #DT 8 Vars RMSE 3493522, 3901961 # COMMAND ---------- #Gradient boosted tree regression gbt = GBTRegressor(labelCol='price_doc') model = gbt.fit(trainLFDF)
# COMMAND ---------- # MAGIC %md # MAGIC #### Regression with decision trees # COMMAND ---------- from pyspark.ml.regression import DecisionTreeRegressor dtr = DecisionTreeRegressor().setLabelCol('petalWidth') print dtr.explainParams() # COMMAND ---------- dtrModel = dtr.fit(irisPetal) dtrPredictions = dtrModel.transform(irisPetal) print regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}) print regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}) # COMMAND ---------- # MAGIC %md # MAGIC Let's also build a gradient boosted tree. # COMMAND ---------- from pyspark.ml.regression import GBTRegressor gbt = GBTRegressor().setLabelCol('petalWidth') print gbt.explainParams()
def spark_process(sqlContext, sc, validate, path_to_file):
    """Load NYC taxi trips from CSV, engineer trip-duration features, and
    train a decision-tree regressor on them.

    When `validate` is true, returns (output, r2_test, r2_train) from a
    60/40 split; otherwise trains on all data and returns a cached DataFrame
    of predictions shaped for downstream publishing.
    """
    ######################
    #
    # HDFS to DataFrame
    #
    ######################
    ## all fields:
    # ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance',
    #  'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude',
    #  'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
    #  'tolls_amount', 'total_amount']
    # columns to select (positional indices into the schema above)
    feature_columns = [1,2,3,5,6,9,10]
    # read file and convert to DataFrame
    # dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(path_to_file).cache()
    customSchema = StructType([
        StructField("vendor_id", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", StringType(), True),
        StructField("trip_distance", StringType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("rate_code", StringType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
        StructField("payment_type", StringType(), True),
        StructField("fare_amount", StringType(), True),
        StructField("surcharge", StringType(), True),
        StructField("mta_tax", StringType(), True),
        StructField("tip_amount", StringType(), True),
        StructField("tolls_amount", StringType(), True),
        StructField("total_amount", StringType(), True)
    ])
    # NOTE(review): `schema` is passed as an .options() key here rather than
    # via .schema(customSchema) — confirm the schema actually takes effect.
    dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', schema = customSchema).load(path_to_file)
    # create dataframe with selected columns
    dataframe = dataframe.select(*(dataframe.columns[n] for n in feature_columns))
    # this number does not include the header
    # number_of_trips = dataframe.count()
    sqlContext.clearCache()
    ######################
    #
    # Preprocess data
    #
    ######################
    # filter rows with null fields
    # if passenger count is missing assign it a value of 1
    # filter invalid location: keep only areas near NYC
    dataframe = dataframe.na.drop(how='any',subset=['pickup_datetime','dropoff_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']) \
        .fillna(1,subset=["passenger_count"]) \
        .filter(dataframe.pickup_latitude>40.0) \
        .filter(dataframe.pickup_latitude<41.0) \
        .filter(dataframe.pickup_longitude<-73.0) \
        .filter(dataframe.pickup_longitude>-74.0) \
        .filter(dataframe.dropoff_latitude>40.0) \
        .filter(dataframe.dropoff_latitude<41.0) \
        .filter(dataframe.dropoff_longitude<-73.0)\
        .filter(dataframe.dropoff_longitude>-74.0)
    ######################
    #
    # features engineering
    #
    ######################
    # create new column based on time-delta (minutes)
    # convert pickup-datetime column to hour
    time_delta_udf = udf(time_delta_minutes,FloatType())
    dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \
        .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))
    # Cast everything numeric; time_delta is the regression target.
    dataframe = dataframe.select(dataframe.pick_up_hour, \
        dataframe.passenger_count.cast("integer"), \
        dataframe.pickup_longitude.cast("double"), \
        dataframe.pickup_latitude.cast("double"), \
        dataframe.dropoff_longitude.cast("double"),\
        dataframe.dropoff_latitude.cast("double"), \
        dataframe.time_delta.cast("double"))
    # Keep only trips longer than one minute.
    dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()
    # split dataframe into feature and label vector
    # create feature vectors and labels for model training
    feature_assembler = VectorAssembler(inputCols = ['pick_up_hour','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],outputCol = 'features')
    transformed = feature_assembler.transform(dataframe)
    # label = trip duration in minutes (time_delta)
    vector_dataframe = transformed.select(col("time_delta").alias("label"),col("features")).cache()
    ######################
    #
    # train model
    #
    ######################
    if validate:
        ################################
        #
        # validate model on 60/40 split
        #
        ################################
        # split
        training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)
        decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
        model = decision_tree_reg.fit(training)
        train_pred = model.transform(training)
        test_pred = model.transform(test)
        # R^2 on both splits to gauge fit quality vs. overfitting.
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
        r2_train = evaluator.evaluate(train_pred)
        evaluator_test = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
        r2_test = evaluator_test.evaluate(test_pred)
        output = test_pred.select("prediction", "label", "features")
        return output, r2_test, r2_train
    else:
        ###################
        #
        # train on all data
        #
        ###################
        decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
        model = decision_tree_reg.fit(vector_dataframe)
        predictions = model.transform(vector_dataframe)
        output = predictions.select("prediction", "label", "features")
        ###########################
        #
        # process to send to Kafka
        #
        ###########################
        schema = StructType([StructField("prediction_mins", FloatType(), True),
                             StructField("pick_up_hour", IntegerType(), True),
                             StructField("pickup_longitude", DoubleType(), True),
                             StructField("pickup_latitude", DoubleType(), True),
                             StructField("dropoff_longitude", DoubleType(), True),
                             StructField("dropoff_latitude", DoubleType(), True)])
        # Flatten each (prediction, feature-vector) row into a plain tuple.
        # NOTE(review): DataFrame.map is the pre-2.0 Spark API — on Spark 2.x
        # this would need output.rdd.map; confirm the Spark version in use.
        features_from_predictions = output.map(lambda row: (float(row.prediction),int(row.features[0]),float(row.features[1]),float(row.features[2]),float(row.features[3]),float(row.features[4]) ) ).collect()
        sqlContext.clearCache()
        dataframe_from_prediction_vector = sqlContext.createDataFrame(features_from_predictions,schema).cache()
        return dataframe_from_prediction_vector
# Relabel fare_amount as the regression target and keep the location /
# passenger columns as candidate features.
df = df.selectExpr("fare_amount as label", 'pickup_longitude',
                   'pickup_latitude', 'dropoff_longitude',
                   'dropoff_latitude', 'passenger_count')
# vecAssembler is presumably built earlier in the script; "skip" drops rows
# whose feature values cannot be assembled.
new_df = vecAssembler.setHandleInvalid("skip").transform(df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = new_df.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeRegressor()

# Time just the fit call.
start_time = datetime.now()

# Train model. This also runs the indexer.
model = dt.fit(trainingData)

time_elapsed = datetime.now() - start_time
print('TIME OF DECISION TREE REGRESSION TRAINING (hh:mm:ss.ms) {}'.format(
    time_elapsed))

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
def spark_process(sqlContext, sc, validate, path_to_file):
    """Load NYC taxi trips from CSV, engineer trip-duration features, and
    train a decision-tree regressor.

    When `validate` is true, returns (output, r2_test, r2_train) from a
    60/40 split; otherwise trains on all rows and returns a cached DataFrame
    of predictions prepared for downstream publishing.
    """
    ######################
    #
    # HDFS to DataFrame
    #
    ######################
    ## all fields:
    # ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance',
    #  'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude',
    #  'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
    #  'tolls_amount', 'total_amount']
    # columns to select (positional indices into the schema above)
    feature_columns = [1, 2, 3, 5, 6, 9, 10]
    # read file and convert to DataFrame
    # dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(path_to_file).cache()
    customSchema = StructType([
        StructField("vendor_id", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", StringType(), True),
        StructField("trip_distance", StringType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("rate_code", StringType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
        StructField("payment_type", StringType(), True),
        StructField("fare_amount", StringType(), True),
        StructField("surcharge", StringType(), True),
        StructField("mta_tax", StringType(), True),
        StructField("tip_amount", StringType(), True),
        StructField("tolls_amount", StringType(), True),
        StructField("total_amount", StringType(), True)
    ])
    # NOTE(review): `schema` is passed as an .options() key rather than via
    # .schema(customSchema) — confirm the schema actually takes effect.
    dataframe = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', schema=customSchema).load(path_to_file)
    # create dataframe with selected columns
    dataframe = dataframe.select(*(dataframe.columns[n]
                                   for n in feature_columns))
    # this number does not include the header
    # number_of_trips = dataframe.count()
    sqlContext.clearCache()
    ######################
    #
    # Preprocess data
    #
    ######################
    # filter rows with null fields
    # if passenger count is missing assign it a value of 1
    # filter invalid location: keep only areas near NYC
    dataframe = dataframe.na.drop(how='any',subset=['pickup_datetime','dropoff_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']) \
        .fillna(1,subset=["passenger_count"]) \
        .filter(dataframe.pickup_latitude>40.0) \
        .filter(dataframe.pickup_latitude<41.0) \
        .filter(dataframe.pickup_longitude<-73.0) \
        .filter(dataframe.pickup_longitude>-74.0) \
        .filter(dataframe.dropoff_latitude>40.0) \
        .filter(dataframe.dropoff_latitude<41.0) \
        .filter(dataframe.dropoff_longitude<-73.0)\
        .filter(dataframe.dropoff_longitude>-74.0)
    ######################
    #
    # features engineering
    #
    ######################
    # create new column based on time-delta (minutes)
    # convert pickup-datetime column to hour
    time_delta_udf = udf(time_delta_minutes, FloatType())
    dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \
        .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))
    # Cast everything numeric; time_delta is the regression target.
    dataframe = dataframe.select(dataframe.pick_up_hour, \
        dataframe.passenger_count.cast("integer"), \
        dataframe.pickup_longitude.cast("double"), \
        dataframe.pickup_latitude.cast("double"), \
        dataframe.dropoff_longitude.cast("double"),\
        dataframe.dropoff_latitude.cast("double"), \
        dataframe.time_delta.cast("double"))
    # Keep only trips longer than one minute.
    dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()
    # split dataframe into feature and label vector
    # create feature vectors and labels for model training
    feature_assembler = VectorAssembler(inputCols=[
        'pick_up_hour', 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude'
    ],
                                        outputCol='features')
    transformed = feature_assembler.transform(dataframe)
    # label = trip duration in minutes (time_delta)
    vector_dataframe = transformed.select(
        col("time_delta").alias("label"), col("features")).cache()
    ######################
    #
    # train model
    #
    ######################
    if validate:
        ################################
        #
        # validate model on 60/40 split
        #
        ################################
        # split
        training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)
        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(training)
        train_pred = model.transform(training)
        test_pred = model.transform(test)
        # R^2 on both splits to gauge fit quality vs. overfitting.
        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="r2")
        r2_train = evaluator.evaluate(train_pred)
        evaluator_test = RegressionEvaluator(labelCol="label",
                                             predictionCol="prediction",
                                             metricName="r2")
        r2_test = evaluator_test.evaluate(test_pred)
        output = test_pred.select("prediction", "label", "features")
        return output, r2_test, r2_train
    else:
        ###################
        #
        # train on all data
        #
        ###################
        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(vector_dataframe)
        predictions = model.transform(vector_dataframe)
        output = predictions.select("prediction", "label", "features")
        ###########################
        #
        # process to send to Kafka
        #
        ###########################
        schema = StructType([
            StructField("prediction_mins", FloatType(), True),
            StructField("pick_up_hour", IntegerType(), True),
            StructField("pickup_longitude", DoubleType(), True),
            StructField("pickup_latitude", DoubleType(), True),
            StructField("dropoff_longitude", DoubleType(), True),
            StructField("dropoff_latitude", DoubleType(), True)
        ])
        # Flatten each (prediction, feature-vector) row into a plain tuple.
        # NOTE(review): DataFrame.map is the pre-2.0 Spark API — on Spark 2.x
        # this would need output.rdd.map; confirm the Spark version in use.
        features_from_predictions = output.map(lambda row: (
            float(row.prediction), int(row.features[0]), float(row.features[
                1]), float(row.features[2]), float(row.features[3]),
            float(row.features[4]))).collect()
        sqlContext.clearCache()
        dataframe_from_prediction_vector = sqlContext.createDataFrame(
            features_from_predictions, schema).cache()
        return dataframe_from_prediction_vector
encoder = OneHotEncoder(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec") # Add stages. These are not run here, but will run all at once later on. stages += [stringIndexer, encoder] #encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type','Peak_Time','weekend'] encColumns = [ 'VendorID', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type' ] for eCol in encColumns: encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol + "classVec") stages += [encoder] #label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label") #stages += [label_stringIdx] numericCols = ["trip_distance", "passenger_count", "fare_amount", "tip_amount"] assemblerInputs = map(lambda c: c + "classVec", categoricalColumns) + map( lambda c: c + "classVec", encColumns) + numericCols assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") stages += [assembler] pipeline = Pipeline(stages=stages) pipelineModel = pipeline.fit(train_X4) dataset = pipelineModel.transform(train_X4) from pyspark.ml.regression import DecisionTreeRegressor dt = DecisionTreeRegressor(labelCol="total_amount", featuresCol="features", maxBins=32) model = dt.fit(dataset) model.write().overwrite().save("./nyc-01020304-6vm-18-DT-model")
def doGrid_one():
    """Train and evaluate a DecisionTree and a LinearRegression model for every
    cell of a pickup grid, collecting per-cell RMSE and R2 for both models.

    Returns (hor, vert, errorsR2_DT, errorsRMSE_DT, errorsRMSE_LR, errorsR2_LR).
    Relies on module-level `sqlCtx`, `spark`, `getGridData` and
    `get_features_for_grid` — assumed defined elsewhere in this project.
    """
    grid_data = getGridData(sqlCtx, '_ngrid2500')
    # Per-grid-cell error accumulators.
    errorsRMSE_LR = []
    errorsR2_LR = []
    errorsRMSE_DT = []
    errorsR2_DT = []
    hor = grid_data['horizontal_slots']
    vert = grid_data['vertical_slots']
    print(hor, vert)
    # NOTE(review): the grid dimensions read above are immediately discarded
    # and hard-coded to 24x24 — confirm this override is intentional.
    hor = 24
    vert = 24
    for x in range(hor):
        print('grid hor:', x)
        for y in range(vert):
            # Pre-split train/test feature rows for grid cell (x, y).
            train, test = get_features_for_grid(spark, x, y)
            assembler = VectorAssembler(inputCols=[
                "day", "day_of_week", "hour", "is_airport", "is_manhattan",
                "minute", 'pickup_lat_slot', 'pickup_long_slot',
                "time_of_day_code", "week"
            ], outputCol='features')
            output_training = assembler.transform(train)
            output_testing = assembler.transform(test)
            final_data_training = output_training.select('features', 'amount')
            final_data_testing = output_testing.select('features', 'amount')
            final_data_training.describe().show()
            final_data_testing.describe().show()
            # Decision-tree model for this cell.
            decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
            dt_model = decisionTree.fit(final_data_training)
            predictionsDT = dt_model.transform(final_data_testing)
            print(dt_model.toDebugString)
            # Linear-regression model for this cell; `evaluate` yields a
            # summary object exposing rootMeanSquaredError / r2 directly.
            linearRegression = LinearRegression(labelCol='amount')
            lr_model = linearRegression.fit(final_data_training)
            predictionsLR = lr_model.evaluate(final_data_testing)
            """ Evaluation LR : """
            rmse = predictionsLR.rootMeanSquaredError
            errorsRMSE_LR.append(rmse)
            #print("Root Mean Squared Error (RMSE) for LR on test data = ", rmse)
            r2 = predictionsLR.r2
            errorsR2_LR.append(r2)
            #print("R Squared Error (R2) for LR on test data = ", r2)
            """ Evaluation DT : """
            evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                                predictionCol="prediction",
                                                metricName="rmse")
            rmse = evaluatorRMSE.evaluate(predictionsDT)
            errorsRMSE_DT.append(rmse)
            #print("Root Mean Squared Error (RMSE) for DT on test data = ", rmse)
            evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                              predictionCol="prediction",
                                              metricName="r2")
            r2 = evaluatorR2.evaluate(predictionsDT)
            errorsR2_DT.append(r2)
            #print("R Squared Error (R2) for DT on test data = ", r2)
    return hor, vert, errorsR2_DT, errorsRMSE_DT, errorsRMSE_LR, errorsR2_LR
c) print("\n") print("For the whole dataset, the DecisionTreeRegressor is starting...") evaluator_reg = RegressionEvaluator\ (labelCol="label", predictionCol="prediction", metricName="rmse") print("\n") print( "Fetching the best values of parameters from 25% dataset and using them..." ) rtime = time.time() dtr = DecisionTreeRegressor(labelCol="label", featuresCol="features", maxDepth=maxDepth_dtr, maxBins=maxBins_dtr) model_dtr = dtr.fit(trainingData) predictions_dtr = model_dtr.transform(testData) binarizer = Binarizer(threshold=0.5, inputCol="prediction", outputCol="binarized_prediction") binarizedDataFrame = binarizer.transform(predictions_dtr) binarized = binarizedDataFrame.drop('prediction') bdf_dtr = binarized.withColumnRenamed('binarized_prediction', 'prediction') r = time.time() - rtime evaluator_reg = MulticlassClassificationEvaluator\ (labelCol="label", predictionCol="prediction", metricName="accuracy") accuracy_reg = evaluator_reg.evaluate(bdf_dtr) print("\n") print("Accuracy for DecisionTreeRegressor on the whole dataset = %g " % accuracy_reg) evaluate_area_dtr = BinaryClassificationEvaluator(
def Forecast(df, forecast_days, nLags, \
             timeSeriesColumn, regressor, sparksession):
    """Build lead labels + lag features from a time series and train a regressor.

    Parameters:
        df: Spark DataFrame containing `timeSeriesColumn`.
        forecast_days: forecast horizon; the label is the series value
            `forecast_days` rows ahead (lead window).
        nLags: number of auto-regressive lag features to generate.
        timeSeriesColumn: name of the column being forecast.
        regressor: one of "DecisionTreeRegression", "LinearRegression",
            "RandomForestRegression", "GBTRegression".
        sparksession: Spark session, forwarded to TimeSeriesSplit.

    Returns:
        (df_test, df_train, predictions_test, predictions_train,
         score_test, score_train) for the chosen regressor, or None when
        `regressor` matches none of the supported names.

    NOTE(review): the decision-tree and linear-regression branches are
    evaluated with R^2 (metricName="r2") while the random-forest and GBT
    branches use RMSE. The original code named the R^2 results "RMSE_*",
    which was misleading; the variables are renamed here. Confirm whether
    the metric asymmetry itself is intended.
    """
    # Label = series value `forecast_days` rows ahead of the current row.
    LeadWindow = window.Window.rowsBetween(0, forecast_days)
    df = df.withColumn("label",
                       func.last(df[timeSeriesColumn]).over(LeadWindow))

    # Feature set: the raw series plus nLags auto-regression lags.
    features = [timeSeriesColumn]
    LagTransformer = LagGather()\
        .setLagLength(nLags)\
        .setInputCol(timeSeriesColumn)
    df = LagTransformer.transform(df)
    featuresGenerated = LagTransformer.getFeatureNames()
    features.extend(featuresGenerated)
    # Other feature generators could be plugged in here:
    # moving-average smoothing, trend extraction, ...

    #******************************************************************************
    # VECTOR ASSEMBLER — dropna removes rows left incomplete by the lag shift.
    df = df.dropna()
    vA = VectorAssembler().setInputCols(features)\
        .setOutputCol("features")
    df_m = vA.transform(df)

    #******************************************************************************
    # Chronological train/test split (70/30) — no shuffling for time series.
    splitratio = 0.7
    df_train, df_test = TimeSeriesSplit(df_m, splitratio, sparksession)

    #******************************************************************************
    # DECISION-TREE REGRESSOR
    if regressor == "DecisionTreeRegression":
        dr = DecisionTreeRegressor(featuresCol="features",
                                   labelCol="label", maxDepth=5)
        model = dr.fit(df_train)
        predictions_dr_test = model.transform(df_test)
        predictions_dr_train = model.transform(df_train)
        # R^2 is the evaluation metric for this branch (see docstring note).
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label",
                                        metricName="r2")
        r2_dr_test = evaluator.evaluate(predictions_dr_test)
        r2_dr_train = evaluator.evaluate(predictions_dr_train)
        return (df_test, df_train,
                predictions_dr_test, predictions_dr_train,
                r2_dr_test, r2_dr_train)

    #******************************************************************************
    # LINEAR REGRESSOR
    if regressor == 'LinearRegression':
        lr = LinearRegression(featuresCol="features", labelCol="label",
                              maxIter=100, regParam=0.4,
                              elasticNetParam=0.1)
        model = lr.fit(df_train)
        predictions_lr_test = model.transform(df_test)
        predictions_lr_train = model.transform(df_train)
        # R^2 is the evaluation metric for this branch (see docstring note).
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label",
                                        metricName="r2")
        r2_lr_test = evaluator.evaluate(predictions_lr_test)
        r2_lr_train = evaluator.evaluate(predictions_lr_train)
        return (df_test, df_train,
                predictions_lr_test, predictions_lr_train,
                r2_lr_test, r2_lr_train)

    #*****************************************************************************
    # RANDOM FOREST REGRESSOR
    if regressor == 'RandomForestRegression':
        rfr = RandomForestRegressor(featuresCol="features",
                                    labelCol="label",
                                    maxDepth=5,
                                    subsamplingRate=0.8)
        model = rfr.fit(df_train)
        predictions_rfr_test = model.transform(df_test)
        predictions_rfr_train = model.transform(df_train)
        # RMSE is the evaluation metric for this branch.
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label",
                                        metricName="rmse")
        RMSE_rfr_test = evaluator.evaluate(predictions_rfr_test)
        RMSE_rfr_train = evaluator.evaluate(predictions_rfr_train)
        return (df_test, df_train,
                predictions_rfr_test, predictions_rfr_train,
                RMSE_rfr_test, RMSE_rfr_train)

    #*****************************************************************************
    # GRADIENT BOOSTING TREE REGRESSOR
    if regressor == 'GBTRegression':
        gbt = GBTRegressor(featuresCol="features",
                           labelCol="label",
                           maxDepth=5,
                           subsamplingRate=0.8)
        model = gbt.fit(df_train)
        predictions_gbt_test = model.transform(df_test)
        predictions_gbt_train = model.transform(df_train)
        # RMSE is the evaluation metric for this branch.
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label",
                                        metricName="rmse")
        RMSE_gbt_test = evaluator.evaluate(predictions_gbt_test)
        RMSE_gbt_train = evaluator.evaluate(predictions_gbt_train)
        return (df_test, df_train,
                predictions_gbt_test, predictions_gbt_train,
                RMSE_gbt_test, RMSE_gbt_train)
trainDF.cache() testDF.cache() # - ##### Entrenar un árbol de regresión para predecir la variable minutos. # In[57]: dt = DecisionTreeRegressor(labelCol='minutos') #toma como inputCo "features" de manera predeterminada # In[58]: model=dt.fit(trainDF) # - ##### Evaluar el modelo resultante usando RMSE tanto en la muestra de entrenamiento como en la muestra de test: Comentar el resultado. # In[59]: predictionDF = model.transform(testDF) # In[23]: evaluator = RegressionEvaluator(labelCol="minutos")
def main():
    """Per-cluster demand modelling: gather weekly feature rows from the demand
    cache, split into train/test by week, then fit + score a DecisionTree and a
    LinearRegression per cluster.

    Appends results to module-level accumulators errorsRMSE_LR / errorsR2_LR /
    errorsRMSE_DT / errorsR2_DT — assumed defined elsewhere in this file.
    """
    for cid in range(N_OF_CLUSTERS):
        rows_training = []
        rows_testing = []
        for week_nb in range(FIRST_WEEK, LAST_WEEK + 1):
            print('week nb : ', week_nb)
            for day_of_week in range(DAY_IN_WEEK):
                for time_of_day_code in range(TIME_SLOTS_WITHIN_DAY):
                    #for tid in range(TOTAL_SLOTS_FOR_LOOP): #TODO do the loop per week, per day and day slot and change fi_features_cache too
                    curFeature = demandCache.get_demand(
                        week_nb, day_of_week, time_of_day_code, cid)
                    if curFeature != []:
                        # NOTE(review): this unpack clobbers the loop variables
                        # `time_of_day_code` and `day_of_week` — confirm
                        # extract_feature returns the same values it was keyed by.
                        time_of_day_code, origin, day_of_week, day, week, hour, minute, is_manhattan, is_airport, amount = extract_feature(
                            curFeature)
                        # Weeks before WEEK_NB_TEST train; later weeks test.
                        if (week_nb < WEEK_NB_TEST):
                            rows_training.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, amount))
                        else:
                            rows_testing.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, amount))
        df_training = spark.createDataFrame(rows_training, [
            "time_of_day_code", "origin", "day_of_week", "day", "week",
            "hour", "minute", "amount"
        ])
        df_testing = spark.createDataFrame(rows_testing, [
            "time_of_day_code", "origin", "day_of_week", "day", "week",
            "hour", "minute", "amount"
        ])
        # All columns except the label feed the feature vector.
        assembler = VectorAssembler(inputCols=[
            "time_of_day_code", "origin", "day_of_week", "day", "week",
            "hour", "minute"
        ], outputCol='features')
        output_training = assembler.transform(df_training)
        output_testing = assembler.transform(df_testing)
        final_data_training = output_training.select('features', 'amount')
        final_data_testing = output_testing.select('features', 'amount')
        decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
        dt_model = decisionTree.fit(final_data_training)
        predictionsDT = dt_model.transform(final_data_testing)
        # LinearRegressionModel.evaluate returns a summary with rmse/r2 fields.
        linearRegression = LinearRegression(labelCol='amount')
        lr_model = linearRegression.fit(final_data_training)
        predictionsLR = lr_model.evaluate(final_data_testing)
        # print("Decision tree model max depth = %g" % decisionTree.getMaxDepth())
        # print(dt_model.toDebugString)
        """ Evaluation rmse : """
        rmse = predictionsLR.rootMeanSquaredError
        errorsRMSE_LR.append(rmse)
        print("Root Mean Squared Error (RMSE) for LR on test data = %g" % rmse)
        r2 = predictionsLR.r2
        errorsR2_LR.append(r2)
        print("R Squared Error (R2) for LR on test data = %g" % r2)
        """ Evaluation rmse : """
        evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                            predictionCol="prediction",
                                            metricName="rmse")
        rmse = evaluatorRMSE.evaluate(predictionsDT)
        errorsRMSE_DT.append(rmse)
        print("Root Mean Squared Error (RMSE) for DT on test data = %g" % rmse)
        evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                          predictionCol="prediction",
                                          metricName="r2")
        r2 = evaluatorR2.evaluate(predictionsDT)
        errorsR2_DT.append(r2)
        print("R Squared Error (R2) for DT on test data = %g" % r2)
def main():
    """Near-duplicate of the other per-cluster demand `main`: builds weekly
    train/test rows from the demand cache, then fits and scores a DecisionTree
    and a LinearRegression per cluster.

    Appends results to module-level accumulators errorsRMSE_LR / errorsR2_LR /
    errorsRMSE_DT / errorsR2_DT — assumed defined elsewhere in this file.
    """
    for cid in range(N_OF_CLUSTERS):
        rows_training = []
        rows_testing = []
        for week_nb in range(FIRST_WEEK, LAST_WEEK + 1):
            print('week nb : ', week_nb)
            for day_of_week in range(DAY_IN_WEEK):
                for time_of_day_code in range(TIME_SLOTS_WITHIN_DAY):
                    #retrieving the features'values:
                    curFeature = demandCache.get_demand(
                        week_nb, day_of_week, time_of_day_code, cid)
                    if curFeature != []:
                        # NOTE(review): this unpack clobbers the loop variables
                        # `time_of_day_code` and `day_of_week`.
                        time_of_day_code, origin, day_of_week, day, week, hour, minute, is_manhattan, is_airport, amount = extract_feature(
                            curFeature)
                        # Checking whether the current row should be added to the training or testing set:
                        if (week_nb < WEEK_NB_TEST):
                            rows_training.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, amount))
                        else:
                            rows_testing.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, amount))
        # Creating the dataframe for the model containing all the rows :
        df_training = spark.createDataFrame(rows_training, [
            "time_of_day_code", "origin", "day_of_week", "day", "week",
            "hour", "minute", "amount"
        ])
        df_testing = spark.createDataFrame(rows_testing, [
            "time_of_day_code", "origin", "day_of_week", "day", "week",
            "hour", "minute", "amount"
        ])
        assembler = VectorAssembler(inputCols=[
            "time_of_day_code", "origin", "day_of_week", "day", "week",
            "hour", "minute"
        ], outputCol='features')
        output_training = assembler.transform(df_training)
        output_testing = assembler.transform(df_testing)
        final_data_training = output_training.select('features', 'amount')
        final_data_testing = output_testing.select('features', 'amount')
        # Training the Desition Tree:
        decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
        dt_model = decisionTree.fit(final_data_training)
        predictionsDT = dt_model.transform(final_data_testing)
        # print(dt_model.toDebugString) # showing the decision tree
        # Training the linear regression:
        linearRegression = LinearRegression(labelCol='amount')
        lr_model = linearRegression.fit(final_data_training)
        predictionsLR = lr_model.evaluate(final_data_testing)
        """ Evaluation rmse : """
        rmse = predictionsLR.rootMeanSquaredError
        errorsRMSE_LR.append(rmse)
        print("Root Mean Squared Error (RMSE) for LR on test data = %g" % rmse)
        r2 = predictionsLR.r2
        errorsR2_LR.append(r2)
        print("R Squared Error (R2) for LR on test data = %g" % r2)
        """ Evaluation rmse : """
        evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                            predictionCol="prediction",
                                            metricName="rmse")
        rmse = evaluatorRMSE.evaluate(predictionsDT)
        errorsRMSE_DT.append(rmse)
        print("Root Mean Squared Error (RMSE) for DT on test data = %g" % rmse)
        evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                          predictionCol="prediction",
                                          metricName="r2")
        r2 = evaluatorR2.evaluate(predictionsDT)
        errorsR2_DT.append(r2)
        print("R Squared Error (R2) for DT on test data = %g" % r2)
glr = GeneralizedLinearRegression()\ .setFamily("gaussian")\ .setLink("identity")\ .setMaxIter(10)\ .setRegParam(0.3)\ .setLinkPredictionCol("linkOut") print glr.explainParams() glrModel = glr.fit(df) # COMMAND ---------- from pyspark.ml.regression import DecisionTreeRegressor dtr = DecisionTreeRegressor() print dtr.explainParams() dtrModel = dtr.fit(df) # COMMAND ---------- from pyspark.ml.regression import RandomForestRegressor from pyspark.ml.regression import GBTRegressor rf = RandomForestRegressor() print rf.explainParams() rfModel = rf.fit(df) gbt = GBTRegressor() print gbt.explainParams() gbtModel = gbt.fit(df) # COMMAND ----------
# Split the data 70-30 train_test_data = model_data.randomSplit([0.8, 0.2], 16430212) train_data = train_test_data[0] test_data = train_test_data[1] print("Train DT") rmseEvaluator = myRmseEvaluator( RegressionEvaluator(predictionCol="prediction", labelCol="trip_duration", metricName="rmse")) maeEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="trip_duration", metricName="mae") dtr = DecisionTreeRegressor( maxDepth=3).setFeaturesCol("features").setLabelCol("trip_duration") trained_model = dtr.fit(train_data) predictions = trained_model.transform(test_data) # final_result = predictions.select("prediction", "trip_duration").rdd print(trained_model) print("RMSE for Regression Tree:", rmseEvaluator.evaluate(predictions)) print("MAE for Regression Tree:", maeEvaluator.evaluate(predictions)) """ DecisionTreeRegressionModel: uid=DecisionTreeRegressor_826b1c042824, depth=3, numNodes=15, numFeatures=7 If (feature 6 <= 2.7002733639393384) If (feature 6 <= 1.3071311166631614) If (feature 6 <= 0.825910208978972) Predict: 481.0347939172201 Else (feature 6 > 0.825910208978972) Predict: 704.5021037177617 Else (feature 6 > 1.3071311166631614)
categoricalColumns = ['store_and_fwd_flag'] stages = [] # stages in our Pipeline for categoricalCol in categoricalColumns: stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index") encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec") # Add stages. These are not run here, but will run all at once later on. stages += [stringIndexer, encoder] #encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type','Peak_Time','weekend'] encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type'] for eCol in encColumns: encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol+"classVec") stages += [encoder] #label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label") #stages += [label_stringIdx] numericCols = ["trip_distance", "passenger_count", "fare_amount","tip_amount"] assemblerInputs = map(lambda c: c + "classVec", categoricalColumns) + map(lambda c: c + "classVec", encColumns) + numericCols assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") stages += [assembler] pipeline = Pipeline(stages=stages) pipelineModel = pipeline.fit(train_X4) dataset = pipelineModel.transform(train_X4) from pyspark.ml.regression import DecisionTreeRegressor dt = DecisionTreeRegressor(labelCol="total_amount", featuresCol="features", maxBins=32) model = dt.fit(dataset) model.write().overwrite().save("./nyc-01020304-6vm-18-DT-model")
from pyspark.ml.regression import DecisionTreeRegressor dt_models = {} dt_predictions = {} compute_again = False if compute_again == False: dt_models = loadModels("TreeModel_","tree") for park in park_data_with_date_dict: dt_predictions[park] = dt_models[park].transform(test_ds[park]) else: for park in park_data_with_date_dict: #vectorAssembler = VectorAssembler(inputCols=features, outputCol="features") #data = vectorAssembler.transform(all_tables[park]) #train, test = data.randomSplit([0.8,0.2], seed = 12345) dt = DecisionTreeRegressor() dt_models[park] = dt.fit(train_ds[park]) dt_predictions[park] = dt_models[park].transform(test_ds[park]) saveModels(dt_models,"TreeModel_","tree") # COMMAND ---------- #ATTENZIONE: se vuoi visualizzare proprio gli alberi puoi chiamare display(dt_models[park]) # COMMAND ---------- def printEvaluateModel(park,modelsCollection, predictionsCollection): print("EVALUATE MODEL FOR PARKING "+str(park)) print("OVER TEST SET") print("Features importance:" + str(modelsCollection[park].featureImportances)) eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse") # Root Mean Square Error
import numpy as np import matplotlib.pyplot as plt # In[106]: ### model building process #create a sample for model test sample, x = final_data.randomSplit([0.1, 0.8]) # In[107]: # decision trees r2_dtr = np.zeros(10) for i in np.arange(10): dtr = DecisionTreeRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3.) dtrModel = dtr.fit(sample) prediction_dtr = dtrModel.transform(sample) r2_dtr[i] = evaluator.evaluate(prediction_dtr) plt.plot(np.arange(3, 33, 3), r2_dtr) # so choose 10 as the maxDepth # In[108]: # Random Forest r2_rfr = np.zeros(10) for i in np.arange(10): rfr = RandomForestRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3.) rfrModel = rfr.fit(sample) prediction_rfr = rfrModel.transform(sample) r2_rfr[i] = evaluator.evaluate(prediction_rfr) plt.plot(np.arange(3, 33, 3), r2_rfr)
def Train(self):
    """Train a decision-tree regression model in either the Spark or the
    sklearn ML environment (selected by self._mlEnv), compute evaluation
    metrics / MAPE / quantile summaries, persist the model, and publish the
    model summary and narrative cards via the result setter.

    Python 2 code (print statements, filter()[0]).
    """
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"initialization","info",display=True,emptyBin=False,customMsg=None,weightKey="total")
    appType = self._dataframe_context.get_app_type()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    # Settings object for this specific algorithm slug.
    algoSetting = filter(lambda x:x.get_algorithm_slug()==self._slug,algosToRun)[0]
    # Categorical columns = string columns minus the ignored uid column and
    # any date columns.
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns)-set(allDateCols))
    print categorical_columns
    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]
    model_path = self._dataframe_context.get_model_path()
    # Strip a "file://" scheme prefix if present.
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print "model_path",model_path
    pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
    model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
    pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"
    df = self._data_frame
    if self._mlEnv == "spark":
        # ---------------- Spark ML path ----------------
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,categorical_columns,result_column,algoType="regression")
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        # (index, name) pairs recovered from the assembled vector's metadata,
        # used later to label feature importances.
        featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))
        # print indexed.select([result_column,"features"]).show(5)
        MLUtils.save_pipeline_or_model(pipelineModel,pipeline_filepath)
        # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
        dtreer = DecisionTreeRegressor(labelCol=result_column, featuresCol='features',predictionCol="prediction")
        if validationDict["name"] == "kFold":
            defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
            numFold = int(validationDict["value"])
            if numFold == 0:
                numFold = 3
            trainingData,validationData = indexed.randomSplit([defaultSplit,1-defaultSplit], seed=12345)
            # NOTE(review): DecisionTreeRegressor has no regParam /
            # fitIntercept / elasticNetParam params (those belong to
            # LinearRegression) — this grid will raise AttributeError at
            # runtime; the grid should use e.g. maxDepth / maxBins.
            paramGrid = ParamGridBuilder()\
                .addGrid(dtreer.regParam, [0.1, 0.01]) \
                .addGrid(dtreer.fitIntercept, [False, True])\
                .addGrid(dtreer.elasticNetParam, [0.0, 0.5, 1.0])\
                .build()
            crossval = CrossValidator(estimator=dtreer,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(predictionCol="prediction",
                                                        labelCol=result_column),
                          numFolds=numFold)
            st = time.time()
            cvModel = crossval.fit(indexed)
            trainingTime = time.time()-st
            print "cvModel training takes",trainingTime
            bestModel = cvModel.bestModel
        elif validationDict["name"] == "trainAndtest":
            trainingData,validationData = indexed.randomSplit([float(validationDict["value"]),1-float(validationDict["value"])], seed=12345)
            st = time.time()
            fit = dtreer.fit(trainingData)
            trainingTime = time.time()-st
            print "time to train",trainingTime
            bestModel = fit
        featureImportance = bestModel.featureImportances
        print featureImportance,type(featureImportance)
        # print featureImportance[0],len(featureImportance[1],len(featureImportance[2]))
        print len(featureMapping)
        featuresArray = [(name, featureImportance[idx]) for idx, name in featureMapping]
        print featuresArray
        MLUtils.save_pipeline_or_model(bestModel,model_filepath)
        # Score the hold-out split and derive difference and MAPE columns.
        transformed = bestModel.transform(validationData)
        transformed = transformed.withColumn(result_column,transformed[result_column].cast(DoubleType()))
        transformed = transformed.select([result_column,"prediction",transformed[result_column]-transformed["prediction"]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1],"difference")
        transformed = transformed.select([result_column,"prediction","difference",FN.abs(transformed["difference"])*100/transformed[result_column]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1],"mape")
        # Sample at most ~100 rows for display.
        sampleData = None
        nrows = transformed.count()
        if nrows > 100:
            sampleData = transformed.sample(False, float(100)/nrows, seed=420)
        else:
            sampleData = transformed
        print sampleData.show()
        evaluator = RegressionEvaluator(predictionCol="prediction",labelCol=result_column)
        metrics = {}
        metrics["r2"] = evaluator.evaluate(transformed,{evaluator.metricName: "r2"})
        metrics["rmse"] = evaluator.evaluate(transformed,{evaluator.metricName: "rmse"})
        metrics["mse"] = evaluator.evaluate(transformed,{evaluator.metricName: "mse"})
        metrics["mae"] = evaluator.evaluate(transformed,{evaluator.metricName: "mae"})
        runtime = round((time.time() - st_global),2)
        # print transformed.count()
        mapeDf = transformed.select("mape")
        # print mapeDf.show()
        mapeStats = MLUtils.get_mape_stats(mapeDf,"mape")
        mapeStatsArr = mapeStats.items()
        mapeStatsArr = sorted(mapeStatsArr,key=lambda x:int(x[0]))
        # print mapeStatsArr
        quantileDf = transformed.select("prediction")
        # print quantileDf.show()
        quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf,"prediction")
        quantileSummaryArr = quantileSummaryDict.items()
        quantileSummaryArr = sorted(quantileSummaryArr,key=lambda x:int(x[0]))
        # print quantileSummaryArr
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("dtree Regression")
        self._model_summary.set_algorithm_display_name("Decision Tree Regression")
        self._model_summary.set_slug(self._slug)
        # NOTE(review): set_training_time is called twice — the second call
        # (trainingTime) overwrites the first (runtime).
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        # NOTE(review): `bestEstimator` is only defined in the sklearn branch;
        # in this Spark branch this line raises NameError (should likely use
        # bestModel.extractParamMap() or similar).
        self._model_summary.set_model_params(bestEstimator.get_params())
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
        self._model_summary.set_feature_importance(featureImportance)
        # print CommonUtils.convert_python_object_to_json(self._model_summary)
    elif self._mlEnv == "sklearn":
        # ---------------- scikit-learn path ----------------
        model_filepath = model_path+"/"+self._slug+"/model.pkl"
        x_train,x_test,y_train,y_test = self._dataframe_helper.get_train_test_data()
        # One-hot encode categoricals and align test columns with train.
        x_train = MLUtils.create_dummy_columns(x_train,[x for x in categorical_columns if x != result_column])
        x_test = MLUtils.create_dummy_columns(x_test,[x for x in categorical_columns if x != result_column])
        x_test = MLUtils.fill_missing_columns(x_test,x_train.columns,result_column)
        st = time.time()
        est = DecisionTreeRegressor()
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"training","info",display=True,emptyBin=False,customMsg=None,weightKey="total")
        if algoSetting.is_hyperparameter_tuning_enabled():
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {"name":hyperParamInitParam["evaluationMetric"]}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
            # Keep only grid keys the estimator actually supports.
            params_grid = algoSetting.get_params_dict_hyperparameter()
            params_grid = {k:v for k,v in params_grid.items() if k in est.get_params()}
            print params_grid
            if hyperParamAlgoName == "gridsearchcv":
                estGrid = GridSearchCV(est,params_grid)
                gridParams = estGrid.get_params()
                hyperParamInitParam = {k:v for k,v in hyperParamInitParam.items() if k in gridParams}
                estGrid.set_params(**hyperParamInitParam)
                estGrid.fit(x_train,y_train)
                bestEstimator = estGrid.best_estimator_
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                sklearnHyperParameterResultObj = SklearnGridSearchResult(estGrid.cv_results_,est,x_train,x_test,y_train,y_test,appType,modelFilepath,evaluationMetricDict=evaluationMetricDict)
                resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                self._result_setter.set_hyper_parameter_results(self._slug,resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug,{"ignoreList":sklearnHyperParameterResultObj.get_ignore_list(),"hideColumns":sklearnHyperParameterResultObj.get_hide_columns(),"metricColName":sklearnHyperParameterResultObj.get_comparison_metric_colname(),"columnOrder":sklearnHyperParameterResultObj.get_keep_columns()})
            elif hyperParamAlgoName == "randomsearchcv":
                # NOTE(review): estRand is never fitted and bestEstimator is
                # left as None — the predict() below will fail on this path.
                estRand = RandomizedSearchCV(est,params_grid)
                estRand.set_params(**hyperParamInitParam)
                bestEstimator = None
        else:
            evaluationMetricDict = {"name":GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            algoParams = algoSetting.get_params_dict()
            algoParams = {k:v for k,v in algoParams.items() if k in est.get_params().keys()}
            est.set_params(**algoParams)
            self._result_setter.set_hyper_parameter_results(self._slug,None)
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                kFoldClass = SkleanrKFoldResult(numFold,est,x_train,x_test,y_train,y_test,appType,evaluationMetricDict=evaluationMetricDict)
                kFoldClass.train_and_save_result()
                kFoldOutput = kFoldClass.get_kfold_result()
                bestEstimator = kFoldClass.get_best_estimator()
            elif validationDict["name"] == "trainAndtest":
                est.fit(x_train, y_train)
                bestEstimator = est
        trainingTime = time.time()-st
        y_score = bestEstimator.predict(x_test)
        # Regressors have no predict_proba; fall back to zeros.
        try:
            y_prob = bestEstimator.predict_proba(x_test)
        except:
            y_prob = [0]*len(y_score)
        featureImportance={}
        objs = {"trained_model":bestEstimator,"actual":y_test,"predicted":y_score,"probability":y_prob,"feature_importance":featureImportance,"featureList":list(x_train.columns),"labelMapping":{}}
        featureImportance = objs["trained_model"].feature_importances_
        featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
        # Persist the model under a generated name when not tuning.
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M"+"0"*(GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH-1)+"1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName+".pkl")
            joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
        metrics = {}
        metrics["r2"] = r2_score(y_test, y_score)
        metrics["mse"] = mean_squared_error(y_test, y_score)
        metrics["mae"] = mean_absolute_error(y_test, y_score)
        metrics["rmse"] = sqrt(metrics["mse"])
        # Prediction vs actual frame with difference and MAPE columns.
        transformed = pd.DataFrame({"prediction":y_score,result_column:y_test})
        transformed["difference"] = transformed[result_column] - transformed["prediction"]
        transformed["mape"] = np.abs(transformed["difference"])*100/transformed[result_column]
        sampleData = None
        nrows = transformed.shape[0]
        if nrows > 100:
            sampleData = transformed.sample(n=100,random_state=420)
        else:
            sampleData = transformed
        print sampleData.head()
        # MAPE histogram over the configured bins, sorted by bin lower edge.
        mapeCountArr = pd.cut(transformed["mape"],GLOBALSETTINGS.MAPEBINS).value_counts().to_dict().items()
        mapeStatsArr = [(str(idx),dictObj) for idx,dictObj in enumerate(sorted([{"count":x[1],"splitRange":(x[0].left,x[0].right)} for x in mapeCountArr],key = lambda x:x["splitRange"][0]))]
        # Quartile summary of predictions (sum / mean / count per bin).
        predictionColSummary = transformed["prediction"].describe().to_dict()
        quantileBins = [predictionColSummary["min"],predictionColSummary["25%"],predictionColSummary["50%"],predictionColSummary["75%"],predictionColSummary["max"]]
        print quantileBins
        quantileBins = sorted(list(set(quantileBins)))
        transformed["quantileBinId"] = pd.cut(transformed["prediction"],quantileBins)
        quantileDf = transformed.groupby("quantileBinId").agg({"prediction":[np.sum,np.mean,np.size]}).reset_index()
        quantileDf.columns = ["prediction","sum","mean","count"]
        print quantileDf
        quantileArr = quantileDf.T.to_dict().items()
        quantileSummaryArr = [(obj[0],{"splitRange":(obj[1]["prediction"].left,obj[1]["prediction"].right),"count":obj[1]["count"],"mean":obj[1]["mean"],"sum":obj[1]["sum"]}) for obj in quantileArr]
        print quantileSummaryArr
        runtime = round((time.time() - st_global),2)
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("DTREE Regression")
        self._model_summary.set_algorithm_display_name("Decision Tree Regression")
        self._model_summary.set_slug(self._slug)
        # NOTE(review): set_training_time called twice; trainingTime wins.
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(bestEstimator.get_params())
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.to_dict())
        self._model_summary.set_feature_importance(featuresArray)
        self._model_summary.set_feature_list(list(x_train.columns))
        # Best-effort PMML export; failures are deliberately swallowed.
        try:
            pmml_filepath = str(model_path)+"/"+str(self._slug)+"/traindeModel.pmml"
            modelPmmlPipeline = PMMLPipeline([
                ("pretrained-estimator", objs["trained_model"])
            ])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr = True)
            pmmlfile = open(pmml_filepath,"r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug:pmmlText})
        except:
            pass
    # Build the dropdown/summary JSON. NOTE(review): indentation reconstructed —
    # `modelName` / `resultArray` exist only on sklearn sub-paths, so on the
    # Spark path one of these references would raise NameError; confirm where
    # this tail block lived in the original file.
    if not algoSetting.is_hyperparameter_tuning_enabled():
        modelDropDownObj = {
            "name":self._model_summary.get_algorithm_name(),
            "evaluationMetricValue":self._model_summary.get_model_accuracy(),
            "evaluationMetricName":"r2",
            "slug":self._model_summary.get_slug(),
            "Model Id":modelName
        }
        modelSummaryJson = {
            "dropdown":modelDropDownObj,
            "levelcount":self._model_summary.get_level_counts(),
            "modelFeatureList":self._model_summary.get_feature_list(),
            "levelMapping":self._model_summary.get_level_map_dict(),
            "slug":self._model_summary.get_slug(),
            "name":self._model_summary.get_algorithm_name()
        }
    else:
        modelDropDownObj = {
            "name":self._model_summary.get_algorithm_name(),
            "evaluationMetricValue":resultArray[0]["R-Squared"],
            "evaluationMetricName":"r2",
            "slug":self._model_summary.get_slug(),
            "Model Id":resultArray[0]["Model Id"]
        }
        modelSummaryJson = {
            "dropdown":modelDropDownObj,
            "levelcount":self._model_summary.get_level_counts(),
            "modelFeatureList":self._model_summary.get_feature_list(),
            "levelMapping":self._model_summary.get_level_map_dict(),
            "slug":self._model_summary.get_slug(),
            "name":self._model_summary.get_algorithm_name()
        }
    # Publish narrative cards and the final model summary.
    dtreerCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
    for card in dtreerCards:
        self._prediction_narrative.add_a_card(card)
    self._result_setter.set_model_summary({"dtreeregression":json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
    # (method name "summart" is a typo in the result-setter API itself)
    self._result_setter.set_dtree_regression_model_summart(modelSummaryJson)
    self._result_setter.set_dtreer_cards(dtreerCards)
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"completion","info",display=True,emptyBin=False,customMsg=None,weightKey="total")
# Assemble the time-slot features into a single 'features' vector column.
assembler = VectorAssembler(inputCols=[
    "slot_id", "day_of_week", "day_of_month", "week_nb", "hour", "minute"
], outputCol='features')
output_training = assembler.transform(df_training)
output_testing = assembler.transform(df_testing)
# Keep only the model inputs: the feature vector and the 'demand' label.
final_data_training = output_training.select('features', 'demand')
final_data_testing = output_testing.select('features', 'demand')
#final_data_training.describe().show()
#final_data_testing.describe().show()
""" Model and predictions : """
# Shallow tree (maxDepth=3) predicting 'demand'; the default featuresCol is
# 'features', which matches the assembler output above.
decisionTree = DecisionTreeRegressor(labelCol='demand', maxDepth=3)
dt_model = decisionTree.fit(final_data_training)
predictions = dt_model.transform(final_data_testing)
#print("Decision tree model max depth = %g" % decisionTree.getMaxDepth())
#print(dt_model.toDebugString)
""" Evaluation rmse : """
# RMSE on the held-out split, accumulated into errorsRMSE (defined upstream).
evaluatorRMSE = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="rmse")
rmse = evaluatorRMSE.evaluate(predictions)
errorsRMSE.append(rmse)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
# R2 with the same label/prediction columns; presumably appended to an
# errors list after this span — confirm against the surrounding file.
evaluatorR2 = RegressionEvaluator(labelCol="demand", predictionCol="prediction", metricName="r2")
r2 = evaluatorR2.evaluate(predictions)
assembler = VectorAssembler().setInputCols( ['HouseAge', 'DistanceToMRT', 'NumberConvenienceStores']).setOutputCol('features') df = assembler.transform(data).select('PriceOfUnitArea', 'features') # Let's split our data into training data and testing data trainTest = df.randomSplit([0.5, 0.5]) trainingDF = trainTest[0] testDF = trainTest[1] # Now create our linear regression model dtr = DecisionTreeRegressor().setFeaturesCol('features').setLabelCol( 'PriceOfUnitArea') # Train the model using our training data model = dtr.fit(trainingDF) # Now see if we can predict values in our test data. # Generate predictions using our linear regression model for all features in our # test dataframe: fullPredictions = model.transform(testDF).cache() # Extract the predictions and the "known" correct labels. predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0]) labels = fullPredictions.select("label").rdd.map(lambda x: x[0]) # Zip them together predictionAndLabel = predictions.zip(labels).collect() # Print out the predicted and actual values for each point for prediction in predictionAndLabel: