def build_decision_tree_regression(observation_df, feature_columns):
    """Fit a decision tree regressor for ``duration_sec`` and report test metrics.

    The input is passed through ``create_feature_column`` to obtain a
    ``features`` column, split 70/30 (unseeded) into train and test sets, and
    the fitted model is scored on the test set with RMSE and R2, both printed.

    :param observation_df: observations handed to ``create_feature_column``.
    :param feature_columns: feature column names handed to
        ``create_feature_column``.
    :return: the fitted decision tree regression model.
    """
    label = "duration_sec"

    # Create new column with all of the features
    vectorised_df = create_feature_column(
        observation_df, feature_columns, ['features', 'duration_sec'])
    train_df, test_df = vectorised_df.randomSplit([0.7, 0.3])

    regressor = DecisionTreeRegressor(featuresCol="features", labelCol=label)
    model = regressor.fit(train_df)

    test_predictions = model.transform(test_df)
    preview = test_predictions.select("prediction", "duration_sec", "features")
    preview.show(5)

    rmse_evaluator = RegressionEvaluator(
        predictionCol='prediction', labelCol=label, metricName="rmse")
    rmse = rmse_evaluator.evaluate(test_predictions)
    print("RMSE on test data = %g" % rmse)

    r2_evaluator = RegressionEvaluator(
        predictionCol='prediction', labelCol=label, metricName="r2")
    r2 = r2_evaluator.evaluate(test_predictions)
    print("R2 on test data = %g" % r2)

    return model
def predict_price_of_unit_area_by_decision_tree(
        real_estate_dataset_df: DataFrame):
    """
    Predict the price per unit area based on house age, distance to MRT
    (public transportation) and number of convenience stores, using decision
    tree regression.

    The dataset is converted with ``transform_dataset_to_label_feature_form``
    and split 50/50 (unseeded) into training and testing halves.

    :param real_estate_dataset_df: raw real-estate dataset.
    :return: cached DataFrame with ``actual_price`` and ``predicted_price``
        (prediction rounded to 2 decimals), ordered by ``actual_price``
        descending.
    """
    prepared_df = transform_dataset_to_label_feature_form(
        real_estate_dataset_df)
    train_dataset, test_dataset = prepared_df.randomSplit([0.5, 0.5])

    # The label column is 'actual_price' instead of the default "label".
    regressor = DecisionTreeRegressor()
    regressor = regressor.setLabelCol('actual_price')
    model = regressor.fit(train_dataset)

    # Create predictions for testing dataset.
    scored = model.transform(test_dataset)
    predicted_price = func.round(func.col('prediction'), 2).alias(
        'predicted_price')
    selected = scored.select('actual_price', predicted_price)
    predictions = selected.orderBy(func.desc('actual_price')).cache()
    return predictions
def dtRegression(df, conf):
    """
    Fit a decision tree regression pipeline, optionally with ml-tuning.

    input  : df [spark.dataframe], conf [configuration params]
             conf["params"] -- DecisionTreeRegressor settings (featuresCol,
                 impurity, maxDepth, maxBins, minInstancesPerNode,
                 minInfoGain, maxMemoryInMB, cacheNodeIds,
                 checkpointInterval, seed, varianceCol)
             conf["tuning"] -- None/empty for a plain fit, otherwise a dict:
                 "method"      : "crossval" or "trainvalsplit"
                                 (case-insensitive)
                 "paramGrids"  : {parameter name or Param: candidate values}
                 "methodParam" : number of folds (crossval) or train ratio
                                 (trainvalsplit)
    output : decisiontree_regression model [model]
    raises : ValueError if conf["tuning"]["method"] is not supported
    """
    params = conf["params"]
    settings = {
        "featuresCol": params.get("featuresCol"),
        "impurity": params.get("impurity", "variance"),
        "maxDepth": params.get("maxDepth", 5),
        "maxBins": params.get("maxBins", 32),
        "minInstancesPerNode": params.get("minInstancesPerNode", 1),
        "minInfoGain": params.get("minInfoGain", 0.0),
        "maxMemoryInMB": params.get("maxMemoryInMB", 256),
        "cacheNodeIds": params.get("cacheNodeIds", False),
        "checkpointInterval": params.get("checkpointInterval", 10),
        "seed": params.get("seed", None),
        "varianceCol": params.get("varianceCol", None),
    }
    # Only forward values that are actually set: an explicit None would be
    # stored as a user-supplied param instead of falling back to the default.
    dt = DecisionTreeRegressor(
        **{name: value for name, value in settings.items()
           if value is not None})

    # NOTE(review): featureIndexer is not defined in this function; it must be
    # available at module scope, otherwise this line raises NameError.
    pipeline = Pipeline(stages=[featureIndexer, dt])
    print("maxDepth : ", dt.getMaxDepth())

    tuning = conf.get("tuning")

    # Without ml-tuning: plain pipeline fit.
    if not tuning:
        print("test")
        return pipeline.fit(df)

    # With ml-tuning: validate the method before doing any work.
    method = tuning.get("method").lower()
    if method not in ("crossval", "trainvalsplit"):
        raise ValueError(
            "Unsupported tuning method: %r "
            "(expected 'crossval' or 'trainvalsplit')" % method)

    # addGrid expects Param objects; resolve plain names against the regressor.
    builder = ParamGridBuilder()
    param_grids = tuning.get("paramGrids")
    for key in param_grids:
        param = dt.getParam(key) if isinstance(key, str) else key
        builder.addGrid(param, param_grids[key])
    grid = builder.build()

    evaluator = RegressionEvaluator()
    if method == "crossval":
        # ml-tuning with cross validation
        tuner = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               numFolds=tuning.get("methodParam"))
    else:
        # ml-tuning with train validation split
        tuner = TrainValidationSplit(estimator=pipeline,
                                     estimatorParamMaps=grid,
                                     evaluator=evaluator,
                                     trainRatio=tuning.get("methodParam"))
    return tuner.fit(df)
def task_7(data_io, train_data, test_data):
    """Train a depth-5 decision tree on ``overall`` and report the test RMSE.

    :param data_io: object whose ``save(res, 'task_7')`` persists the result.
    :param train_data: DataFrame used to fit the regressor.
    :param test_data: DataFrame the fitted model is evaluated on.
    :return: dict with the single key ``'test_rmse'``.
    """
    # ---------------------- Your implementation begins------------------------
    regressor = DecisionTreeRegressor(maxDepth=5,
                                      featuresCol="features",
                                      labelCol="overall")
    fitted = regressor.fit(train_data)
    scored = fitted.transform(test_data)
    rmse_evaluator = RegressionEvaluator(metricName="rmse",
                                         predictionCol="prediction",
                                         labelCol="overall")
    rmse = rmse_evaluator.evaluate(scored)
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {'test_rmse': rmse}
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_7')
    return res
def test_decision_tree_regressor(self):
    """Convert a fitted DecisionTreeRegressor with ``convert_sparkml`` and
    compare the ONNX output against the Spark predictions."""
    feature_matrix = numpy.array([[0, 1], [1, 1], [2, 0]],
                                 dtype=numpy.float32)
    targets = [100, -10, 50]
    rows = [(target, Vectors.dense(row))
            for target, row in zip(targets, feature_matrix)]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(rows),
        schema=["label", "features"])

    model = DecisionTreeRegressor(labelCol="label",
                                  featuresCol="features").fit(data)

    # Width of the feature vector, taken from the first row.
    feature_count = data.select('features').first()[0].size
    input_types = [('features', FloatTensorType([None, feature_count]))]
    model_onnx = convert_sparkml(model,
                                 'Sparkml Decision Tree Regressor',
                                 input_types,
                                 spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    feature_frame = data.toPandas().features.apply(
        lambda vec: pandas.Series(vec.toArray()))
    data_np = feature_frame.values.astype(numpy.float32)
    predicted = model.transform(data)
    spark_predictions = predicted.toPandas().prediction.values
    expected = [spark_predictions.astype(numpy.float32)]

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeRegressor")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def decision_tree_regression(train_data, test_data):
    """Fit a decision tree on ``MEDV`` and print test RMSE and feature importances.

    :param train_data: DataFrame used to fit the regressor.
    :param test_data: DataFrame the fitted model is evaluated on.
    """
    label = 'MEDV'
    dt_model = DecisionTreeRegressor(labelCol=label,
                                     featuresCol='features').fit(train_data)
    scored = dt_model.transform(test_data)

    rmse_evaluator = RegressionEvaluator(metricName='rmse',
                                         predictionCol='prediction',
                                         labelCol=label)
    rmse = rmse_evaluator.evaluate(scored)

    print('Root Mean Squared Error (RMSE) on test data = %g' % rmse)
    print(dt_model.featureImportances)
def task_8(data_io, train_data, test_data):
    """Pick the best tree depth on a validation split and report RMSEs.

    ``train_data`` is split 75/25 (unseeded) into training and validation
    parts. One regressor is fitted per depth in (5, 7, 9, 12); the model with
    the lowest validation RMSE (ties go to the larger depth) is evaluated on
    ``test_data``.

    :param data_io: object whose ``save(res, 'task_8')`` persists the result.
    :param train_data: DataFrame split into training and validation parts.
    :param test_data: DataFrame the selected model is evaluated on.
    :return: dict with ``test_rmse`` and one ``valid_rmse_depth_<d>`` per depth.
    """
    # ---------------------- Your implementation begins------------------------
    trainingData, validationData = train_data.randomSplit([0.75, 0.25])
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")

    valid_rmse = {}
    best_model = None
    # Start from +inf rather than a fixed cap so that a model is always
    # selected, whatever the scale of the label.
    lowest_rmse = float("inf")
    for depth in (5, 7, 9, 12):
        dt = DecisionTreeRegressor(labelCol="overall",
                                   featuresCol="features",
                                   maxDepth=depth)
        model = dt.fit(trainingData)
        rmse = evaluator.evaluate(model.transform(validationData))
        valid_rmse[depth] = rmse
        if best_model is None or rmse <= lowest_rmse:
            lowest_rmse = rmse
            best_model = model

    test_rmse = evaluator.evaluate(best_model.transform(test_data))
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'test_rmse': test_rmse,
        'valid_rmse_depth_5': valid_rmse[5],
        'valid_rmse_depth_7': valid_rmse[7],
        'valid_rmse_depth_9': valid_rmse[9],
        'valid_rmse_depth_12': valid_rmse[12],
    }
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_8')
    return res
def decisionTreeRegressor(data, ncolumns, schemaNames):
    """Fit a decision tree on binarized features and summarise the result.

    The ``features`` column is binarized (threshold 0.00001), the data is
    split 90/10 with seed 50, and a depth-10 tree is fitted on the training
    part.

    :param data: DataFrame with ``features`` and ``label`` columns.
    :param ncolumns: total column count; ``ncolumns - 1`` is used as the
        number of features.
    :param schemaNames: sequence of feature names indexed by feature position.
    :return: tuple ``(feat, rmse, areaUC, timer)`` -- names of the three most
        important features, test RMSE, the BinaryClassificationEvaluator
        metric computed on the ``prediction`` column, and the training time
        in minutes.
    """
    import time

    import numpy as np
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.regression import DecisionTreeRegressor

    binarizer = Binarizer(
        threshold=0.00001,
        inputCol="features",
        outputCol="binarized_features",
    )
    binarizedDataFrame = binarizer.transform(data)
    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)

    dtr = DecisionTreeRegressor(labelCol="label",
                                featuresCol="binarized_features",
                                maxDepth=10,
                                maxBins=10,
                                impurity='Variance')

    start = time.time()
    cvModel = dtr.fit(trainingData)
    end = time.time()
    timer = ((end - start) / 60)  # minutes

    prediction = cvModel.transform(testData)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    # Expand the importances into a dense array so they can be ranked.
    fi = cvModel.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    idx = (-imp_feat).argsort()[:3]
    feat = [schemaNames[i] for i in idx]

    return feat, rmse, areaUC, timer
def score_dt(split_input_train_df, split_input_validation_df, model_evaluator):
    """Cross-validate a decision tree regressor and record its validation score.

    Appends the validation metric to the global ``model_rmse``, increments the
    global ``model_count`` and stores the fitted cross-validator model under
    ``model_dict[model_count]["DT"]``.

    :param split_input_train_df: DataFrame used for 3-fold cross validation.
    :param split_input_validation_df: DataFrame the fitted model is scored on.
    :param model_evaluator: evaluator used both for tuning and for scoring.
    """
    global model_rmse, model_dict, model_count

    print(
        "###################### Decision Tree Regression #########################"
    )
    dt_regressor = DecisionTreeRegressor(
        labelCol='total_delivery_duration', featuresCol='features')

    print("CrossValidation...")
    grid_builder = ParamGridBuilder()
    grid_builder.addGrid(dt_regressor.maxBins, [5700, 6000])
    grid_builder.addGrid(dt_regressor.maxMemoryInMB, [256, 512])
    dt_paramGrid = grid_builder.build()
    dt_cross_val = CrossValidator(numFolds=3,
                                  evaluator=model_evaluator,
                                  estimatorParamMaps=dt_paramGrid,
                                  estimator=dt_regressor)
    print("Done")

    print("Fitting training data...")
    dt_cv_model = dt_cross_val.fit(split_input_train_df)
    print("Done")

    print("Evaluating on validation data...")
    validation_predictions = dt_cv_model.transform(split_input_validation_df)
    rmse = model_evaluator.evaluate(validation_predictions)

    model_rmse.append(rmse)
    model_count += 1
    model_dict[model_count] = {"DT": dt_cv_model}
    print("RMSE on validation data: %f" % rmse)
def model_train(input, model_path):
    """Train, save and score a decision tree pipeline that predicts ``tmax``.

    The CSV at ``input`` is read with a fixed schema and split 75/25
    (unseeded). The pipeline joins each observation with the same station's
    previous-day ``tmax``, assembles the feature vector and fits the tree.

    :param input: path of the CSV data to read.
    :param model_path: location the fitted pipeline model is written to
        (overwritten if present).
    """
    field_specs = [
        ('station', types.StringType()),
        ('date', types.DateType()),
        ('latitude', types.FloatType()),
        ('longitude', types.FloatType()),
        ('elevation', types.FloatType()),
        ('tmax', types.FloatType()),
    ]
    tmax_schema = types.StructType(
        [types.StructField(name, dtype) for name, dtype in field_specs])

    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = [part.cache()
                         for part in data.randomSplit([0.75, 0.25])]

    # Self-join: pair every row with the previous day's row of the same station.
    sql_query = (
        "SELECT today.latitude, today.longitude, today.elevation, "
        "dayofyear(today.date) AS dy,yesterday.tmax AS yesterday_tmax, "
        "today.tmax FROM __THIS__ as today "
        "INNER JOIN __THIS__ as yesterday "
        "ON date_sub(today.date, 1) = yesterday.date "
        "AND today.station = yesterday.station"
    )
    stages = [
        SQLTransformer(statement=sql_query),
        VectorAssembler(
            inputCols=['latitude', 'longitude', 'elevation', 'dy',
                       'yesterday_tmax'],
            outputCol='features'),
        DecisionTreeRegressor(featuresCol='features', labelCol='tmax'),
    ]
    model = Pipeline(stages=stages).fit(train)
    model.write().overwrite().save(model_path)

    # Scoring the model
    prediction = model.transform(validation)
    rmse_evaluator = RegressionEvaluator(metricName='rmse',
                                         labelCol='tmax',
                                         predictionCol='prediction')
    score = rmse_evaluator.evaluate(prediction)
    print("Score of the weather model is", score)
def trainAndEvalModelByDecisionTreeRegressor(stages, train_df, test_df,
                                             evaluator):
    '''
    Build an ML Pipeline ending in a DecisionTreeRegressor, then train and
    evaluate it.

    :param stages: list of preprocessing pipeline stages; the regressor is
        appended after them.
    :param train_df: DataFrame the pipeline is fitted on.
    :param test_df: DataFrame the fitted pipeline is evaluated on.
    :param evaluator: evaluator applied to the test predictions.
    :return: tuple (fitted tree model, prediction DataFrame, evaluator metric)
    '''
    print(
        '======================= 使用 DecisionTreeRegressor 建立 ML Pipeline 流程进行模型训练 ======================='
    )
    dt = DecisionTreeRegressor(labelCol='cnt', featuresCol='features')
    dtPipeline = Pipeline(stages=stages + [dt])
    dtPipelineModel = dtPipeline.fit(train_df)
    # The regressor was appended last, so its fitted model is the last stage
    # regardless of how many preprocessing stages were supplied.
    bestModel = dtPipelineModel.stages[-1]

    print(
        '======================= 使用 DecisionTreeRegressor 建立 ML Pipeline 流程进行模型训练后,使用模型进行预测 ======================='
    )
    predicts = dtPipelineModel.transform(test_df)
    predicts.select('season', 'mnth', 'hr', 'holiday', 'weekday',
                    'workingday', 'weathersit', 'temp', 'atemp', 'hum',
                    'windspeed', 'cnt', 'prediction').show(10)

    rmse = evaluator.evaluate(predicts)
    print(
        '======================= 使用 DecisionTreeRegressor 建立 ML Pipeline 流程进行模型训练后,评估模型准确率(rmse='
        + str(rmse) + ') =======================')
    return (bestModel, predicts, rmse)
def prepare_spark_pipeline_for_DT():
    """Build the cross-validated decision tree pipeline and its evaluator.

    The pipeline indexes ``price`` into ``label``, assembles the module-level
    ``features`` columns, standard-scales them and feeds a
    DecisionTreeRegressor. The grid varies ``maxDepth`` over 8 and 16.

    :return: tuple ``(CrossValidator, RegressionEvaluator)``; the evaluator
        uses the ``mse`` metric and is the one the cross-validator tunes with.
    """
    print('----------Preparing spark pipeline for DT----------')

    DT_model = DecisionTreeRegressor(maxDepth=8)
    pipeline = Pipeline(stages=[
        StringIndexer(inputCol="price", outputCol="label",
                      handleInvalid="keep"),
        VectorAssembler(inputCols=features, outputCol="unscaled_features"),
        StandardScaler(inputCol="unscaled_features", outputCol="features"),
        DT_model,
    ])

    grid_builder = ParamGridBuilder()
    grid_builder.addGrid(DT_model.maxDepth, [8, 16])
    grid_builder.addGrid(DT_model.impurity, ["variance"])
    estimator_param = grid_builder.build()

    evaluator = RegressionEvaluator(metricName="mse",
                                    predictionCol="prediction",
                                    labelCol="label")
    cross_validator = CrossValidator(numFolds=3,
                                     evaluator=evaluator,
                                     estimatorParamMaps=estimator_param,
                                     estimator=pipeline)
    return cross_validator, evaluator
def trainAndEvalModelByDecisionTreeRegressorAndCrossValidator(
        stages, train_df, test_df, evaluator):
    '''
    Build an ML Pipeline ending in a cross-validated DecisionTreeRegressor,
    train and validate it, and return the best model found.

    :param stages: list of preprocessing pipeline stages; the CrossValidator
        is appended after them.
    :param train_df: DataFrame the pipeline is fitted on.
    :param test_df: DataFrame the fitted pipeline is evaluated on.
    :param evaluator: evaluator used for tuning and for the final test score.
    :return: tuple (best tree model, prediction DataFrame, evaluator metric)
    '''
    print(
        '======================= 使用 DecisionTreeRegressor、CrossValidator 建立 ML Pipeline 流程进行模型训练 ======================='
    )
    dt = DecisionTreeRegressor(labelCol='cnt', featuresCol='features')
    # 4 * 4 = 16 parameter combinations. Impurity is left at its default and
    # is not tuned.
    # NOTE(review): the original comment says maxBins must exceed 24 because
    # the upstream VectorIndexer uses maxCategories=24 -- confirm against the
    # stages the caller passes in.
    paramGrid = ParamGridBuilder().addGrid(
        dt.maxDepth, [5, 10, 15, 25]).addGrid(
            dt.maxBins, [25, 35, 45, 50]).build()
    cv = CrossValidator(estimator=dt,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid,
                        numFolds=3)
    cvPipeline = Pipeline(stages=stages + [cv])
    cvPipelineModel = cvPipeline.fit(train_df)
    # The CrossValidator was appended last, so its fitted model is the last
    # stage regardless of how many preprocessing stages were supplied.
    bestModel = cvPipelineModel.stages[-1].bestModel

    print(
        '======================= 使用 DecisionTreeRegressor、CrossValidator 建立 ML Pipeline 流程进行模型训练后,使用模型进行预测 ======================='
    )
    predicts = cvPipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicts)
    print(
        '======================= 使用 DecisionTreeRegressor、CrossValidator 建立 ML Pipeline 流程进行模型训练后,评估模型准确率(rmse='
        + str(rmse) + ') =======================')
    return (bestModel, predicts, rmse)
def create_model(training_data, features_col, label_col, max_bins=32):
    """
    Fit a cross-validated decision tree regressor.

    :param training_data: -- dataframe: training dataset
    :param features_col: -- col: name of the column holding the feature vector
    :param label_col: -- col: name of the label column
    :param max_bins: -- integer: values up to 32 keep ``maxBins`` at 32,
        larger values use ``max_bins + 1``
    :return: fitted cross-validator model and the evaluator it was tuned with
    """
    # Create Decision Tree Model
    tree = DecisionTreeRegressor()

    bins = max_bins + 1 if max_bins > 32 else 32

    # Fixed column settings plus the depth/bins grid.
    grid_builder = ParamGridBuilder()
    grid_builder.baseOn({tree.featuresCol: features_col})
    grid_builder.baseOn({tree.labelCol: label_col})
    grid_builder.addGrid(tree.maxDepth, [3, 5, 7])
    grid_builder.addGrid(tree.maxBins, [bins])
    params = grid_builder.build()

    # Model Evaluator
    dt_evaluator = RegressionEvaluator(labelCol=label_col)

    # Cross validation picks the best parameter combination.
    cross_validator = CrossValidator(evaluator=dt_evaluator,
                                     estimatorParamMaps=params,
                                     estimator=tree)
    dt_cv_model = cross_validator.fit(training_data)
    return dt_cv_model, dt_evaluator
def dtr(self):
    """Run the decision tree regression example on the sample libsvm data.

    Loads ``sample_libsvm_data.txt`` from under ``self.dataDir``, fits a
    VectorIndexer + DecisionTreeRegressor pipeline on a 70% split, and prints
    five predictions, the test RMSE and the fitted tree model.
    """
    # Load and parse the data file, converting it to a DataFrame.
    source_path = self.dataDir + "/data/mllib/sample_libsvm_data.txt"
    data = self.session.read.format("libsvm").load(source_path)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as
    # continuous.
    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)
    featureIndexer = indexer.fit(data)

    # Split the data into training and test sets (30% held out for testing)
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Chain the indexer and the decision tree in a Pipeline.
    tree = DecisionTreeRegressor(featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, tree])

    # Train model. This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions and display a few example rows.
    predictions = model.transform(testData)
    predictions.select("prediction", "label", "features").show(5)

    # Compute the test error from (prediction, true label).
    rmse_evaluator = RegressionEvaluator(metricName="rmse",
                                         predictionCol="prediction",
                                         labelCol="label")
    rmse = rmse_evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    tree_model = model.stages[1]
    print(tree_model)  # summary only
def train(data, max_depth, max_bins, model_name, log_as_mleap, log_as_onnx):
    """Train a decision tree pipeline and log it to MLflow.

    The data is split 70/30 with seed 42. Parameters, the rmse/r2/mae metrics
    on the test split and the fitted Spark model are logged to MLflow; the
    model is optionally also logged as MLeap and ONNX.

    :param data: DataFrame; every column except the last is assembled into
        the feature vector.
    :param max_depth: ``maxDepth`` of the tree.
    :param max_bins: ``maxBins`` of the tree.
    :param model_name: registered model name; a falsy value skips registration.
    :param log_as_mleap: when truthy, also log an MLeap flavour and its schema.
    :param log_as_onnx: when truthy, also log an ONNX model via ``onnx_utils``.
    """
    (trainingData, testData) = data.randomSplit([0.7, 0.3], 42)
    print("testData.schema:")
    testData.printSchema()

    # MLflow - log parameters
    print("Parameters:")
    print("  max_depth:", max_depth)
    print("  max_bins:", max_bins)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)

    # Create pipeline
    dt = DecisionTreeRegressor(labelCol=colLabel,
                               featuresCol=colFeatures,
                               maxDepth=max_depth,
                               maxBins=max_bins)
    assembler = VectorAssembler(inputCols=data.columns[:-1],
                                outputCol=colFeatures)
    pipeline = Pipeline(stages=[assembler, dt])

    # Fit model and predict (the test split is transformed once and reused
    # for every metric below)
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    # MLflow - log metrics
    print("Metrics:")
    for metric_name in ["rmse", "r2", "mae"]:
        evaluator = RegressionEvaluator(labelCol=colLabel,
                                        predictionCol=colPrediction,
                                        metricName=metric_name)
        metric_value = evaluator.evaluate(predictions)
        print(f"  {metric_name}: {metric_value}")
        mlflow.log_metric(metric_name, metric_value)

    # MLflow - log spark model
    mlflow.spark.log_model(
        model, "spark-model",
        registered_model_name=None if not model_name else f"{model_name}")

    # MLflow - log as MLeap model
    if log_as_mleap:
        scoreData = testData.drop("quality")
        mlflow.mleap.log_model(
            spark_model=model,
            sample_input=scoreData,
            artifact_path="mleap-model",
            registered_model_name=(None if not model_name
                                   else f"{model_name}_mleap"))

        # Log MLeap schema file for MLeap runtime deserialization
        schema_path = "schema.json"
        with open(schema_path, 'w') as f:
            f.write(scoreData.schema.json())
        print("schema_path:", schema_path)
        mlflow.log_artifact(schema_path, "mleap-model")

    # MLflow - log as ONNX model
    if log_as_onnx:
        import onnx_utils
        scoreData = testData.drop("quality")
        onnx_utils.log_model(spark, model, "onnx-model", model_name, scoreData)
def model_define(self):
    """Create the decision tree regression estimator.

    No hyperparameters are applied here; the estimator is built with its
    constructor defaults.

    Returns:
        (pyspark.ml.regression.DecisionTreeRegressor): Decision Tree
        Regression model
    """
    estimator = DecisionTreeRegressor()
    return estimator
def test_decisiontree_regressor(self):
    """Save a DecisionTreeRegressor, reload it, and check that the uid and
    default params survive the round trip."""
    dt = DecisionTreeRegressor(maxDepth=1)
    path = tempfile.mkdtemp()
    try:
        dtr_path = path + "/dtr"
        dt.save(dtr_path)
        # Reload through the regressor's own loader (the estimator under test).
        dt2 = DecisionTreeRegressor.load(dtr_path)
        self.assertEqual(dt2.uid, dt2.maxDepth.parent,
                         "Loaded DecisionTreeRegressor instance uid (%s) "
                         "did not match Param's uid (%s)"
                         % (dt2.uid, dt2.maxDepth.parent))
        self.assertEqual(dt._defaultParamMap[dt.maxDepth],
                         dt2._defaultParamMap[dt2.maxDepth],
                         "Loaded DecisionTreeRegressor instance default params did not match " +
                         "original defaults")
    finally:
        # Best-effort cleanup that also runs when an assertion fails.
        try:
            rmtree(path)
        except OSError:
            pass
def DT_Algorithm(tr, te, featureIndexer):
    """Set up a decision tree regression pipeline with a parameter grid and
    pass both to ``predict``.

    :param tr: first dataset argument forwarded to ``predict``.
    :param te: second dataset argument forwarded to ``predict``.
    :param featureIndexer: pipeline stage placed before the regressor.
    """
    # Train a DecisionTree model.
    tree = DecisionTreeRegressor(labelCol='positive_rating_ratio',
                                 featuresCol="indexedFeatures")

    # Chain indexer and tree in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, tree])

    grid_builder = ParamGridBuilder()
    grid_builder.addGrid(tree.maxDepth, [5, 10, 15])
    grid_builder.addGrid(tree.minInstancesPerNode, [1, 5, 10])
    paramGrid = grid_builder.build()

    print("---------------------Decision Tree Regression---------------------")
    predict(tr, te, pipeline, paramGrid, False)
def estimator_pipeline(train_dataframe, test_dataframe):
    """Train four regressors on ``ctr``, score them on the test set and save them.

    Each estimator (linear regression, decision tree, random forest, GBT) is
    wrapped in a two-stage pipeline (vector assembler + estimator). The fitted
    pipelines are saved as ``LR_model``, ``DT_model``, ``RF_model`` and
    ``GB_model``.

    :param train_dataframe: training DataFrame; all columns except the last
        are used as features.
    :param test_dataframe: DataFrame the fitted pipelines are evaluated on.
    :return: tuple ``(models_, RMSE_result)`` -- fitted pipeline models and
        their test RMSE rounded to 4 decimals, in the order LR, DT, RF, GB.
    """
    random.seed(0)

    # features vector
    vector = VectorAssembler(inputCols=train_dataframe.columns[:-1],
                             outputCol='features')

    # LR estimator with the parameters given in the assignment
    estimator_LR = LinearRegression(featuresCol='features',
                                    labelCol='ctr',
                                    maxIter=40,
                                    regParam=0.4,
                                    elasticNetParam=0.8)

    # the other estimators use default parameters
    estimator_DT = DecisionTreeRegressor(featuresCol='features',
                                         labelCol='ctr')
    estimator_RF = RandomForestRegressor(featuresCol='features',
                                         labelCol='ctr')
    estimator_GB = GBTRegressor(featuresCol='features', labelCol='ctr')

    # evaluator
    RMSE_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='ctr',
                                         metricName='rmse')

    # fitted models and their scores are collected in lists
    models_ = []
    RMSE_result = []

    for est_r in [estimator_LR, estimator_DT, estimator_RF, estimator_GB]:
        # two-stage training pipeline (a real one would add cleaning and
        # preprocessing stages)
        pipeline = Pipeline(stages=[vector, est_r])
        model = pipeline.fit(train_dataframe)
        models_.append(model)

        # Score the model that was just fitted (and will be saved) instead of
        # fitting the pipeline a second time.
        prediction = model.transform(test_dataframe)
        RMSE = round(RMSE_evaluator.evaluate(prediction), 4)
        RMSE_result.append(RMSE)

    # Save under fixed names rather than the estimator uid, which changes on
    # every run and could not be used to load the models later.
    model_names = ['LR_model', 'DT_model', 'RF_model', 'GB_model']
    for model, name in zip(models_, model_names):
        model.save(name)

    return models_, RMSE_result
def TrainDT(trainingData, testData):
    """Fit a default DecisionTreeRegressor and print RMSE/R2 on test and train data.

    :param trainingData: DataFrame the model is fitted on.
    :param testData: DataFrame used for the test metrics.
    :return: the fitted model.
    """
    # Train a DecisionTree model, timing the fit.
    tree = DecisionTreeRegressor()
    started = time.time()
    model = tree.fit(trainingData)
    finished = time.time()
    print('Training DT model took', finished - started)

    rmse_evaluator = RegressionEvaluator(metricName="rmse",
                                         predictionCol="prediction",
                                         labelCol="label")
    r2_evaluator = RegressionEvaluator(metricName="r2",
                                       predictionCol="prediction",
                                       labelCol="label")

    # Metrics on the test data.
    test_predictions = model.transform(testData)
    rmse = rmse_evaluator.evaluate(test_predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    r2 = r2_evaluator.evaluate(test_predictions)
    print("R2 on test data = %g" % r2)

    # Metrics on the training data.
    train_predictions = model.transform(trainingData)
    rmse = rmse_evaluator.evaluate(train_predictions)
    print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)
    r2 = r2_evaluator.evaluate(train_predictions)
    print("R2 on train data = %g" % r2)

    return model
def decision_tree_regression(trainingDataFrame, maxCategories=4):
    """Fit a VectorIndexer + DecisionTreeRegressor pipeline.

    :param trainingDataFrame: DataFrame with a ``features`` column; used both
        to fit the indexer and to fit the pipeline.
    :param maxCategories: ``maxCategories`` passed to the VectorIndexer.
    :return: dict with ``"model"`` (the fitted pipeline model) and
        ``"summary"`` (its second stage, the fitted tree).
    """
    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=maxCategories)
    featureIndexer = indexer.fit(trainingDataFrame)

    tree = DecisionTreeRegressor(featuresCol="indexedFeatures")
    dtModel = Pipeline(stages=[featureIndexer, tree]).fit(trainingDataFrame)

    return {"model": dtModel, "summary": dtModel.stages[1]}
def get_best_weather_model(data):
    """Fit several regressors and return the one scoring best on held-out data.

    The data is split 75/25 (unseeded) and both parts are cached. Each
    estimator/grid pair is turned into a trainer by ``make_weather_trainers``,
    fitted on the training part and scored with the trainers' evaluator on
    the held-out part.

    :param data: DataFrame to split into train and test parts.
    :return: the fitted model with the best score.
    """
    train, test = (part.cache() for part in data.randomSplit([0.75, 0.25]))

    # e.g., use print(LinearRegression().explainParams()) to see what can be
    # tuned
    tree_grid = dict(
        maxDepth=[10],
        minInstancesPerNode=[2],
        minInfoGain=[0.5],
    )
    linear_grid = dict(
        regParam=[0.2],  # [0.1, 0.01]
        elasticNetParam=[.8],  # 0-L2, 1-L1
        aggregationDepth=[5],
        tol=[0.000005],
        maxIter=[100],
    )
    forest_grid = dict(
        featureSubsetStrategy=["onethird"],
        maxDepth=[10],
        numTrees=[40],
    )
    gbt_grid = dict(
        maxIter=[20],
        maxDepth=[10],
        lossType=['squared'],
    )
    estimator_gridbuilders = [
        estimator_gridbuilder(DecisionTreeRegressor(), tree_grid),
        estimator_gridbuilder(LinearRegression(), linear_grid),
        estimator_gridbuilder(RandomForestRegressor(), forest_grid),
        estimator_gridbuilder(GBTRegressor(), gbt_grid),
        # TODO: find better estimators
    ]

    metricName = 'r2'
    # First argument: per the original author's note, the fraction of data
    # for training.
    tvs_list = make_weather_trainers(.2, estimator_gridbuilders, metricName)

    ev = tvs_list[0].getEvaluator()
    # Flip the sign of "smaller is better" metrics so max() always picks the
    # best model.
    scorescale = 1 if ev.isLargerBetter() else -1

    model_name_scores = []
    for tvs in tvs_list:
        model = tvs.fit(train)
        signed_score = ev.evaluate(model.transform(test)) * scorescale
        estimator_name = get_estimator_name(tvs.getEstimator())
        model_name_scores.append((model, estimator_name, signed_score))

    best_model, best_name, best_score = max(
        model_name_scores, key=lambda entry: entry[2])
    print("Best model is %s with validation data %s score %f" %
          (best_name, ev.getMetricName(), best_score * scorescale))
    return best_model
def tree_builder(self, data, algorithm):
    """Build a tree-based regressor with its parameter grid and train it.

    :param data: data forwarded to ``self.learn_from_training_data``.
    :param algorithm: ``2`` for a decision tree, ``3`` for a random forest
        with 4 trees.
    :raises ValueError: if ``algorithm`` is neither 2 nor 3.
    """
    print(algorithm)
    if algorithm == 2:
        regressor = DecisionTreeRegressor(featuresCol='features',
                                          labelCol=TARGET_VARIABLE,
                                          impurity='variance')
        model_path = "decision_tree"
    elif algorithm == 3:
        regressor = RandomForestRegressor(featuresCol='features',
                                          labelCol=TARGET_VARIABLE,
                                          numTrees=4)
        model_path = "random_forest"
        print("[REG] number of trees: ", regressor.getNumTrees())
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            "Unsupported algorithm %r: expected 2 (decision tree) or "
            "3 (random forest)" % (algorithm,))

    param_grid = ParamGridBuilder() \
        .addGrid(regressor.maxDepth, MAX_DEPTH_OPTIONS) \
        .addGrid(regressor.maxBins, MAX_BINS_OPTIONS) \
        .build()

    # learning on training data
    self.learn_from_training_data(data, regressor, model_path, param_grid)
def test_pyspark_regression_decision_tree():
    """Check that SHAP values for PySpark tree regressors sum to the model output.

    The test is skipped (with a printed message) when the optional
    dependencies cannot be imported or a local Spark session cannot be
    created.
    """
    try:
        # numpy is imported first: it is needed while building the dataset
        # inside this try block.
        import numpy as np
        import pandas as pd
        import sklearn.datasets
        from pyspark import SparkConf
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.regression import (DecisionTreeRegressor,
                                           GBTRegressor,
                                           RandomForestRegressor)
        from pyspark.sql import SparkSession

        iris_sk = sklearn.datasets.load_iris()
        iris = pd.DataFrame(
            data=np.c_[iris_sk['data'], iris_sk['target']],
            columns=iris_sk['feature_names'] + ['target'])[:100]
        spark = SparkSession.builder.config(
            conf=SparkConf().set("spark.master", "local[*]")).getOrCreate()
    except Exception:
        print("Skipping test_pyspark_regression_decision_tree!")
        return

    import shap

    try:
        # Simple regressor: try to predict sepal length based on the other
        # features
        col = [
            "sepal_length", "sepal_width", "petal_length", "petal_width",
            "type"
        ]
        iris = spark.createDataFrame(iris, col).drop("type")
        iris = VectorAssembler(inputCols=col[1:-1],
                               outputCol="features").transform(iris)

        regressors = [
            GBTRegressor(labelCol="sepal_length", featuresCol="features"),
            RandomForestRegressor(labelCol="sepal_length",
                                  featuresCol="features"),
            DecisionTreeRegressor(labelCol="sepal_length",
                                  featuresCol="features")
        ]
        for regressor in regressors:
            model = regressor.fit(iris)
            explainer = shap.TreeExplainer(model)
            X = pd.DataFrame(
                data=iris_sk.data, columns=iris_sk.feature_names).drop(
                    columns='sepal length (cm)')[:100]  # pylint: disable=E1101

            shap_values = explainer.shap_values(X)
            expected_values = explainer.expected_value

            # validate values sum to the margin prediction of the model plus
            # expected_value
            predictions = model.transform(iris).select(
                "prediction").toPandas()
            diffs = expected_values + shap_values.sum(
                1) - predictions["prediction"]
            assert np.max(np.abs(
                diffs)) < 1e-4, "SHAP values don't sum to model output for class0!"
            assert (np.abs(expected_values - predictions.mean()) <
                    1e-1).all(), "Bad expected_value!"
    finally:
        # Always release the Spark session, even when an assertion fails.
        spark.stop()
def func2():
    """
    Train a decision tree regressor on hour.csv, then tune it with K-fold
    cross validation and print the tuned model's test RMSE.

    :return: None
    """
    hour_df = sqlContext.read.format("csv").option(
        "header", "true").load(Path + "hour.csv")
    # Drop the fields that are not needed
    hour_df = hour_df.drop("instant").drop("dteday").drop("yr").drop(
        "casual").drop("registered")
    # Cast every column to double
    hour_df = hour_df.select([
        col(column).cast("double").alias(column) for column in hour_df.columns
    ])
    # Split into train_df and test_df with a 0.7:0.3 ratio
    train_df, test_df = hour_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()

    # Feature columns: everything except the last column
    featureCols = hour_df.columns[:-1]

    # Build the pipeline
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="aFeatures")
    vectorIndexer = VectorIndexer(inputCol="aFeatures",
                                  outputCol="features",
                                  maxCategories=24)
    dt = DecisionTreeRegressor(labelCol="cnt", featuresCol="features")
    dt_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, dt])

    # Train the baseline model and predict on the test split
    dt_pipelineModel = dt_pipeline.fit(dataset=train_df)
    predicted_df = dt_pipelineModel.transform(test_df)

    # Evaluate the baseline model (the value is computed but not reported)
    evaluator = RegressionEvaluator(labelCol="cnt",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predicted_df)

    # Cross validation to find the best model.
    # - "variance" is the only impurity a regression tree accepts.
    # - maxBins must be at least the number of categories of any categorical
    #   feature; the VectorIndexer above allows up to 24.
    paramGrid = ParamGridBuilder().addGrid(
        dt.impurity, ["variance"]).addGrid(
            dt.maxDepth, [5, 10, 15]).addGrid(
                dt.maxBins, [25, 35, 45]).build()
    cv = CrossValidator(estimator=dt,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid,
                        numFolds=3)
    cv_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
    cv_pipelineModel = cv_pipeline.fit(dataset=train_df)

    # Predict with the best model
    predictions = cv_pipelineModel.transform(test_df)
    rmse2 = evaluator.evaluate(predictions)
    print(rmse2)
def decision_tree_regressor(spark, original_label_col, feature_col_names):
    """Run a decision tree regressor through ``run_model`` with an MAE evaluator.

    :param spark: forwarded unchanged to ``run_model``.
    :param original_label_col: forwarded unchanged to ``run_model``.
    :param feature_col_names: forwarded unchanged to ``run_model``.
    :return: whatever ``run_model`` returns.
    """
    # Names passed to run_model for the 'label' and 'features' columns.
    label_col = "label"
    vector_col = "features"

    estimator = DecisionTreeRegressor(featuresCol="indexedFeatures",
                                      labelCol="indexedLabel")
    mae_evaluator = RegressionEvaluator(metricName="mae",
                                        predictionCol="prediction",
                                        labelCol="indexedLabel")
    evaluators = [mae_evaluator]
    return run_model(spark, original_label_col, label_col, vector_col,
                     feature_col_names, estimator, evaluators)
def DTR(df_data):
    """Fit an indexer + DecisionTreeRegressor pipeline and print the elapsed time.

    :param df_data: DataFrame used both for ``data.feature_indexer`` and for
        fitting the pipeline.
    :return: the fitted pipeline model.
    """
    # Train a DecisionTree model.
    print("Train a DecisionTree model...")
    started = time.time()

    tree = DecisionTreeRegressor(featuresCol="indexedFeatures")
    # Chain indexer and tree in a Pipeline
    indexer = data.feature_indexer(df_data)
    pipeline = Pipeline(stages=[indexer, tree])

    # Train model. This also runs the indexer.
    dtr_model = pipeline.fit(df_data)

    elapsed = time.time() - started
    print("dt_model using time: %.2fs\n" % elapsed)
    return dtr_model
def decisionTreeRegression(df, arguments):
    """Fit a DecisionTreeRegressor configured from optional arguments.

    :param df: DataFrame the regressor is fitted on.
    :param arguments: object with ``maxDepth``, ``minInstancesPerNode`` and
        ``impurity`` attributes; an attribute that is ``None`` falls back to
        5, 1 and ``"variance"`` respectively.
    :return: the fitted model.
    """
    from pyspark.ml.regression import DecisionTreeRegressor

    maxDepth = 5
    minInstancesPerNode = 1
    impurity = "variance"

    # NOTE(review): maxDepth and minInstancesPerNode are converted with
    # float() as in the original; presumably the estimator accepts integral
    # floats for these integer params -- confirm before switching to int().
    if arguments.maxDepth is not None:
        maxDepth = float(arguments.maxDepth)
    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = float(arguments.minInstancesPerNode)
    if arguments.impurity is not None:
        impurity = arguments.impurity

    dt = DecisionTreeRegressor(maxDepth=maxDepth,
                               minInstancesPerNode=minInstancesPerNode,
                               impurity=impurity)
    model = dt.fit(df)
    return model
def que1():
    """
    Tune several estimators and report areaUnderROC and accuracy for each.

    Three estimators are created (decision-tree classifier, decision-tree
    regressor, logistic regression). For each one a parameter grid is
    built and passed, together with a pipeline and an evaluator, to
    ``run_metric`` on the module-level ``s_train`` / ``s_test`` data; the
    best parameters and metric value it returns are printed.

    The regressor writes to ``prediction_c`` and is followed by a
    ``Binarizer`` (threshold 0.5) that produces the ``prediction`` column.
    """
    candidates = [
        DecisionTreeClassifier(seed=9008),
        DecisionTreeRegressor(predictionCol="prediction_c", seed=9008),
        LogisticRegression(),
    ]

    for position, estimator in enumerate(candidates):
        stages = [estimator]

        if position == 0:
            print("[*] DecisionTree Classifier")
            grid_builder = ParamGridBuilder()
            grid_builder = grid_builder.addGrid(estimator.maxDepth,
                                                [5, 10, 20])
            grid_builder = grid_builder.addGrid(estimator.maxBins, [16, 32])
            grid_builder = grid_builder.addGrid(estimator.impurity,
                                                ["gini", "entropy"])
            paramB = grid_builder.build()
            # NOTE(review): the classifier is skipped here, so its grid is
            # never evaluated - confirm this is intentional.
            continue

        if position == 1:
            print("[*] DecisionTree Regressor")
            grid_builder = ParamGridBuilder()
            grid_builder = grid_builder.addGrid(estimator.maxDepth,
                                                [5, 10, 20])
            grid_builder = grid_builder.addGrid(estimator.maxBins, [16, 32])
            grid_builder = grid_builder.addGrid(estimator.minInfoGain,
                                                [0.0, 0.25, 0.3])
            paramB = grid_builder.build()
            # Turn the continuous regression output into a 0/1 prediction.
            stages.append(
                Binarizer(threshold=0.5,
                          inputCol="prediction_c",
                          outputCol="prediction"))
        else:
            print("[*] Logistic Regression")
            grid_builder = ParamGridBuilder()
            grid_builder = grid_builder.addGrid(estimator.maxIter,
                                                [5, 10, 15])
            grid_builder = grid_builder.addGrid(estimator.regParam,
                                                [0.05, 0.1, 0.5])
            paramB = grid_builder.build()

        pipeline = Pipeline(stages=stages)

        print("[*] Running for areaUnderROC")
        roc_evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="prediction", metricName="areaUnderROC")
        bp, metric_roc = run_metric(s_train, s_test, pipeline, paramB,
                                    roc_evaluator)
        print("[*] Done for areaUnderROC")
        print("[*] Best Params: %s, AreaUnderROC value: %f" % (bp, metric_roc))

        print("[*] Running for accuracy")
        accuracy_evaluator = MulticlassClassificationEvaluator(
            predictionCol="prediction", metricName="accuracy")
        mp, metric_acc = run_metric(s_train, s_test, pipeline, paramB,
                                    accuracy_evaluator)
        print("[*] Done for accuracy")
        print("[*] Best Params: %s, Accuracy value: %f" % (mp, metric_acc))
# Score the first sepal model on the sepal data: R^2 first, then RMSE.
# The param map overrides the evaluator's metric and label column for
# these calls only.
sepalPredictions = sepalModels[0].transform(irisSepal)
print(regEval.evaluate(sepalPredictions,
                       {regEval.metricName: 'r2', regEval.labelCol: 'sepalWidth'}))
print(regEval.evaluate(sepalPredictions,
                       {regEval.metricName: 'rmse', regEval.labelCol: 'sepalWidth'}))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Regression with decision trees

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

# Decision-tree regressor predicting petalWidth; print its parameter help.
dtr = DecisionTreeRegressor().setLabelCol('petalWidth')
print(dtr.explainParams())

# COMMAND ----------

# Fit on irisPetal and score on the same data (R^2, then RMSE).
dtrModel = dtr.fit(irisPetal)
dtrPredictions = dtrModel.transform(irisPetal)
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC Let's also build a gradient boosted tree.

# COMMAND ----------
def spark_process(sqlContext, sc, validate, path_to_file):
    """
    Load taxi-trip CSV data, engineer features and fit a decision-tree
    regressor that predicts trip duration.

    Steps: read the CSV, keep the pickup/dropoff time, passenger count and
    coordinate columns, drop rows with missing values or coordinates
    outside the 40-41 latitude / -74 to -73 longitude box, derive
    ``time_delta`` (via ``time_delta_minutes``) and ``pick_up_hour``, keep
    trips with ``time_delta > 1.0``, assemble the feature vector and train
    a ``DecisionTreeRegressor(maxDepth=12, maxBins=25)``.

    :param sqlContext: SQLContext used for reading the file, clearing the
        cache and creating the result DataFrame
    :param sc: not used by this function
    :param validate: when truthy, train on a 60/40 split and report R^2;
        otherwise train on all data and return the predictions
    :param path_to_file: location of the CSV file to load
    :return: if ``validate`` is truthy, a tuple
        ``(test_predictions, r2_test, r2_train)`` where ``test_predictions``
        has the columns ``prediction``, ``label``, ``features``; otherwise a
        cached DataFrame with the columns ``prediction_mins``,
        ``pick_up_hour``, ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude``
    """
    ######################
    #
    # HDFS to DataFrame
    #
    ######################

    ## all fields:
    # ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance',
    #  'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude',
    #  'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
    #  'tolls_amount', 'total_amount']

    # Positions (in the list above) of the columns to keep.
    feature_columns = [1, 2, 3, 5, 6, 9, 10]

    customSchema = StructType([
        StructField("vendor_id", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", StringType(), True),
        StructField("trip_distance", StringType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("rate_code", StringType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
        StructField("payment_type", StringType(), True),
        StructField("fare_amount", StringType(), True),
        StructField("surcharge", StringType(), True),
        StructField("mta_tax", StringType(), True),
        StructField("tip_amount", StringType(), True),
        StructField("tolls_amount", StringType(), True),
        StructField("total_amount", StringType(), True)
    ])

    # NOTE(review): the schema is passed as a reader *option* rather than
    # through ``.schema(...)``; it looks like it may not actually be applied
    # to the loaded columns - confirm before relying on the declared types.
    dataframe = sqlContext.read.format('com.databricks.spark.csv') \
        .options(header='true', schema=customSchema) \
        .load(path_to_file)

    # Keep only the selected columns.
    dataframe = dataframe.select(
        *(dataframe.columns[n] for n in feature_columns))

    sqlContext.clearCache()

    ######################
    #
    # Preprocess data
    #
    ######################

    # Drop rows with null time/location fields, default a missing passenger
    # count to 1, and keep only locations near NYC.
    dataframe = dataframe.na.drop(
        how='any',
        subset=['pickup_datetime', 'dropoff_datetime',
                'pickup_longitude', 'pickup_latitude',
                'dropoff_longitude', 'dropoff_latitude']) \
        .fillna(1, subset=["passenger_count"]) \
        .filter(dataframe.pickup_latitude > 40.0) \
        .filter(dataframe.pickup_latitude < 41.0) \
        .filter(dataframe.pickup_longitude < -73.0) \
        .filter(dataframe.pickup_longitude > -74.0) \
        .filter(dataframe.dropoff_latitude > 40.0) \
        .filter(dataframe.dropoff_latitude < 41.0) \
        .filter(dataframe.dropoff_longitude < -73.0) \
        .filter(dataframe.dropoff_longitude > -74.0)

    ######################
    #
    # features engineering
    #
    ######################

    # New column with the pickup-to-dropoff time delta (minutes, per the
    # helper's name) and a column with the pickup hour.
    time_delta_udf = udf(time_delta_minutes, FloatType())

    dataframe = dataframe \
        .withColumn('time_delta',
                    time_delta_udf(dataframe.pickup_datetime,
                                   dataframe.dropoff_datetime)) \
        .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))

    dataframe = dataframe.select(dataframe.pick_up_hour,
                                 dataframe.passenger_count.cast("integer"),
                                 dataframe.pickup_longitude.cast("double"),
                                 dataframe.pickup_latitude.cast("double"),
                                 dataframe.dropoff_longitude.cast("double"),
                                 dataframe.dropoff_latitude.cast("double"),
                                 dataframe.time_delta.cast("double"))

    dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()

    # Build the feature vector and rename the target to "label".
    feature_assembler = VectorAssembler(
        inputCols=['pick_up_hour', 'pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude'],
        outputCol='features')

    transformed = feature_assembler.transform(dataframe)
    vector_dataframe = transformed.select(col("time_delta").alias("label"),
                                          col("features")).cache()

    ######################
    #
    # train model
    #
    ######################

    if validate:

        ################################
        #
        # validate model on 60/40 split
        #
        ################################

        training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)

        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(training)

        train_pred = model.transform(training)
        test_pred = model.transform(test)

        # One evaluator serves both splits; it is configured identically.
        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="r2")
        r2_train = evaluator.evaluate(train_pred)
        r2_test = evaluator.evaluate(test_pred)

        output = test_pred.select("prediction", "label", "features")

        return output, r2_test, r2_train

    else:

        ###################
        #
        # train on all data
        #
        ###################

        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(vector_dataframe)

        predictions = model.transform(vector_dataframe)
        output = predictions.select("prediction", "label", "features")

        ###########################
        #
        # process to send to Kafka
        #
        ###########################

        schema = StructType([
            StructField("prediction_mins", FloatType(), True),
            StructField("pick_up_hour", IntegerType(), True),
            StructField("pickup_longitude", DoubleType(), True),
            StructField("pickup_latitude", DoubleType(), True),
            StructField("dropoff_longitude", DoubleType(), True),
            StructField("dropoff_latitude", DoubleType(), True)
        ])

        # Flatten each row into (prediction, hour, four coordinates); the
        # feature order matches the assembler's inputCols above.
        # Go through ``.rdd`` explicitly: ``DataFrame.map`` only exists on
        # Spark 1.x, while ``.rdd.map`` works on 1.x and later.
        features_from_predictions = output.rdd.map(
            lambda row: (float(row.prediction),
                         int(row.features[0]),
                         float(row.features[1]),
                         float(row.features[2]),
                         float(row.features[3]),
                         float(row.features[4]))).collect()

        sqlContext.clearCache()

        dataframe_from_prediction_vector = sqlContext.createDataFrame(
            features_from_predictions, schema).cache()

        return dataframe_from_prediction_vector
# Feature pipeline for the taxi data: index + one-hot encode the string
# column, one-hot encode the ID-like columns, then assemble everything
# with the numeric columns into a single "features" vector.
categoricalColumns = ['store_and_fwd_flag']
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index",
                            outputCol=categoricalCol + "classVec")
    # Add stages. These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

#encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type','Peak_Time','weekend']
encColumns = ['VendorID', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type']
for eCol in encColumns:
    # These columns are encoded directly, without a StringIndexer step.
    encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol + "classVec")
    stages += [encoder]

#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount", "tip_amount"]
# List comprehensions instead of ``map(...) + map(...)``: on Python 3
# ``map`` returns an iterator, which cannot be concatenated with ``+``.
assemblerInputs = (
    [c + "classVec" for c in categoricalColumns]
    + [c + "classVec" for c in encColumns]
    + numericCols
)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# Fit the feature pipeline and apply it to the same training data.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)

from pyspark.ml.regression import DecisionTreeRegressor

# Train a decision tree predicting total_amount and persist it,
# overwriting any model already saved at that path.
dt = DecisionTreeRegressor(labelCol="total_amount", featuresCol="features", maxBins=32)
model = dt.fit(dataset)
model.write().overwrite().save("./nyc-01020304-6vm-18-DT-model")
from pyspark.ml.regression import GeneralizedLinearRegression

# Gaussian GLM with identity link; the link prediction is written to
# the "linkOut" column.
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
glrModel = glr.fit(df)

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

# Decision tree with default parameters.
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)

# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor

# Random forest and gradient-boosted trees, both with default parameters.
rf = RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)