def test_multi_target_random_forest():
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor

    X_train, X_test, Y_train, _ = train_test_split(
        *shap.datasets.linnerud(), test_size=0.2, random_state=0)
    est = RandomForestRegressor(random_state=202, n_estimators=10, max_depth=10)
    est.fit(X_train, Y_train)
    predicted = est.predict(X_test)

    explainer = shap.TreeExplainer(est)
    expected_values = np.asarray(explainer.expected_value)
    assert len(expected_values) == est.n_outputs_, \
        "Length of expected_values doesn't match n_outputs_"

    # Local accuracy: per-output SHAP values plus the expected value
    # must sum to the model's prediction for every sample.
    shap_values = np.asarray(explainer.shap_values(X_test)).reshape(
        est.n_outputs_ * X_test.shape[0], X_test.shape[1])
    phi = np.hstack((shap_values,
                     np.repeat(expected_values, X_test.shape[0]).reshape(-1, 1)))
    assert np.allclose(phi.sum(1), predicted.flatten(order="F"), atol=1e-4)

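# A minimal driver for the test above (a sketch; assumes shap, numpy, and
# scikit-learn are installed and that the test is run outside pytest):
if __name__ == "__main__":
    test_multi_target_random_forest()
    print("local accuracy holds for every output of the multi-target forest")
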
def spark_ml(train, test):
    # Product IDs present in test but absent from train
    diff_cat_in_train_test = test.select('Product_ID').subtract(
        train.select('Product_ID'))
    print(diff_cat_in_train_test.distinct().count())

    from pyspark.ml.feature import StringIndexer
    plan_indexer = StringIndexer(inputCol='Product_ID', outputCol='product_ID')
    labeller = plan_indexer.fit(train)
    Train1 = labeller.transform(train)
    Test1 = labeller.transform(test)
    Train1.show()

    from pyspark.ml.feature import RFormula
    formula = RFormula(
        formula="Purchase ~ Age + Occupation + City_Category + "
                "Stay_In_Current_City_Years + Product_Category_1 + "
                "Product_Category_2 + Gender",
        featuresCol="features", labelCol="label")
    t1 = formula.fit(Train1)
    train1 = t1.transform(Train1)
    test1 = t1.transform(Test1)
    train1.show()
    train1.select('features').show()
    train1.select('label').show()

    from pyspark.ml.regression import RandomForestRegressor
    rf = RandomForestRegressor()

    # Hold-out split for cross-validation of the model
    (train_cv, test_cv) = train1.randomSplit([0.7, 0.3])
    model1 = rf.fit(train_cv)
    predictions = model1.transform(test_cv)

    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    import numpy as np
    print(np.sqrt(mse), mse)

    # Refit on the full training data and score the held-out test set
    model = rf.fit(train1)
    predictions1 = model.transform(test1)
    df = predictions1.selectExpr("User_ID as User_ID",
                                 "Product_ID as Product_ID",
                                 "prediction as Purchase")
    df.toPandas().to_csv('submission.csv')

import argparse

from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


def main():
    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument(
        '--data', type=str,
        default="../../../data/sample_linear_regression_data.txt",
        help='Data location.')
    args = parser.parse_args()

    spark = SparkSession.builder.appName("Pyspark Training").getOrCreate()
    data = spark.read.format("libsvm").load(args.data)

    # Split the data into training and test sets (30% held out for testing)
    (train, test) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor()

    # Train model.  This also runs the indexer.
    model = rf.fit(train)

    # Make predictions.
    predictions = model.transform(test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

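# Hypothetical invocation of the script above (the file name is an assumption):
#   spark-submit pyspark_training.py --data ../../../data/sample_linear_regression_data.txt
if __name__ == "__main__":
    main()
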
def randomForestRegression(df, arguments):
    from pyspark.ml.regression import RandomForestRegressor

    # Defaults, overridden by any command-line arguments that were supplied
    maxDepth = 5
    minInstancesPerNode = 1
    numTrees = 20
    impurity = "variance"

    # These three parameters are integers, so cast with int(), not float()
    if arguments.maxDepth is not None:
        maxDepth = int(arguments.maxDepth)
    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = int(arguments.minInstancesPerNode)
    if arguments.numTrees is not None:
        numTrees = int(arguments.numTrees)
    if arguments.impurity is not None:
        impurity = arguments.impurity

    rf = RandomForestRegressor(numTrees=numTrees,
                               maxDepth=maxDepth,
                               minInstancesPerNode=minInstancesPerNode,
                               impurity=impurity)
    model = rf.fit(df)

    return model

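# Sketch of how randomForestRegression() could be driven from the command line;
# the flag names mirror the attributes read above but are otherwise assumptions:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--maxDepth')
parser.add_argument('--minInstancesPerNode')
parser.add_argument('--numTrees')
parser.add_argument('--impurity')
arguments = parser.parse_args(['--numTrees', '50', '--maxDepth', '8'])
# model = randomForestRegression(df, arguments)  # df needs "features"/"label" columns
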
def main():
    # Set bounds for random forest's hyperparameters
    hparams = [(2, 25),   # num_trees
               (2, 6),    # max_depth
               (15, 30)]  # max_bins

    # Run hyperparameter optimization using Gaussian processes
    optim_results = gp_minimize(objective, hparams, n_calls=20,
                                verbose=True, random_state=0)
    print('\nHyperparameter Optimization Results:')
    print('Best validation RMSE = {}'.format(optim_results.fun))

    # Get best hyperparameters from optimization
    num_trees = optim_results.x[0]
    max_depth = optim_results.x[1]
    max_bins = optim_results.x[2]

    # Instantiate a RandomForest model using best hyperparameter settings
    rf = RandomForestRegressor(numTrees=num_trees, maxDepth=max_depth,
                               maxBins=max_bins)

    # Train model.
    model = rf.fit(train)

    # Make predictions.
    predictions = model.transform(test)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print('\nFinal Results on Test Set with Optimized Hyperparameters:')
    print("Root Mean Squared Error on test set = %g" % rmse)

def _getBasePredictor(self, randomSeed):
    # Empty the base-data file before the SMAC evaluations append to it
    f = open(self._baseDataPath, "w")
    f.truncate()
    f.close()

    self._lowTrainData = self._trainData.sample(fraction=self._lowRatio,
                                                seed=randomSeed).cache()
    self._midTrainData = self._trainData.sample(fraction=self._midRatio,
                                                seed=randomSeed).cache()

    cs = self.getPCS()
    scenario = Scenario({
        "run_obj": "quality",
        "runcount-limit": self._BPDS,
        "cs": cs,
        "deterministic": "true"
    })

    # Optimize, using a SMAC-object
    smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
                tae_runner=self._baseEval)
    smac.optimize()

    df = self._spark.read.format("libsvm").load(self._baseDataPath)
    rf = RandomForestRegressor()
    rfModel = rf.fit(df)

    self._lowTrainData.unpersist()
    self._midTrainData.unpersist()
    return rfModel

def model_dev_rf(df_train, df_test, n_trees, max_bins, max_depth):
    rf_start_time = time()

    # Create an Initial Model Instance
    mod_rf = RandomForestRegressor(labelCol='label', featuresCol='features',
                                   impurity='variance',
                                   featureSubsetStrategy='all',
                                   numTrees=n_trees, maxBins=max_bins,
                                   maxDepth=max_depth)

    # Training The Model
    rf_final_model = mod_rf.fit(df_train)

    # Scoring The Model On Test Sample
    rf_transformed = rf_final_model.transform(df_test)
    rf_test_results = rf_transformed.select(['prediction', 'label'])

    # Collecting The Model Statistics
    rf_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label")
    rf_r2 = round(rf_evaluator.evaluate(rf_test_results,
                                        {rf_evaluator.metricName: "r2"}), 3)
    rf_mse = round(rf_evaluator.evaluate(rf_test_results,
                                         {rf_evaluator.metricName: "mse"}), 3)
    rf_rmse = round(rf_evaluator.evaluate(rf_test_results,
                                          {rf_evaluator.metricName: "rmse"}), 3)
    rf_mae = round(rf_evaluator.evaluate(rf_test_results,
                                         {rf_evaluator.metricName: "mae"}), 3)

    # Printing The Model Statistics
    print("\n++++++ Printing Random Forest Model Accuracy ++++++\n")
    print("R Square: " + str(rf_r2 * 100) + "%")
    print("Mean Squared Error: " + str(rf_mse))
    print("Root Mean Squared Error: " + str(rf_rmse))
    print("Mean Absolute Error: " + str(rf_mae))

    rf_end_time = time()
    rf_elapsed_time = (rf_end_time - rf_start_time) / 60

    rf_model_stat = pd.DataFrame({
        "Model Name": ["Random Forest"],
        "R Square": rf_r2,
        "Mean Squared Error": rf_mse,
        "Root Mean Squared Error": rf_rmse,
        "Mean Absolute Error": rf_mae,
        "Time (Min.)": round(rf_elapsed_time, 3)
    })

    rf_output = (rf_final_model, rf_model_stat)
    return rf_output

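# Hypothetical call, assuming df_train and df_test already carry "features"
# and "label" columns:
# rf_model, rf_stats = model_dev_rf(df_train, df_test,
#                                   n_trees=50, max_bins=32, max_depth=10)
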
def testRegression(train, test):
    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    rf = RandomForestRegressor(labelCol="indexedLabel", numTrees=3, maxDepth=4)
    model = rf.fit(train)

    # DataFrames have no .map(); drop to the underlying RDD for RegressionMetrics
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))
    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)

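# A DataFrame-native alternative to the RDD-based RegressionMetrics above
# (a sketch, assuming the same column names as testRegression):
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor


def testRegressionDF(train, test):
    rf = RandomForestRegressor(labelCol="indexedLabel", numTrees=3, maxDepth=4)
    model = rf.fit(train)
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(labelCol="indexedLabel",
                                    predictionCol="prediction")
    for metric in ("rmse", "r2", "mae"):
        print("%s %.3f" % (metric,
                           evaluator.evaluate(predictions,
                                              {evaluator.metricName: metric})))
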
def RF(trainingData, testData):
    """
    Random Forest Tree Regression Model
    :param trainingData:
    :param testData:
    :return: Trained model, predictions, nt (int), md (int)
    """
    nt, md = 120, 20
    rf = RandomForestRegressor(numTrees=nt, featureSubsetStrategy="auto",
                               impurity='variance', maxDepth=md, maxBins=100)
    model = rf.fit(trainingData)
    predictions = model.transform(testData)
    return model, predictions, nt, md

def RF(trainingData, testData, args):
    """
    Random Forest Tree Regression Model
    :param trainingData:
    :param testData:
    :param args:
    :return: Trained model, predictions, nt (int), md (int)
    """
    # Hyperparameters tuned per descriptor type
    if args.descriptor in ('CM', 'CMSE', 'Morgan2DCMSE'):
        nt, md = 50, 14
    elif args.descriptor in ('Morgan2D', 'Morgan2DSE', 'Morgan2DSEext'):
        nt, md = 120, 20
    else:
        raise ValueError('Unknown descriptor: {}'.format(args.descriptor))

    rf = RandomForestRegressor(numTrees=nt, featureSubsetStrategy="auto",
                               impurity='variance', maxDepth=md, maxBins=100)
    model = rf.fit(trainingData)
    predictions = model.transform(testData)
    return model, predictions, nt, md

def rfRegressor(df):
    # Move the 'price' column to the end so it can serve as the label
    df = df.withColumn('tmp_price', df['price'])
    df = df.drop('price')
    df = df.withColumnRenamed('tmp_price', 'price')

    feature_label = df.rdd.map(
        lambda x: (Vectors.dense([float(i) for i in x[0:-1]]), float(x[-1]))
    ).toDF(["features", "label"])

    (trainingData, testData) = feature_label.randomSplit([0.7, 0.3])
    rf = RandomForestRegressor()
    model = rf.fit(trainingData)
    importance_map_df = importance_features_map(df, model, 'price')

    # Make predictions.
    predictions = model.transform(testData)
    predict_df = predictions.select("prediction", "label")

    # Relative error of each prediction
    predict_df = predict_df.withColumn(
        'rate',
        (predict_df['prediction'] - predict_df['label']) / predict_df['label'])

    def udf_rate(s):
        return round(abs(s), 3)

    udf_rate = udf(udf_rate)
    predict_df = predict_df.select(
        '*', udf_rate(predict_df['rate']).alias('rates')).drop('rate')
    predict_df.show()

    # Persist the model and verify it reloads with identical behaviour
    model.save("/root/myModelPath1")
    sameModel = RandomForestRegressionModel.load("/root/myModelPath1")
    same_predict_df = sameModel.transform(testData)
    print('=======================================')
    same_predict_df.show()
    return importance_map_df, model

def featureAnalysis(self, etlStats, algoName):
    numericalFeatures = etlStats.get(PredictiveConstants.NUMERICALFEATURES)
    label = etlStats.get(PredictiveConstants.LABELCOLM)
    dataset = etlStats.get(PredictiveConstants.DATASET)
    featuresColm = etlStats.get(PredictiveConstants.FEATURESCOLM)
    indexedFeatures = etlStats.get(PredictiveConstants.INDEXEDFEATURES)
    maxCategories = etlStats.get(PredictiveConstants.MAXCATEGORIES)
    categoricalFeatures = etlStats.get(PredictiveConstants.CATEGORICALFEATURES)

    trainData, testData = dataset.randomSplit([0.80, 0.20], seed=40)

    keyStatsTest = ''
    statisticalTestResult = {}
    randomForestModel = None

    if algoName == PredictiveConstants.RANDOMREGRESSOR:
        # Pearson correlation between numerical features and the label
        statisticalTestObj = PredictiveStatisticalTest(
            dataset=dataset, features=numericalFeatures, labelColm=label)
        statisticalTestResult = statisticalTestObj.pearsonTest()
        randomForestModel = RandomForestRegressor(labelCol=label,
                                                  featuresCol=featuresColm,
                                                  numTrees=10)
        keyStatsTest = "pearson_test_data"

    if algoName == PredictiveConstants.RANDOMCLASSIFIER:
        # Chi-square test for the categorical features
        statisticalTestObj = PredictiveStatisticalTest(
            dataset=dataset, features=indexedFeatures, labelColm=label)
        statisticalTestResult = statisticalTestObj.chiSquareTest(
            categoricalFeatures=categoricalFeatures,
            maxCategories=maxCategories)
        randomForestModel = RandomForestClassifier(labelCol=label,
                                                   featuresCol=featuresColm,
                                                   numTrees=10)
        keyStatsTest = "ChiSquareTestData"

    randomForestModelFit = randomForestModel.fit(trainData)

    featureAnalysis = {
        PredictiveConstants.RANDOMFORESTMODEL: randomForestModelFit,
        PredictiveConstants.KEYSTATSTEST: keyStatsTest,
        PredictiveConstants.STATISTICALTESTRESULT: statisticalTestResult
    }
    return featureAnalysis

def randomForestRegressorModel(self):
    randomForestRegressorModelFit = \
        RandomForestRegressor(labelCol=self.labelColm,
                              featuresCol=self.featuresColm,
                              numTrees=10, predictionCol=self.modelSheetName)
    regressor = randomForestRegressorModelFit.fit(self.trainData)
    # predictionData = regressor.transform(self.testData)
    regressionStat = self.randomGradientRegressionModelEvaluation(regressor=regressor)

    # Persisting model
    modelName = "randomForestModel"
    extension = ".parquet"
    modelStorageLocation = self.locationAddress + self.userId.upper() + \
        modelName.upper() + extension
    regressor.write().overwrite().save(modelStorageLocation)
    regressionStat["modelPersistLocation"] = {
        "modelName": modelName,
        "modelStorageLocation": modelStorageLocation
    }
    return regressionStat

def objective(hparams):
    """
    Objective function to be minimized: model validation RMSE loss as a
    function of our model hyperparameters.

    Parameters:
    ----------
    * `hparams` [list]
        Hyperparameter settings determined by Bayesian optimization loop.

    Returns:
    -------
    * `rmse` [float]
        Root mean squared error on the validation set

    Reference:
    ---------
    Bayesian optimization with Scikit-Optimize:
    https://scikit-optimize.github.io/
    """
    # New hyperparameter settings from Bayesian optimization
    num_trees, max_depth, max_bins = hparams

    # Instantiate a RandomForest model.
    rf = RandomForestRegressor(numTrees=num_trees, maxDepth=max_depth,
                               maxBins=max_bins)

    # Train model.
    model = rf.fit(train)

    # Make predictions.
    predictions = model.transform(val)

    # Select (prediction, true label) and compute validation error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    # print('Validation RMSE: {}'.format(rmse))
    return rmse

def build_random_forest_regressor_model(observation_df, feature_columns):
    # Create new column with all of the features
    vector_observation_df = create_feature_column(observation_df, feature_columns,
                                                  ['features', 'duration_sec'])
    train_df, test_df = vector_observation_df.randomSplit([0.7, 0.3])

    rf = RandomForestRegressor(featuresCol='features', labelCol='duration_sec')
    rfr_model = rf.fit(train_df)

    test_predictions = rfr_model.transform(test_df)
    test_predictions.select("prediction", "duration_sec", "features").show(5)

    # Regression metric: a classification evaluator cannot score a regressor
    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="rmse")
    print("RMSE on test data = %g" % evaluator.evaluate(test_predictions))
    # test_result = rfr_model.evaluate(test_df)
    return rfr_model

def random_forest_regressor_example():
    from numpy import allclose

    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
    model = rf.fit(df)
    model.featureImportances                 # SparseVector(1, {0: 1.0})
    allclose(model.treeWeights, [1.0, 1.0])  # True

    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    model.transform(test0).head().prediction  # 0.0
    model.numFeatures                         # 1
    model.trees        # [DecisionTreeRegressionModel (uid=...) of depth..., ...]
    model.getNumTrees  # 2

    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    model.transform(test1).head().prediction  # 0.5

    # Round-trip the estimator and the fitted model through disk
    temp_path = "./"
    rfr_path = temp_path + "/rfr"
    rf.save(rfr_path)
    rf2 = RandomForestRegressor.load(rfr_path)
    rf2.getNumTrees()  # 2
    model_path = temp_path + "/rfr_model"
    model.save(model_path)
    model2 = RandomForestRegressionModel.load(model_path)
    model.featureImportances == model2.featureImportances

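# Hypothetical driver for the example above:
if __name__ == "__main__":
    random_forest_regressor_example()
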
def randomForestRegression(self, regressionInfo):
    etlStats = self.etlOperation(etlInfo=regressionInfo)

    featuresColm = etlStats.get(PredictiveConstants.FEATURESCOLM)
    modelName = regressionInfo.get(PredictiveConstants.MODELSHEETNAME)
    labelColm = etlStats.get(PredictiveConstants.LABELCOLM)
    trainData = etlStats.get(PredictiveConstants.TRAINDATA)
    locationAddress = regressionInfo.get(PredictiveConstants.LOCATIONADDRESS)
    modelId = regressionInfo.get(PredictiveConstants.MODELID)

    randomForestRegressorModelFit = \
        RandomForestRegressor(labelCol=labelColm, featuresCol=featuresColm,
                              numTrees=10, predictionCol=modelName)
    regressor = randomForestRegressorModelFit.fit(trainData)
    # predictionData = regressor.transform(self.testData)
    regressionStat = self.regressionEvaluation(regressor=regressor,
                                               regressionInfo=regressionInfo,
                                               etlStats=etlStats)

    # Persisting model
    modelNameLocal = "randomForestModel"
    extension = ".parquet"
    modelStorageLocation = locationAddress + modelId.upper() + \
        modelNameLocal.upper() + extension
    regressor.write().overwrite().save(modelStorageLocation)
    regressionStat["modelPersistLocation"] = {
        "modelName": modelNameLocal,
        "modelStorageLocation": modelStorageLocation
    }
    return regressionStat

df = df.selectExpr("fare_amount as label", 'pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude', 'passenger_count')
new_df = vecAssembler.setHandleInvalid("skip").transform(df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = new_df.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestRegressor()

# Train model.
start_time = datetime.now()
model = rf.fit(trainingData)
time_elapsed = datetime.now() - start_time
print('TIME OF RANDOM FOREST TRAINING (hh:mm:ss.ms) {}'.format(time_elapsed))

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)

def randomClassifier(dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        # dataset = spark.read.parquet(dataset_add)
        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=';')
        dataset.show()

        label = ''
        for y in label_colm:
            label = y
        print(label)

        # Split the schema into string and numerical feature columns
        schemaDataset = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in schemaDataset:
            if str(x.dataType) == "StringType":
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)
        print(stringFeatures)
        print(numericalFeatures)

        # Summary statistics (plus variance) for every numerical feature
        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(dataset.select(colm).summary(value).toPandas()[colm])
                summaryListTemp.append(summ)
            varianceListTemp = list(
                dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
            summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        print(summaryDict)

        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        # Pearson correlation between the numerical features and the label
        response_pearson_test = Correlation_test_imp(
            dataset=dataset, features=numericalFeatures, label_col=label)

        # Index every string feature
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        print(indexed_features)

        # Combine numerical and indexed features into a single vector
        final_features = numericalFeatures + indexed_features
        print(final_features)
        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()

        # Flag low-cardinality vector slots as categorical
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features),
               ", ".join(str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # Preparing the finalized data
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        finalized_data.show()

        # Splitting the dataset into training and testing
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)
        rf = RandomForestRegressor(labelCol=label,
                                   featuresCol='vec_indexed_features',
                                   numTrees=10)

        # Train model.  This also runs the indexers.
        model = rf.fit(train_data)

        # Make predictions.
        predictions = model.transform(test_data)

        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {
            'feature_importance': feature_importance,
            "feature_column": features_column_for_user
        }
        response_dict = {
            'feature_importance': feature_imp,
            'pearson_test_data': response_pearson_test,
            'summaryDict': summaryDict
        }
        return response_dict

    except Exception as e:
        print("exception is = " + str(e))

###
from pyspark.ml.feature import PCA

print("pca")
df = PCA(k=300, inputCol="tfidf", outputCol="pca") \
    .fit(df).transform(df).select("pca", "overall")
df.show()
# df.show(truncate=False)

###
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(numTrees=50, maxDepth=5, seed=42,
                           labelCol='overall', featuresCol='pca',
                           predictionCol='prediction')
model = rf.fit(df)
pred = model.transform(df)
pred.show()

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="overall", predictionCol="prediction")
print("r2", evaluator.evaluate(pred, {evaluator.metricName: "r2"}))
print("mse", evaluator.evaluate(pred, {evaluator.metricName: "mse"}))

def _train_model_spark(self, data):
    df = self._prepare_data_spark(data)
    input_num = len(data.keys().difference({self.CHANGE_AMOUNT,
                                            self.CHANGE_DIRECTION,
                                            self.TARGET_PRICE,
                                            self.TODAY_PRICE}))

    if self.ann_hidden_nodes_num is None:
        # Integer division: layer sizes must be ints
        self.ann_hidden_nodes_num = input_num // 2 + 1
    ann_layers = [input_num,
                  # input_num // 3 * 2,
                  # input_num // 3,
                  self.ann_hidden_nodes_num,
                  2]
    self.logger.info('layer settings are {}'.format(ann_layers))
    self.logger.info('training method is {}'.format(self._train_method))
    self.logger.info('trees num is {}'.format(self.random_forest_tree_number))

    if isinstance(self._train_method, dict):
        # Two models: change amount (regression) and direction (classification)
        if self._model is not None and \
                self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            self._model[self.CHANGE_AMOUNT].stop_server()
        self._model = {self.CHANGE_AMOUNT: None,
                       self.CHANGE_DIRECTION: None}

        if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                  maxIter=self.linear_regression_training_times,
                                  regParam=self.linear_regression_regularization_parameter,
                                  predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = lr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features",
                                        labelCol=self.CHANGE_AMOUNT,
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth,
                                        predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(
                layers=ann_layers, spark=self._spark,
                num_workers=self.spark_worker_numbers,
                epoch=self.ann_epoch_number,
                featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT].fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

        if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
            lr = LogisticRegression(featuresCol="features",
                                    labelCol=self.CHANGE_DIRECTION,
                                    maxIter=self.logistic_regression_training_times,
                                    regParam=self.linear_regression_regularization_parameter,
                                    predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = lr.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
            rfc = RandomForestClassifier(featuresCol="features",
                                         labelCol=self.CHANGE_DIRECTION,
                                         numTrees=self.random_forest_tree_number,
                                         maxDepth=self.random_forest_tree_max_depth,
                                         predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 2
            mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                  labelCol=self.CHANGE_DIRECTION,
                                                  layers=ann_layers,
                                                  predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))
    else:
        # Single model predicting the target price directly
        if self._train_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE,
                                  predictionCol='prediction',
                                  regParam=self.linear_regression_regularization_parameter,
                                  maxIter=self.linear_regression_training_times)
            self._model = lr.fit(df)
        elif self._train_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features",
                                        labelCol=self.TARGET_PRICE,
                                        predictionCol='prediction',
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth)
            self._model = rfr.fit(df)
        elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            if self._model is not None:
                self._model.stop_server()
            self.logger.warn('layers are {}'.format(ann_layers))
            self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                  num_workers=self.spark_worker_numbers,
                                                  epoch=100,
                                                  featuresCol="features",
                                                  labelCol=self.TARGET_PRICE,
                                                  predictionCol='prediction')
            self._model.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

    return self._model

run.log("Max Bins", maxBins) run.log("Number of Trees", numTrees) run.log('Subsampling Rate', subsamplingRate) run.log_list("Feature Columns", feature_cols) ############### # TRAIN MODEL # ############### print(" * Training {0} model".format(model_name)) # Instantiate New RandomForestRegressor Object rf = RandomForestRegressor(labelCol='duration_minutes', maxDepth=maxDepth, maxBins=maxBins, impurity='variance', subsamplingRate=1.0, seed=random_seed, numTrees=numTrees, featureSubsetStrategy='auto') # Train model on transformed training data rf_model = rf.fit(trainDF_transformed) rf_full_model = feature_model.copy() rf_full_model.stages.append(rf_model) print(" * Model trained, scoring validation data") # Run the full model (feature steps and trained model) validation_scored = rf_full_model.transform(validDF) ##################### # MODEL PERFORMANCE # ##################### print(" * Calculating performance metrics") # Calculate Regression Performance rmse = evaluator.evaluate(validation_scored, {evaluator.metricName: "rmse"})
# pyspark.sql.functions.round is needed to round a Column (it shadows the builtin)
from pyspark.sql.functions import round

# Drop nulls
modelDF = assembler.transform(df7.dropna())

# Cast label to double for regression model
modelDF = modelDF.withColumn("label", modelDF["count"].cast("double"))

# Split train and test set
(train, test) = modelDF.randomSplit([0.8, 0.2])

# Build Random Forest Model
rf_mod = RandomForestRegressor(featuresCol="features", labelCol="label",
                               numTrees=100, maxDepth=4, maxBins=40)
fitted = rf_mod.fit(train)

# Get predictions for the test set, rounded to integer values because the label is a count.
predictions = fitted.transform(test)
predictions = predictions.withColumn("predictions", round(predictions.prediction, 0))

evaluator = RegressionEvaluator(predictionCol="predictions", labelCol="label",
                                metricName="r2")
pred = predictions.select("label", "predictions", "features").toPandas()

# Save predictions
pred.to_csv('gupta_3_predictions.csv', index=False)
with open('gupta_3.txt', 'w') as output:
    output.write("Test R-Squared = " + str(evaluator.evaluate(predictions)))

    importance_map_df = pd.DataFrame(arr.T,
                                     columns=['X_columns', 'importances_values'])
    return importance_map_df


start = time.time()
parquet_path = '/user/limeng/data/ganji_daxing.parquet'
df, columns_list = read_parquet(parquet_path)
print('=====================')
df.show()

(trainingData, testData) = df.randomSplit([0.7, 0.3])
rf = RandomForestRegressor(numTrees=20, maxDepth=15, impurity="variance")
print('model_train_start======================')
model = rf.fit(trainingData)
model.save('/user/limeng/data/ganji_daxing_RF_model')
# model = RandomForestRegressionModel.load('/user/limeng/data/fangtianxia_daxing_RF_model')

predict_value = model.transform(testData)
print('predict==============')
predict_value.show(truncate=False)

# Residual rate: |label - prediction| / label, sorted worst-first
predict_value_rate = predict_value.rdd.map(
    lambda x: (x[1], x[2], abs(x[1] - x[2]) / x[1])).toDF(
    ['label', 'prediction', 'residual_rate'])
print('predict_value_rate-----------------------------')
predict_value_rate = predict_value_rate.sort("residual_rate", ascending=False)
predict_value_rate.write.mode("overwrite").options(
    header="true").csv('/user/limeng/fangtianxia_daxing_predict_result')

# Decision Tree: maxDepth must be an integer
r2_dtr = np.zeros(10)
for i in np.arange(10):
    dtr = DecisionTreeRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3)
    dtrModel = dtr.fit(sample)
    prediction_dtr = dtrModel.transform(sample)
    r2_dtr[i] = evaluator.evaluate(prediction_dtr)
plt.plot(np.arange(3, 33, 3), r2_dtr)
# so choose 10 as the maxDepth

# In[108]:

# Random Forest
r2_rfr = np.zeros(10)
for i in np.arange(10):
    rfr = RandomForestRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3)
    rfrModel = rfr.fit(sample)
    prediction_rfr = rfrModel.transform(sample)
    r2_rfr[i] = evaluator.evaluate(prediction_rfr)
plt.plot(np.arange(3, 33, 3), r2_rfr)
# so select 10 as maxDepth

# In[109]:

# Gradient Boosted Trees: maxIter must also be an integer
r2_gbt = np.zeros(10)
for i in np.arange(10):
    gbt = GBTRegressor(labelCol='mean_temp', maxIter=(i + 1) * 10)
    gbtModel = gbt.fit(sample)
    prediction_gbt = gbtModel.transform(sample)
    r2_gbt[i] = evaluator.evaluate(prediction_gbt)
plt.plot(np.arange(10, 105, 10), r2_gbt)

# In[160]:

pred_list = []
for i in range(pred_num_period):
    va = VectorAssembler(outputCol='features',
                         inputCols=train_spark_df.columns[:-pred_num_period])
    label_col = 'pred_period_%d' % i
    train_va = va.transform(train_spark_df).select('features', label_col) \
        .withColumnRenamed(label_col, 'label').cache()
    val_va = va.transform(val_spark_df).select('features', label_col) \
        .withColumnRenamed(label_col, 'label').cache()
    # Force materialization of the cached DataFrames
    train_va.count()
    val_va.count()

    rf = RandomForestRegressor(maxDepth=10, numTrees=10, maxBins=128)
    rfmodel = rf.fit(train_va)
    pred_val = rfmodel.transform(val_va)
    pred_list.append(pred_val.select('prediction').rdd.map(lambda x: x[0]).collect())

    evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction',
                                    metricName="rmse")
    accuracy = evaluator.evaluate(pred_val)
    print('RMSE for period %d: %.4f' % (i + 1, accuracy))

# In[161]:

pred = np.stack(pred_list, axis=1)
sc.stop()

final_data.head(1)

# # split train/test

# In[ ]:

train_data, test_data = final_data.randomSplit([0.7, 0.3])

# # Model training

# In[ ]:

from pyspark.ml.regression import RandomForestRegressor

model = RandomForestRegressor(numTrees=100)
model = model.fit(train_data)

# # model evaluation

# In[ ]:

model.featureImportances

# In[ ]:

from pyspark.ml.evaluation import RegressionEvaluator

# In[ ]:

test_results = model.transform(test_data)

sparkConf = SparkConf().setAppName("Yapay Ogrenme").setMaster("local[*]")
sc = SparkContext(conf=sparkConf)
spark = SparkSession.builder.appName("Yapay Ogrenme SQL").getOrCreate()
sc.setLogLevel("ERROR")

df = spark.read.format("csv").option("header", "true").option(
    "inferSchema", "true").csv("realestate.csv")
df.printSchema()
print("--------")

# Replace nulls with 0 in every column except the first
df = df.na.fill(0, df.columns[1:])
dfR = df.drop("transactiondate").withColumnRenamed("logerror", "label")

vecAssembler = VectorAssembler(inputCols=dfR.columns[1:-1], outputCol="features")
dfWithFeatures = vecAssembler.transform(dfR)

(trainingData, testData) = dfWithFeatures.randomSplit([0.7, 0.3])
trainingData.show()

rf = RandomForestRegressor(featuresCol="features", labelCol="label",
                           predictionCol="prediction")
model = rf.fit(trainingData)
predictionsDF = model.transform(testData)
predictionsDF.drop("features").write.option("header", "true").csv("test.csv")

    outputCol='Features')
transformedEcommerceData = assembler.transform(ecommerceData)
transformedEcommerceData.show()

# %%
# Prepare the data for the model: keep only the features and the column we're trying to predict
finalData = transformedEcommerceData.select('Features', 'Yearly Amount Spent')
finalData.show()

# %%
# Split the data by randomly choosing 70% of the rows for training and 30% for testing
trainingData, testingData = finalData.randomSplit([0.7, 0.3])

# %%
# Random Forest Regression
randomForest = RandomForestRegressor(featuresCol="Features",
                                     labelCol="Yearly Amount Spent",
                                     maxDepth=15, maxBins=32, numTrees=200)
randomForestModel = randomForest.fit(trainingData)
rfresults = randomForestModel.transform(testingData)
rfresults.select("prediction", "Yearly Amount Spent", "Features").show()

# Using RMSE to evaluate the model
rfEvaluator = RegressionEvaluator(labelCol="Yearly Amount Spent",
                                  predictionCol="prediction",
                                  metricName="rmse")
rfRmse = rfEvaluator.evaluate(rfresults)
print("Random Forest RMSE: ", rfRmse)

from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

random_forest = RandomForestRegressor()

param_Grd = (ParamGridBuilder()
             .addGrid(random_forest.maxDepth, [2, 4, 6, 8])
             .addGrid(random_forest.maxBins, [20, 60])
             .addGrid(random_forest.numTrees, [5, 20, 50, 100])
             .build())

t_v_s = TrainValidationSplit(estimator=random_forest,
                             estimatorParamMaps=param_Grd,
                             evaluator=RegressionEvaluator(),
                             trainRatio=0.8)

# Fit via the validation split so the parameter grid is actually searched
rfModel = t_v_s.fit(trainingData)

# COMMAND ----------

pred = rfModel.transform(trainingData)
select_cols = ["label", "prediction", "time_window"]
pred = pred.select(select_cols)
display(pred)

# COMMAND ----------

import numpy as ny


def Mean_Absolute_Percentage_Error(labl, predction):
    labl, predction = ny.array(labl), ny.array(predction)

# List comprehensions instead of map(): in Python 3, map objects cannot be
# concatenated with +
assemblerInputs = [c + "classVec" for c in categoricalColumns] + \
                  [c + "classVec" for c in encColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)

from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(numTrees=4, featuresCol="features",
                           labelCol='total_amount', maxDepth=2, seed=42)
rf_model = rf.fit(dataset)
rf_model.write().overwrite().save("./nyc-01020304-6vm-18-RF-model")

import sys
sys.exit(0)

# Dead draft below, never reached after sys.exit(0)
"""
from pyspark.ml.feature import VectorAssembler

#vectorAssembler = VectorAssembler(inputCols = ['key', 'passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'fare_amount')
#newDF_test1=df_test1.withColumn('Travel_Distance',fun_dist_udf(df_test1["pickup_latitude"],df_test1["pickup_longitude"],df_test1["dropoff_latitude"],df_test1["dropoff_longitude"]))
#vectorAssembler = VectorAssembler(inputCols = ['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vectorAssembler = VectorAssembler(inputCols = ['passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(train_X4)
vhouse_df = vhouse_df.select(['features', 'fare_amount'])
vhouse_df.show(3)

from pyspark.ml.regression import RandomForestRegressor
"""

stages += [encoder]
#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount", "tip_amount"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + \
                  [c + "classVec" for c in encColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)

from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(numTrees=4, featuresCol="features",
                           labelCol='total_amount', maxDepth=2, seed=42)
rf_model = rf.fit(dataset)
rf_model.write().overwrite().save("./nyc-01020304-6vm-18-RF-model")

import sys
sys.exit(0)

# Dead draft below, never reached after sys.exit(0)
"""
from pyspark.ml.feature import VectorAssembler

#vectorAssembler = VectorAssembler(inputCols = ['key', 'passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'fare_amount')
#newDF_test1=df_test1.withColumn('Travel_Distance',fun_dist_udf(df_test1["pickup_latitude"],df_test1["pickup_longitude"],df_test1["dropoff_latitude"],df_test1["dropoff_longitude"]))
#vectorAssembler = VectorAssembler(inputCols = ['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vectorAssembler = VectorAssembler(inputCols = ['passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(train_X4)
vhouse_df = vhouse_df.select(['features', 'fare_amount'])
vhouse_df.show(3)
"""

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)

# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor

rf = RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)

gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity")
pipeline = Pipeline().setStages([glr])
params = ParamGridBuilder().addGrid(glr.regParam, [0, 0.5, 1]).build()
evaluator = RegressionEvaluator()\