# ---------------------------------------------------------------------------
# Decision tree: attach the parameter grid, cross-validate, and evaluate.
# ---------------------------------------------------------------------------
crossVal.setEstimatorParamMaps(paramGrid)
dtModel = crossVal.fit(trainSetDF).bestModel

# Score the held-out set, keeping the feature columns alongside the label
# and the prediction so the rows remain easy to inspect.
predictionsAndLabelsDF = dtModel.transform(testSetDF).select(
    "Atmospheric_Temperature",
    "Vacuum_Speed",
    "Atmospheric_Pressure",
    "Relative_Humidity",
    "Power_Output",
    "Predicted_PE",
)

rmseDT = regEval.evaluate(predictionsAndLabelsDF)
r2DT = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})

# ---------------------------------------------------------------------------
# Random forest: configure the estimator, cross-validate, and evaluate.
# ---------------------------------------------------------------------------
rf = RandomForestRegressor()
(
    rf.setLabelCol("Power_Output")
    .setPredictionCol("Predicted_PE")
    .setFeaturesCol("features")
    .setSeed(100088121)
    .setMaxDepth(8)
    .setNumTrees(30)
)

rfPipeline = Pipeline()
rfPipeline.setStages([vectorizer, rf])
crossVal.setEstimator(rfPipeline)

# Tune the rf.maxBins parameter over the values 50 and 100.
paramGrid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
crossVal.setEstimatorParamMaps(paramGrid)

rfModel = crossVal.fit(trainSetDF).bestModel

predictionsAndLabelsDF = rfModel.transform(testSetDF).select(
    "Atmospheric_Temperature",
    "Vacuum_Speed",
    "Atmospheric_Pressure",
    "Relative_Humidity",
    "Power_Output",
    "Predicted_PE",
)

rmseRF = regEval.evaluate(predictionsAndLabelsDF)
r2RF = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})
def _set_dataFrameModel(self, _type, _SLA, data, vecAssembler):
    """Build, cross-validate, and store the best Spark ML pipeline model.

    Configures a random-forest estimator (regressor or classifier depending
    on ``_type``), wires it into a Pipeline after ``vecAssembler``, grid-
    searches ``numTrees`` via cross-validation, and stores the winning
    fitted pipeline on ``self._dataFrameModel``.

    Parameters
    ----------
    _type : str
        ``'regression'`` or anything else for classification.
    _SLA : str
        Algorithm selector; only ``'randomForest'`` is handled here.
    data : pyspark.sql.DataFrame
        Training data fed to the cross validator.
    vecAssembler : VectorAssembler
        Feature-assembly stage placed ahead of the estimator.
    """
    if _type == 'regression':
        if _SLA == 'randomForest':
            rf = RandomForestRegressor()
            # Fixed: the original chain called setProbabilityCol("proba"),
            # but regressors expose no probability column and that call
            # fails at runtime.  Also fixed: 100088121L is a Python 2 long
            # literal and a syntax error under Python 3.
            rf.setLabelCol(self.targetVariable)\
              .setPredictionCol("prediction")\
              .setFeaturesCol("features")\
              .setSeed(100088121)\
              .setMaxDepth(int(self.sparkOptions[1]))\
              .setMaxMemoryInMB(10000)\
              .setFeatureSubsetStrategy(self.sparkOptions[5])
            self._regEval = RegressionEvaluator(
                predictionCol="prediction",
                labelCol=self.targetVariable,
                metricName="rmse")
    else:  # classification
        if _SLA == 'randomForest':
            rf = RandomForestClassifier(
                labelCol=self.targetVariable,
                featuresCol="features",
                maxDepth=int(self.sparkOptions[1]),
                featureSubsetStrategy=self.sparkOptions[5],
                impurity=self.sparkOptions[2],
                probabilityCol="proba")
            # NOTE(review): `goodClass` is not defined in this method --
            # presumably a global or should be self.goodClass; confirm.
            # Fixed: the evaluator was stored on self.regEval while the
            # CrossValidator below reads self._regEval, so the
            # classification evaluator was never actually used.
            if goodClass != '':
                self._regEval = BinaryClassificationEvaluator(
                    labelCol=self.targetVariable,
                    metricName="areaUnderROC")
            else:
                self._regEval = MulticlassClassificationEvaluator(
                    labelCol=self.targetVariable,
                    predictionCol="prediction",
                    metricName="accuracy")
            # Keep the old public attribute in sync for any external reader.
            self.regEval = self._regEval

    # Assemble the Pipeline: feature assembly, then the estimator.
    self._pipeline = Pipeline()
    self._pipeline.setStages([vecAssembler, rf])

    # Grid-search numTrees over the comma-separated values in sparkOptions[4].
    self._paramGrid = (ParamGridBuilder().addGrid(
        rf.numTrees,
        [int(num) for num in self.sparkOptions[4].split(',')]).build())

    # NOTE(review): numFolds is fed from self.nbSamples -- the name suggests
    # a sample count rather than a fold count; verify the intent.
    self._crossval = CrossValidator(estimator=self._pipeline,
                                    estimatorParamMaps=self._paramGrid,
                                    evaluator=self._regEval,
                                    numFolds=self.nbSamples)

    # Fit and keep the best pipeline model.
    self._dataFrameModel = self._crossval.fit(data).bestModel

    # NOTE(review): this persists the *unfitted* estimator's parameters,
    # not the trained model.  If the trained forest is wanted, save
    # self._dataFrameModel.stages[-1] instead -- confirm before changing,
    # as downstream loaders may expect estimator params at this path.
    rf.save("/home/t752887/python/myModelPath/SPARK_RF_R_" +
            str(self.sparkModelsId[0]))