Code Example #1
File: pipeline.py  Project: rageshn/spark
# Add grid to cross validator & get the best model
crossVal.setEstimatorParamMaps(paramGrid)
dtModel = crossVal.fit(trainSetDF).bestModel

predictionsAndLabelsDF = dtModel.transform(testSetDF).select(
    "Atmospheric_Temperature", "Vacuum_Speed", "Atmospheric_Pressure",
    "Relative_Humidity", "Power_Output", "Predicted_PE")
rmseDT = regEval.evaluate(predictionsAndLabelsDF)
r2DT = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})

# print(rmseDT)
# print(r2DT)


# Create Random Forest
rf = RandomForestRegressor()
rf.setLabelCol("Power_Output").setPredictionCol("Predicted_PE").setFeaturesCol("features").setSeed(100088121).setMaxDepth(8).setNumTrees(30)
rfPipeline = Pipeline()

rfPipeline.setStages([vectorizer, rf])

crossVal.setEstimator(rfPipeline)

# Tune the rf.maxBins parameter on the values 50 and 100, create a parameter grid using the ParamGridBuilder
paramGrid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
crossVal.setEstimatorParamMaps(paramGrid)
rfModel = crossVal.fit(trainSetDF).bestModel

predictionsAndLabelsDF = rfModel.transform(testSetDF).select(
    "Atmospheric_Temperature", "Vacuum_Speed", "Atmospheric_Pressure",
    "Relative_Humidity", "Power_Output", "Predicted_PE")
rmseRF = regEval.evaluate(predictionsAndLabelsDF)
r2RF = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})
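
The snippet above reuses several objects (vectorizer, regEval, crossVal, trainSetDF, testSetDF) that pipeline.py defines earlier. A minimal setup sketch, with assumed column names, split sizes, and a DecisionTreeRegressor as the initial estimator (the source DataFrame name powerPlantDF is also an assumption), could look like this:

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Assemble the four sensor readings into a single feature vector (assumed columns)
vectorizer = VectorAssembler(
    inputCols=["Atmospheric_Temperature", "Vacuum_Speed",
               "Atmospheric_Pressure", "Relative_Humidity"],
    outputCol="features")

# Evaluator shared by the decision-tree and random-forest runs (RMSE by default)
regEval = RegressionEvaluator(
    predictionCol="Predicted_PE", labelCol="Power_Output", metricName="rmse")

# Decision tree wrapped in a pipeline and cross-validated first
dt = DecisionTreeRegressor(labelCol="Power_Output",
                           predictionCol="Predicted_PE",
                           featuresCol="features")
dtPipeline = Pipeline(stages=[vectorizer, dt])
paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [2, 3]).build()
crossVal = CrossValidator(estimator=dtPipeline, evaluator=regEval, numFolds=3)

# Illustrative 80/20 train/test split of the source DataFrame
trainSetDF, testSetDF = powerPlantDF.randomSplit([0.8, 0.2], seed=100088121)

With that setup in place, the setEstimator and setEstimatorParamMaps calls in the snippet swap in the random forest pipeline and its grid while reusing the same evaluator and folds.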
Code Example #2
    def _set_dataFrameModel(self, _type, _SLA, data, vecAssembler):

        if _type == 'regression':
            if _SLA == 'randomForest':
                rf = RandomForestRegressor()
                # Configure label/feature columns, seed, depth, memory budget and
                # feature subsetting (a regressor exposes no probability column)
                rf.setLabelCol(self.targetVariable)\
                  .setPredictionCol("prediction")\
                  .setFeaturesCol("features")\
                  .setSeed(100088121)\
                  .setMaxDepth(int(self.sparkOptions[1]))\
                  .setMaxMemoryInMB(10000)\
                  .setFeatureSubsetStrategy(self.sparkOptions[5])
                self._regEval = RegressionEvaluator(
                    predictionCol="prediction",
                    labelCol=self.targetVariable,
                    metricName="rmse")

        else:  #classification
            if _SLA == 'randomForest':
                rf = RandomForestClassifier(
                    labelCol=self.targetVariable,
                    featuresCol="features",
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2],
                    probabilityCol="proba")
                # goodClass is presumed defined elsewhere in the project; a non-empty
                # value selects binary (ROC) evaluation, otherwise multiclass accuracy
                if goodClass != '':
                    self._regEval = BinaryClassificationEvaluator(
                        labelCol=self.targetVariable,
                        metricName="areaUnderROC")
                else:
                    self._regEval = MulticlassClassificationEvaluator(
                        labelCol=self.targetVariable,
                        predictionCol="prediction",
                        metricName="accuracy")

        # Create a Pipeline
        self._pipeline = Pipeline()
        # Set the stages of the Pipeline #vecAssembler
        self._pipeline.setStages([vecAssembler, rf])
        # GridSearch
        self._paramGrid = (ParamGridBuilder().addGrid(
            rf.numTrees,
            [int(num) for num in self.sparkOptions[4].split(',')]).build())
        # Add the grid to the CrossValidator
        self._crossval = CrossValidator(estimator=self._pipeline,
                                        estimatorParamMaps=self._paramGrid,
                                        evaluator=self._regEval,
                                        numFolds=self.nbSamples)
        # Now let's find and return the best model
        self._dataFrameModel = self._crossval.fit(data).bestModel

        #to be removed
        #print rf.getNumTrees()
        #modelText = str(self._dataFrameModel.stages[-1])
        #._java_obj.toDebugString()
        #nbTrees = int(re.sub('.*?([0-9]*) trees$',r'\1',modelText))
        #print nbTrees
        # end TBR

        # Persist the estimator's parameter settings; the fitted best model itself
        # is kept in self._dataFrameModel
        rf.save("/home/t752887/python/myModelPath/SPARK_RF_R_" +
                str(self.sparkModelsId[0]))
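
A hypothetical usage sketch for the method above (the wrapper instance, the new-data DataFrame, and the reuse of the save path are assumptions, not part of the original project): once _set_dataFrameModel has run for the regression case, the fitted PipelineModel can score new data and the persisted estimator can be reloaded.

from pyspark.ml.regression import RandomForestRegressor

# Score new data with the cross-validated best model (regression case)
predictionsDF = wrapper._dataFrameModel.transform(newDataDF)
rmse = wrapper._regEval.evaluate(predictionsDF)

# Reload the persisted estimator (parameter settings only, not the fitted trees)
rfEstimator = RandomForestRegressor.load(
    "/home/t752887/python/myModelPath/SPARK_RF_R_" +
    str(wrapper.sparkModelsId[0]))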