# ---------------------------------------------------------------------------
# Decision tree: attach the parameter grid, cross-validate, and evaluate.
# ---------------------------------------------------------------------------
crossVal.setEstimatorParamMaps(paramGrid)
dtModel = crossVal.fit(trainSetDF).bestModel

# Score the held-out set, keeping the feature columns alongside the label
# and the prediction so the rows remain easy to inspect.
predictionsAndLabelsDF = dtModel.transform(testSetDF).select(
    "Atmospheric_Temperature",
    "Vacuum_Speed",
    "Atmospheric_Pressure",
    "Relative_Humidity",
    "Power_Output",
    "Predicted_PE",
)

rmseDT = regEval.evaluate(predictionsAndLabelsDF)
r2DT = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})

# ---------------------------------------------------------------------------
# Random forest: configure the estimator, cross-validate, and evaluate.
# ---------------------------------------------------------------------------
rf = RandomForestRegressor()
(
    rf.setLabelCol("Power_Output")
    .setPredictionCol("Predicted_PE")
    .setFeaturesCol("features")
    .setSeed(100088121)
    .setMaxDepth(8)
    .setNumTrees(30)
)

rfPipeline = Pipeline()
rfPipeline.setStages([vectorizer, rf])
crossVal.setEstimator(rfPipeline)

# Tune the rf.maxBins parameter over the values 50 and 100.
paramGrid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
crossVal.setEstimatorParamMaps(paramGrid)

rfModel = crossVal.fit(trainSetDF).bestModel

predictionsAndLabelsDF = rfModel.transform(testSetDF).select(
    "Atmospheric_Temperature",
    "Vacuum_Speed",
    "Atmospheric_Pressure",
    "Relative_Humidity",
    "Power_Output",
    "Predicted_PE",
)

rmseRF = regEval.evaluate(predictionsAndLabelsDF)
r2RF = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})
def _set_dataFrameModel(self, _type, _SLA, data, vecAssembler):
    """Build, cross-validate, and store the best Spark ML pipeline model.

    Configures a random-forest estimator (regressor or classifier depending
    on ``_type``), wires it into a Pipeline after ``vecAssembler``, grid-
    searches ``numTrees`` via cross-validation, and stores the winning
    fitted pipeline on ``self._dataFrameModel``.

    Parameters
    ----------
    _type : str
        ``'regression'`` or anything else for classification.
    _SLA : str
        Algorithm selector; only ``'randomForest'`` is handled here.
    data : pyspark.sql.DataFrame
        Training data fed to the cross validator.
    vecAssembler : VectorAssembler
        Feature-assembly stage placed ahead of the estimator.
    """
    if _type == 'regression':
        if _SLA == 'randomForest':
            rf = RandomForestRegressor()
            # Fixed: the original chain called setProbabilityCol("proba"),
            # but regressors expose no probability column and that call
            # fails at runtime.  Also fixed: 100088121L is a Python 2 long
            # literal and a syntax error under Python 3.
            rf.setLabelCol(self.targetVariable)\
              .setPredictionCol("prediction")\
              .setFeaturesCol("features")\
              .setSeed(100088121)\
              .setMaxDepth(int(self.sparkOptions[1]))\
              .setMaxMemoryInMB(10000)\
              .setFeatureSubsetStrategy(self.sparkOptions[5])
            self._regEval = RegressionEvaluator(
                predictionCol="prediction",
                labelCol=self.targetVariable,
                metricName="rmse")
    else:  # classification
        if _SLA == 'randomForest':
            rf = RandomForestClassifier(
                labelCol=self.targetVariable,
                featuresCol="features",
                maxDepth=int(self.sparkOptions[1]),
                featureSubsetStrategy=self.sparkOptions[5],
                impurity=self.sparkOptions[2],
                probabilityCol="proba")
            # NOTE(review): `goodClass` is not defined in this method --
            # presumably a global or should be self.goodClass; confirm.
            # Fixed: the evaluator was stored on self.regEval while the
            # CrossValidator below reads self._regEval, so the
            # classification evaluator was never actually used.
            if goodClass != '':
                self._regEval = BinaryClassificationEvaluator(
                    labelCol=self.targetVariable,
                    metricName="areaUnderROC")
            else:
                self._regEval = MulticlassClassificationEvaluator(
                    labelCol=self.targetVariable,
                    predictionCol="prediction",
                    metricName="accuracy")
            # Keep the old public attribute in sync for any external reader.
            self.regEval = self._regEval

    # Assemble the Pipeline: feature assembly, then the estimator.
    self._pipeline = Pipeline()
    self._pipeline.setStages([vecAssembler, rf])

    # Grid-search numTrees over the comma-separated values in sparkOptions[4].
    self._paramGrid = (ParamGridBuilder().addGrid(
        rf.numTrees,
        [int(num) for num in self.sparkOptions[4].split(',')]).build())

    # NOTE(review): numFolds is fed from self.nbSamples -- the name suggests
    # a sample count rather than a fold count; verify the intent.
    self._crossval = CrossValidator(estimator=self._pipeline,
                                    estimatorParamMaps=self._paramGrid,
                                    evaluator=self._regEval,
                                    numFolds=self.nbSamples)

    # Fit and keep the best pipeline model.
    self._dataFrameModel = self._crossval.fit(data).bestModel

    # NOTE(review): this persists the *unfitted* estimator's parameters,
    # not the trained model.  If the trained forest is wanted, save
    # self._dataFrameModel.stages[-1] instead -- confirm before changing,
    # as downstream loaders may expect estimator params at this path.
    rf.save("/home/t752887/python/myModelPath/SPARK_RF_R_" +
            str(self.sparkModelsId[0]))