def myGridSearch(data,NTreeBounds,NFeaturesBounds): best_acc = -float('inf') bestrandomforest = None class bestValues(object): t = float('nan') f = float('nan') for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]): for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]): randomforest = RandomForest() randomforest.setNumTrees(int(t)) randomforest.setNumFeatures(int(f)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc): bestrandomforest = randomforest best_acc = acc bestValues.t = t bestValues.f = f print "Best accuracy:", best_acc print "Best values: NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f print "-----------------------------------------" return bestrandomforest, bestValues.t, bestValues.f, best_acc
def RandomForest_ParamFinder(data): # possible set for Number of trees NTreeBounds = [1,20,1] # possible set for number of features NFeaturesBounds = [0,20,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) randomforest = RandomForest() gridsearch.setClassifier(randomforest) gridsearch.setXProperty(String('classifier.numTrees')) gridsearch.setYProperty(String('classifier.numFeatures')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(NTreeBounds[0]) gridsearch.setXMax(NTreeBounds[1]) gridsearch.setXStep(NTreeBounds[2]) gridsearch.setYMin(NFeaturesBounds[0]) gridsearch.setYMax(NFeaturesBounds[1]) gridsearch.setYStep(NFeaturesBounds[2]) gridsearch.setYBase(10) print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...." gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------- Evaluation bestrandomforest = RandomForest() bestrandomforest.setNumTrees(int(bestValues.x)) bestrandomforest.setNumFeatures(int(bestValues.y)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() print "best accuracy: ", acc print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y OptRndFrst = bestrandomforest OptRndFrstp1 = bestValues.x OptRndFrstp2 = bestValues.y OptRndFrstAcc = acc else: OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \ ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc) print "-----------------------------------------" return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
def random_forest(trainData,testData,params,exparams): numTrees = int(float(params[0])) numFeatures = int(float(params[1])) randomforest = RandomForest() randomforest.setNumTrees(numTrees) randomforest.setNumFeatures(numFeatures) randomforest.buildClassifier(trainData) # only a trained classifier can be evaluated # evaluate it on the training evaluation = Evaluation(trainData) (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(randomforest, trainData, [trainOutput, attRange, outputDistribution]) print "--> Evaluation:\n" print evaluation.toSummaryString() trainSummary = makeTrainEvalSummary(evaluation) # evaluate it on testing evaluation = Evaluation(testData) (testOutput, testBuffer) = util.get_buffer_for_predictions(testData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(randomforest, testData, [testOutput, attRange, outputDistribution]) return trainBuffer, testBuffer, trainSummary