def readCross(num,type,numtrees):

    filename=resultFile+'_'+type+'_'+num+'_all.csv'
    loader=CSVLoader()
    loader.setSource(File(filename))
    data=loader.getDataSet()
    #print data.numAttributes()    
    
    data.setClassIndex(data.numAttributes()-1)

    rf=RF()
    rf.setNumTrees(numtrees)
    #pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) 
    buffer = StringBuffer()  # buffer for the predictions
    output=PlainText()
    output.setHeader(data)
    output.setBuffer(buffer)
    output.setOutputDistribution(True) 
    attRange = Range()  # attributes to output
    outputDistributions = Boolean(True)
    evaluator=Evaluation(data) 
    
    evaluator.crossValidateModel(rf,data,10, Random(1),[output,attRange,outputDistributions])
    

    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.weightedPrecision(),evaluator.weightedRecall(),evaluator.weightedFMeasure(),evaluator.weightedMatthewsCorrelation(),evaluator.weightedFalseNegativeRate(),evaluator.weightedFalsePositiveRate(),evaluator.weightedTruePositiveRate(),evaluator.weightedTrueNegativeRate(),evaluator.weightedAreaUnderROC()]
def myGridSearch(data,NTreeBounds,NFeaturesBounds):
    best_acc = -float('inf')
    bestrandomforest = None
    class bestValues(object):
        t = float('nan')
        f = float('nan')
    for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]):
        for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]):
            randomforest = RandomForest()
            randomforest.setNumTrees(int(t))
            randomforest.setNumFeatures(int(f))
            evaluation = Evaluation(data)
            output = output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc):
                bestrandomforest = randomforest
                best_acc = acc
                bestValues.t = t
                bestValues.f = f
    print "Best accuracy:", best_acc
    print "Best values:  NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f
    print "-----------------------------------------"
    return bestrandomforest, bestValues.t, bestValues.f, best_acc
def readFeature(num_features,type,select_feature,numtrees):
    #filename1=resultFileTest
    #filename2=resultFileTest2
    filename1=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_train.csv'
    filename2=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_test.csv'
    #print filename1
    loader=CSVLoader()
    loader.setSource(File(filename1))
    data=loader.getDataSet()
    #print data.numAttributes()    
    
    data.setClassIndex(data.numAttributes()-1)

    rf=RF()
    rf.setNumTrees(numtrees)
    
    rf.buildClassifier(data)
   
    #print rf
    loader.setSource(File(filename2))
    

    test_data=Instances(loader.getDataSet())
    
    test_data.setClassIndex(test_data.numAttributes()-1)

    
    ''' num=test_data.numInstances()

    
    print num
   
    for i in xrange(num):

        r1=rf.distributionForInstance(test_data.instance(i))
  
        r2=rf.classifyInstance(test_data.instance(i))

        ptrixrint r1 
          
           print r2'''
    buffer = StringBuffer()  # buffer for the predictions
    output=PlainText()
    output.setHeader(test_data)
    output.setBuffer(buffer)
    
    attRange = Range()  # attributes to output
    outputDistribution = Boolean(True)
    evaluator=Evaluation(data)
    evaluator.evaluateModel(rf,test_data,[output,attRange,outputDistribution])
    #print evaluator.evaluateModel(RF(),['-t',filename1,'-T',filename2,'-I',str(numtrees)])
    #evaluator1=Evaluation(test_data)
    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.precision(1),evaluator.recall(1),evaluator.fMeasure(1),evaluator.matthewsCorrelationCoefficient(1),evaluator.numTruePositives(1),evaluator.numFalsePositives(1),evaluator.numTrueNegatives(1),evaluator.numFalseNegatives(1),evaluator.areaUnderROC(1)]
def RandomForest_ParamFinder(data): 
    # possible set for Number of trees
    NTreeBounds = [1,20,1]
    # possible set for number of features
    NFeaturesBounds = [0,20,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        randomforest = RandomForest()
        gridsearch.setClassifier(randomforest)
        gridsearch.setXProperty(String('classifier.numTrees'))
        gridsearch.setYProperty(String('classifier.numFeatures'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('I'))
        gridsearch.setXMin(NTreeBounds[0])
        gridsearch.setXMax(NTreeBounds[1])
        gridsearch.setXStep(NTreeBounds[2])
        gridsearch.setYMin(NFeaturesBounds[0])
        gridsearch.setYMax(NFeaturesBounds[1])
        gridsearch.setYStep(NFeaturesBounds[2])
        gridsearch.setYBase(10)
        print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # -----------------------  Evaluation
        bestrandomforest = RandomForest()
        bestrandomforest.setNumTrees(int(bestValues.x))
        bestrandomforest.setNumFeatures(int(bestValues.y))
        evaluation = Evaluation(data)
        output = output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        print "best accuracy: ", acc
        print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y
        OptRndFrst = bestrandomforest
        OptRndFrstp1 = bestValues.x
        OptRndFrstp2 = bestValues.y
        OptRndFrstAcc = acc
    else:
        OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) 
    Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \
            ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc)
    print "-----------------------------------------"
    return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
Beispiel #5
0
def main():
    #create the training & test sets, skipping the header row with [1:]
    dataset = genfromtxt(open('Data/train.csv','r'), delimiter=',', dtype='f8')[1:]    
    target = [x[0] for x in dataset]
    train = [x[1:] for x in dataset]
    test = genfromtxt(open('Data/test.csv','r'), delimiter=',', dtype='f8')[1:]
    
    #create and train the random forest
    #multi-core CPUs can use: rf = RandomForestClassifier(n_estimators=100, n_jobs=2)
    rf = RandomForest.setNumTrees(100)
    rf.Evaluation(train, target)

    savetxt('Data/submission2.csv', rf.predict(test), delimiter=',', fmt='%f')
def random_forest(trainData,testData,params,exparams):
    numTrees = int(float(params[0]))
    numFeatures = int(float(params[1]))
    randomforest = RandomForest()
    randomforest.setNumTrees(numTrees)
    randomforest.setNumFeatures(numFeatures)
    randomforest.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary