def readCross(num,type,numtrees):

    filename=resultFile+'_'+type+'_'+num+'_all.csv'
    loader=CSVLoader()
    loader.setSource(File(filename))
    data=loader.getDataSet()
    #print data.numAttributes()    
    
    data.setClassIndex(data.numAttributes()-1)

    rf=RF()
    rf.setNumTrees(numtrees)
    #pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) 
    buffer = StringBuffer()  # buffer for the predictions
    output=PlainText()
    output.setHeader(data)
    output.setBuffer(buffer)
    output.setOutputDistribution(True) 
    attRange = Range()  # attributes to output
    outputDistributions = Boolean(True)
    evaluator=Evaluation(data) 
    
    evaluator.crossValidateModel(rf,data,10, Random(1),[output,attRange,outputDistributions])
    

    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.weightedPrecision(),evaluator.weightedRecall(),evaluator.weightedFMeasure(),evaluator.weightedMatthewsCorrelation(),evaluator.weightedFalseNegativeRate(),evaluator.weightedFalsePositiveRate(),evaluator.weightedTruePositiveRate(),evaluator.weightedTrueNegativeRate(),evaluator.weightedAreaUnderROC()]
def myGridSearch(data,NTreeBounds,NFeaturesBounds):
    best_acc = -float('inf')
    bestrandomforest = None
    class bestValues(object):
        t = float('nan')
        f = float('nan')
    for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]):
        for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]):
            randomforest = RandomForest()
            randomforest.setNumTrees(int(t))
            randomforest.setNumFeatures(int(f))
            evaluation = Evaluation(data)
            output = output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc):
                bestrandomforest = randomforest
                best_acc = acc
                bestValues.t = t
                bestValues.f = f
    print "Best accuracy:", best_acc
    print "Best values:  NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f
    print "-----------------------------------------"
    return bestrandomforest, bestValues.t, bestValues.f, best_acc
def readFeature(num_features,type,select_feature,numtrees):
    #filename1=resultFileTest
    #filename2=resultFileTest2
    filename1=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_train.csv'
    filename2=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_test.csv'
    #print filename1
    loader=CSVLoader()
    loader.setSource(File(filename1))
    data=loader.getDataSet()
    #print data.numAttributes()    
    
    data.setClassIndex(data.numAttributes()-1)

    rf=RF()
    rf.setNumTrees(numtrees)
    
    rf.buildClassifier(data)
   
    #print rf
    loader.setSource(File(filename2))
    

    test_data=Instances(loader.getDataSet())
    
    test_data.setClassIndex(test_data.numAttributes()-1)

    
    ''' num=test_data.numInstances()

    
    print num
   
    for i in xrange(num):

        r1=rf.distributionForInstance(test_data.instance(i))
  
        r2=rf.classifyInstance(test_data.instance(i))

        ptrixrint r1 
          
           print r2'''
    buffer = StringBuffer()  # buffer for the predictions
    output=PlainText()
    output.setHeader(test_data)
    output.setBuffer(buffer)
    
    attRange = Range()  # attributes to output
    outputDistribution = Boolean(True)
    evaluator=Evaluation(data)
    evaluator.evaluateModel(rf,test_data,[output,attRange,outputDistribution])
    #print evaluator.evaluateModel(RF(),['-t',filename1,'-T',filename2,'-I',str(numtrees)])
    #evaluator1=Evaluation(test_data)
    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.precision(1),evaluator.recall(1),evaluator.fMeasure(1),evaluator.matthewsCorrelationCoefficient(1),evaluator.numTruePositives(1),evaluator.numFalsePositives(1),evaluator.numTrueNegatives(1),evaluator.numFalseNegatives(1),evaluator.areaUnderROC(1)]
def RandomForest_ParamFinder(data): 
    # possible set for Number of trees
    NTreeBounds = [1,20,1]
    # possible set for number of features
    NFeaturesBounds = [0,20,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        randomforest = RandomForest()
        gridsearch.setClassifier(randomforest)
        gridsearch.setXProperty(String('classifier.numTrees'))
        gridsearch.setYProperty(String('classifier.numFeatures'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('I'))
        gridsearch.setXMin(NTreeBounds[0])
        gridsearch.setXMax(NTreeBounds[1])
        gridsearch.setXStep(NTreeBounds[2])
        gridsearch.setYMin(NFeaturesBounds[0])
        gridsearch.setYMax(NFeaturesBounds[1])
        gridsearch.setYStep(NFeaturesBounds[2])
        gridsearch.setYBase(10)
        print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # -----------------------  Evaluation
        bestrandomforest = RandomForest()
        bestrandomforest.setNumTrees(int(bestValues.x))
        bestrandomforest.setNumFeatures(int(bestValues.y))
        evaluation = Evaluation(data)
        output = output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        print "best accuracy: ", acc
        print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y
        OptRndFrst = bestrandomforest
        OptRndFrstp1 = bestValues.x
        OptRndFrstp2 = bestValues.y
        OptRndFrstAcc = acc
    else:
        OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) 
    Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \
            ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc)
    print "-----------------------------------------"
    return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
# Beispiel #5
# 0
def main():
    """Train a scikit-learn random forest on Data/train.csv and write
    predictions for Data/test.csv to Data/submission2.csv."""
    # create the training & test sets, skipping the header row with [1:]
    dataset = genfromtxt(open('Data/train.csv','r'), delimiter=',', dtype='f8')[1:]
    target = [x[0] for x in dataset]      # first column is the label
    train = [x[1:] for x in dataset]      # remaining columns are features
    test = genfromtxt(open('Data/test.csv','r'), delimiter=',', dtype='f8')[1:]

    # create and train the random forest
    # multi-core CPUs can use: rf = RandomForestClassifier(n_estimators=100, n_jobs=2)
    # fixed: "RandomForest.setNumTrees(100)" returns None and "rf.Evaluation(...)"
    # does not exist -- this snippet uses the sklearn API per the comment above
    # (assumes RandomForestClassifier is imported from sklearn.ensemble -- TODO confirm)
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(train, target)

    savetxt('Data/submission2.csv', rf.predict(test), delimiter=',', fmt='%f')
def random_forest(trainData,testData,params,exparams):
    numTrees = int(float(params[0]))
    numFeatures = int(float(params[1]))
    randomforest = RandomForest()
    randomforest.setNumTrees(numTrees)
    randomforest.setNumFeatures(numFeatures)
    randomforest.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
# Beispiel #7
# 0
    }]
    test = [{"sex": "f", "subject": "Phil"}, {"sex": "m", "subject": "CS"}]
    numericAttributes = []
    classAttr = "subject"

    tree = DecisionTree(numericAttributes)
    for i in inst:
        tree.addInstance(i)
    tree.learn(classAttr, unpruned=True, minNumObj=0)

    svm = SVM(numericAttributes)
    for i in inst:
        svm.addInstance(i)
    svm.learn(classAttr)

    ada = AdaBoost(numericAttributes)
    for i in inst:
        ada.addInstance(i)
    ada.learn(classAttr)

    forest = RandomForest(numericAttributes)
    for i in inst:
        forest.addInstance(i)
    forest.learn(classAttr)

    for j, model in enumerate((tree, svm, ada, forest)):
        print "\nmodel", j
        for i in test:
            #del i[classAttr]
            print model.classify(i)
# Script entry: train several Weka classifiers on an ARFF file given on the
# command line.
if (not (len(sys.argv) == 2)):
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
# NOTE(review): `file` shadows the Python builtin; later code may rely on
# this global name, so it is left unchanged
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# define the algorithms to be used.
algo_list = [(NaiveBayes(), 'NaiveBayes'), (BayesNet(), 'BayesNet'),
             (J48(), 'J48'), (JRip(), 'JRip'), (KStar(), 'KStar'),
             (RandomForest(), 'RandomForest'), (AdaBoostM1(), 'AdaBoostM1'),
             (MultilayerPerceptron(), 'MultilayerPerceptron'),
             (LibSVM(), 'LibSVM')]
# name -> classifier instance lookup
algo_dict = dict([(x[1], x[0]) for x in algo_list])
# training order (differs from the declaration order above)
algo_keys = [
    'NaiveBayes', 'J48', 'BayesNet', 'JRip', 'RandomForest', 'KStar',
    'AdaBoostM1', 'LibSVM', 'MultilayerPerceptron'
]

# example to set kernal type on libsvm.  Default is 2
#algo = algo_dict['LibSVM']
#tag = SelectedTag("1",algo.TAGS_KERNELTYPE)  # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid
#algo.setKernelType(tag)

# train classifiers but filter out the name column first
print "Training classifiers..."
# Beispiel #9
# 0
		{"sex":"m", "subject":"CS"}
	]
	test = [
		{"sex":"f", "subject":"Phil"},
		{"sex":"m", "subject":"CS"}
	]
	numericAttributes=[]
	classAttr = "subject"
	
	tree = DecisionTree(numericAttributes)
	for i in inst: tree.addInstance(i)
	tree.learn(classAttr, unpruned=True, minNumObj=0)
	
	svm = SVM(numericAttributes)
	for i in inst: svm.addInstance(i)
	svm.learn(classAttr)
	
	ada = AdaBoost(numericAttributes)
	for i in inst: ada.addInstance(i)
	ada.learn(classAttr)
	
	forest = RandomForest(numericAttributes)
	for i in inst: forest.addInstance(i)
	forest.learn(classAttr)
	
	for j,model in enumerate((tree, svm, ada, forest)):
		print "\nmodel", j
		for i in test:
			#del i[classAttr]
			print model.classify(i)
	
# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
# NOTE(review): `file` shadows the Python builtin; kept as-is since it is a
# module-level name other code may reference
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# define the algorithms to be used.
algo_list = [(NaiveBayes(), 'NaiveBayes'), (BayesNet(),'BayesNet'), (J48(),'J48'), (JRip(), 'JRip'),
                 (KStar(), 'KStar'), (RandomForest(), 'RandomForest'), (AdaBoostM1(),'AdaBoostM1'),
                 (MultilayerPerceptron(),'MultilayerPerceptron'), (LibSVM(), 'LibSVM')]
# name -> classifier instance lookup
algo_dict = dict([(x[1], x[0]) for x in algo_list])
# training order (differs from the declaration order above)
algo_keys = ['NaiveBayes', 'J48', 'BayesNet', 'JRip', 'RandomForest', 'KStar', 'AdaBoostM1', 'LibSVM', 'MultilayerPerceptron']

# example to set kernal type on libsvm.  Default is 2
algo = algo_dict['LibSVM']
tag = SelectedTag("1",algo.TAGS_KERNELTYPE)  # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid
algo.setKernelType(tag)

# train classifiers
print "Training classifiers..."
for key in algo_keys :
   algo = algo_dict[key]
   algo.buildClassifier(data)
# Beispiel #11
# 0
def run(basename,train_filename,test_filename,
        num_trees=100,tree_depth=0,class_index=0):

    with timer.Timer("loading data"):
        training = read_dataset(train_filename,class_index=class_index)
        testing = read_dataset(test_filename,class_index=class_index)

    """
    print "====== naive Bayes ====="
    with timer.Timer("training"):
        nb = NaiveBayes()
        nb.buildClassifier(training)
    with timer.Timer("testing"):
        eval_training = evaluate_dataset(nb,training)
        eval_testing = evaluate_dataset(nb,testing)
    print "=== evaluation (training):"
    print eval_training.toSummaryString()
    print "=== evaluation (testing):"
    print eval_testing.toSummaryString()
    """

    print "====== random forest ====="
    with timer.Timer("training"):
        rf = RandomForest()
        #rf.setOptions([
        #  u'-P', u'100', u'-I', u'100', u'-num-slots', u'1', u'-K', u'0', u'-M', u'1.0', u'-V', u'0.001', u'-S', u'1',
        #  u'-num-decimal-places', u'6'
        #])
        rf.setNumIterations(num_trees)
        if tree_depth:
            rf.setMaxDepth(tree_depth)
        rf.buildClassifier(training)
    with timer.Timer("testing"):
        eval_training = evaluate_dataset(rf,training)
        eval_testing = evaluate_dataset(rf,testing)
    print "=== evaluation (training):"
    print eval_training.toSummaryString()
    print "=== evaluation (testing):"
    print eval_testing.toSummaryString()

    #print rf.getmembers()

    num_classifiers = len(rf.m_Classifiers)
    for i,tree in enumerate(rf.m_Classifiers):
        options_arr = tree.getOptions()
        options_arr_python = [x for x in options_arr]
        options_arr_python += [u'-num-decimal-places',u'6']
        tree.setOptions(options_arr_python)
        #print tree.toString()
        #binarize(tree)
        filename = basename % i
        with open(filename,"w") as f:
            f.writelines(tree.graph())

    correct,incorrect = 0,0
    for instance in testing:
        pos,neg = 0,0
        for tree in rf.m_Classifiers:
            #print tree.classifyInstance(instance)
            if tree.classifyInstance(instance) >= 0.5:
                pos += 1
            else:
                neg += 1
            my_label = 1.0 if pos >= neg else 0.0
        if my_label == instance.classValue():
            correct += 1
        else:
            incorrect += 1
    print "    trees : %d" % num_trees
    print "--- evaluating majority vote on random forest:"
    print "  correct : %d" % correct
    print "incorrect : %d" % incorrect