Example #1
0
def bagging(data, attrs, numTrees, defaultValue, test_data):
    ensemble = []
    attrNames = readARFF.getAttrList(attrs)
    for i in range(numTrees):
        newData = resample(data)
        tree = dt.makeTree(newData, attrs, defaultValue)
        ensemble.append(tree)
    print 'Test score:'
    compute_score(test_data, ensemble, attrs, defaultValue)
    print 'Train score:'
    compute_score(data, ensemble, attrs, defaultValue)
Example #2
0
 def classify(self, data, attributes):
     """Classify one example by walking the tree from this node.

     data: attribute values for a single example (class label removed),
         ordered the same way as readARFF.getAttrList(attributes).
     attributes: ARFF attribute structure used to recover that ordering.
     Returns the class value stored at the leaf (or pseudo-leaf) reached.
     """
     if self.isLeaf():
         return self.value
     else:
         listAttributes = readARFF.getAttrList(attributes)
         # The position of this node's test attribute indexes the
         # corresponding value in the example.
         value = data[listAttributes.index(self.attribute)]
         # Earlier guard attempts, kept for reference:
         #if value not in self.children:
         #if not self.children[value]:
         #if len(self.children[value]) == 0:
         # NOTE(review): raises KeyError when `value` has no child entry —
         # assumes every possible attribute value got a child at build time.
         # NOTE(review): a child with attribute=None is treated as a dead
         # end and this INTERNAL node's own value is returned, which looks
         # like it would be None here — TODO confirm intended fallback.
         if self.children[value].attribute is None:
             return self.value
         child = self.children[value]
         return child.classify(data, attributes)
 def classify(self, data, attributes) :
     """Walk the tree and return the predicted class for one example."""
     names = readARFF.getAttrList(attributes)
     # A node carrying a class value is a leaf: stop here.
     if self.value:
         return self.value
     observed = data[names.index(self.attribute)]
     if observed in self.children:
         return self.children[observed].classify(data, attributes)
     # No branch for this attribute value: fall back to the default class.
     return self.defaultValue
Example #4
0
 def classify(self, data, attributes):
     """Return the class this subtree predicts for *data*."""
     attr_names = readARFF.getAttrList(attributes)
     if self.value:
         # A populated value marks a leaf; that value is the prediction.
         return self.value
     slot = attr_names.index(self.attribute)
     try:
         branch = self.children[data[slot]]
     except KeyError:
         # Attribute value never seen at build time: use the default.
         return self.defaultValue
     return branch.classify(data, attributes)
Example #5
0
def evaluation(nfold, attrs, data):
    train_precision = []
    train_recall = []
    train_accuracy = []

    test_precision = []
    test_recall = []
    test_accuracy = []

    for k in range(nfold):
        random.seed()
        random.shuffle(data)
        traindata = data[:len(data) / 5 * 4]
        # print len(traindata)
        testdata = data[len(data) / 5 * 4:]
        # print len(testdata)
        # print data[:len(data)/10]
        attrslist = readARFF.getAttrList(attrs)
        root = makeTree(traindata, attrslist, attrs,
                        readARFF.computeZeroR(data))
        # print '####fold####',k
        # root.printTree()

        precision, recall, correct_num = calc_precision_recall_accuracy(
            root, attrs, testdata)
        test_precision.append(precision)
        test_recall.append(recall)
        test_accuracy.append(float(correct_num) / len(testdata))

        precision_train, recall_train, correct_num_train = calc_precision_recall_accuracy(
            root, attrs, traindata)
        train_precision.append(precision_train)
        train_recall.append(recall_train)
        train_accuracy.append(float(correct_num_train) / len(traindata))

    test_precision_average, test_recall_average, test_accuracy_average = calc_average(
        nfold, test_precision, test_recall, test_accuracy)
    train_precision_average, train_recall_average, train_accuracy_average = calc_average(
        nfold, train_precision, train_recall, train_accuracy)
    print '#####   The performance of decision tree   #####'
    print '     test_precision:'
    print_pr_re(test_precision_average)
    print '     test_recall:'
    print_pr_re(test_recall_average)
    print '     test_accuracy: %f%%' % (test_accuracy_average * 100)
    print
    print '     training_precision:'
    print_pr_re(train_precision_average)
    print '     training_recall:'
    print_pr_re(train_recall_average)
    print '     training_accuracy: %f%%' % (train_accuracy_average * 100)
def evaluation(nfold , attrs, data):
    train_precision = []
    train_recall = []
    train_accuracy = []

    test_precision = []
    test_recall = []
    test_accuracy = []

    for k in range(nfold):
        random.seed()
        random.shuffle(data)
        traindata = data[:len(data)/5*4]
        # print len(traindata)
        testdata = data[len(data)/5*4:]
        # print len(testdata)
        # print data[:len(data)/10]
        attrslist = readARFF.getAttrList(attrs)
        root = makeTree(traindata, attrslist, attrs, readARFF.computeZeroR(data))
        # print '####fold####',k
        # root.printTree()

        precision, recall, correct_num = calc_precision_recall_accuracy(root, attrs, testdata)
        test_precision.append(precision)
        test_recall.append(recall)
        test_accuracy.append(float(correct_num) / len(testdata))

        precision_train, recall_train, correct_num_train = calc_precision_recall_accuracy(root, attrs, traindata)
        train_precision.append(precision_train)
        train_recall.append(recall_train)
        train_accuracy.append(float(correct_num_train) / len(traindata))

    test_precision_average, test_recall_average, test_accuracy_average = calc_average(nfold, test_precision, test_recall, test_accuracy)
    train_precision_average, train_recall_average, train_accuracy_average = calc_average(nfold, train_precision, train_recall, train_accuracy)
    print '#####   The performance of decision tree   #####'
    print '     test_precision:' 
    print_pr_re(test_precision_average)
    print '     test_recall:' 
    print_pr_re(test_recall_average)
    print '     test_accuracy: %f%%' % (test_accuracy_average*100)
    print 
    print '     training_precision:'
    print_pr_re(train_precision_average)
    print '     training_recall:'
    print_pr_re(train_recall_average)
    print '     training_accuracy: %f%%' % (train_accuracy_average*100)
Example #7
0
def calculateForMultiClass(trainingData, testingData, attributes, defaultValue) :
    alist = readARFF.getAttrList(attributes)
    root = makeTree(trainingData, alist, attributes, defaultValue)
    classes = set([data[-1] for data in trainingData])

    Precision = {}
    Recall = {}
    correctPredicted = {}
    timesPredicted = {}
    exampleLabeled = {}

    for item in classes :
        correctPredicted[item] = 0
        timesPredicted[item] = 0
        exampleLabeled[item] = 0

    for element in testingData :
        classValue = element[-1]
        result = root.classify(element[0:-1], attributes)
        #print result
        timesPredicted[result] += 1
        exampleLabeled[classValue] += 1

        if result == defaultValue:
            continue
        if result == classValue :
            correctPredicted[result] += 1

    allCorrect = 0
    for item in classes :
        #print item
        #print correctPredicted[item]
        if correctPredicted[item] == 0:
            Precision[item] = 0
            Recall[item] = 0
        else:
            Precision[item] = float(correctPredicted[item]) / float(timesPredicted[item])
            Recall[item] = float(correctPredicted[item]) / float(exampleLabeled[item])
        allCorrect += correctPredicted[item]

    accuracy = float(allCorrect) / len(testingData)

    print "For multi-class -----------"
    print "Precision: ", Precision
    print "Recall: ", Recall
    print "Accuracy: ", accuracy
Example #8
0
def makeTree(dataSet, aList, attributes, defaultValue):
    """Recursively build an ID3-style decision tree.

    dataSet: list of examples; the last element of each is the class label.
    aList: attribute names still available for splitting on this path.
    attributes: ARFF attribute structure (indexable; attributes[i][name]
        yields the possible values of attribute `name`).
    defaultValue: class label used when no attributes remain.

    Returns a TreeNode; leaves have attribute=None and carry a class value.

    BUG FIX: the original called aList.remove(attribute), mutating the
    CALLER's list — the chosen attribute then disappeared for sibling
    subtrees and for everything after the call. The choice is now removed
    from a copy, leaving the caller's list intact.
    """
    if entropy([d[-1] for d in dataSet]) == 0:
        # Pure node: every example shares one label — make a leaf.
        return TreeNode(None, dataSet[0][-1])
    elif len(aList) == 0:
        return TreeNode(None, defaultValue)
    else:
        listAttributes = readARFF.getAttrList(attributes)
        # Mask out attributes already consumed on this path so
        # selectAttribute only considers the available ones.
        for index, item in enumerate(listAttributes):
            if item not in aList:
                listAttributes[index] = None
        attribute = selectAttribute(dataSet, listAttributes)
        index = listAttributes.index(attribute)
        possibleValue = attributes[index][attribute]
        # Remove the chosen attribute from a COPY, not from aList itself.
        remaining = [a for a in aList if a != attribute]
        node = TreeNode(attribute, None)
        for value in possibleValue:
            subSet = [d for d in dataSet if d[index] == value]
            if len(subSet) == 0:
                # No examples with this value: leaf with the majority class.
                node.children[value] = TreeNode(None, readARFF.computeZeroR(attributes, dataSet))
            else:
                node.children[value] = makeTree(subSet, remaining, attributes, readARFF.computeZeroR(attributes, subSet))
        return node
Example #9
0
def evaluation(trainingData, testingData, attributes, defaultValue) :
    """Train a tree and return (precision, recall, accuracy) on testingData.

    Positive/negative is decided by isPositive(). Examples whose prediction
    equals defaultValue are skipped entirely — not counted in TP/TN/FP/FN —
    matching the original behavior.

    BUG FIX: the original returned accuracy = 0.0 whenever TP == 0, even
    with TN > 0, and its `elif (TP + TN) == 0` branch was unreachable (it
    would also have returned precision/recall unbound). Each metric now
    gets its own zero-denominator guard.
    """
    FP = 0
    FN = 0
    TP = 0
    TN = 0
    root = makeTree(trainingData, readARFF.getAttrList(attributes), attributes, defaultValue)

    for item in testingData :
        classValue = item[-1]
        result = root.classify(item[0:-1], attributes)

        if result == defaultValue:
            continue
        elif result == classValue:
            if isPositive(result):
                TP += 1
            else:
                TN += 1
        else:
            if isPositive(result):
                FP += 1
            else:
                FN += 1

    if TP == 0 :
        precision = 0
        recall = 0
    else :
        precision = float(TP) / float(TP + FP)
        recall = float(TP) / float(TP + FN)
    total = TP + FP + FN + TN
    if total == 0:
        accuracy = 0.0
    else:
        accuracy = float(TP + TN) / float(total)

    return precision, recall, accuracy
Example #10
0
        if correctPredicted[item] == 0:
            Precision[item] = 0
            Recall[item] = 0
        else:
            Precision[item] = float(correctPredicted[item]) / float(timesPredicted[item])
            Recall[item] = float(correctPredicted[item]) / float(exampleLabeled[item])
        allCorrect += correctPredicted[item]

    accuracy = float(allCorrect) / len(testingData)

    print "For multi-class -----------"
    print "Precision: ", Precision
    print "Recall: ", Recall
    print "Accuracy: ", accuracy

### This part need be modified for different dataset
def isPositive(result) :
    """Return True when *result* contains the dataset's positive label."""
    positive_label = "no-recurrence-events"
    return positive_label in result

if __name__ == '__main__' :
    filename = 'nursery.arff'

    # Parse the ARFF file ONCE. The original called readArff twice on two
    # separate open() handles and never closed either; `with` guarantees
    # the single handle is closed.
    with open(filename) as arff_file:
        parsed = readARFF.readArff(arff_file)
    attributes = parsed[0]
    data = parsed[1]
    alist = readARFF.getAttrList(attributes)
    trainingData, testingData = createTrainAndTestData(data)

    #runEvaluation5times(data, attributes, None)
    #runZeroREvaluation5times(data, attributes, None)

    calculateForMultiClass(trainingData, testingData, attributes, None)