def bagging(data, attrs, numTrees, defaultValue, test_data): ensemble = [] attrNames = readARFF.getAttrList(attrs) for i in range(numTrees): newData = resample(data) tree = dt.makeTree(newData, attrs, defaultValue) ensemble.append(tree) print 'Test score:' compute_score(test_data, ensemble, attrs, defaultValue) print 'Train score:' compute_score(data, ensemble, attrs, defaultValue)
def classify(self, data, attributes):
    """Return the class label predicted for `data` (one example without its
    label) by walking the tree from this node.

    Bug fix: the original returned self.value when the matched child was a
    leaf — but internal nodes are constructed with value=None (see makeTree:
    TreeNode(attribute, None)), so that branch always returned None.
    Recursing into the child lets a leaf node return its own value.
    Dead commented-out alternatives removed.
    """
    if self.isLeaf():
        return self.value
    listAttributes = readARFF.getAttrList(attributes)
    # Locate this node's split attribute in the example and descend.
    value = data[listAttributes.index(self.attribute)]
    return self.children[value].classify(data, attributes)
def classify(self, data, attributes):
    """Predict the class label for `data` by descending the tree.

    Leaf nodes carry a non-None value; internal nodes route on their split
    attribute. An attribute value never seen in training falls back to
    self.defaultValue.

    Fixes: leaf test uses `is not None` (internal nodes are built with
    value=None, and a falsy-but-valid label would otherwise misroute);
    the duplicate attrslist.index() computation is hoisted; dead
    commented-out debug code removed.
    """
    if self.value is not None:
        return self.value
    attrslist = readARFF.getAttrList(attributes)
    i = attrslist.index(self.attribute)
    if data[i] in self.children:
        return self.children[data[i]].classify(data, attributes)
    return self.defaultValue
def classify(self, data, attributes):
    """Walk the decision tree from this node and return the predicted label
    for `data`, falling back to self.defaultValue when the example carries
    an attribute value this node has no child for.
    """
    if self.value:
        # Leaf node: its stored value is the prediction.
        return self.value
    names = readARFF.getAttrList(attributes)
    idx = names.index(self.attribute)
    observed = data[idx]
    if observed in self.children:
        # Descend into the subtree for the observed attribute value.
        return self.children[observed].classify(data, attributes)
    return self.defaultValue
def evaluation(nfold, attrs, data): train_precision = [] train_recall = [] train_accuracy = [] test_precision = [] test_recall = [] test_accuracy = [] for k in range(nfold): random.seed() random.shuffle(data) traindata = data[:len(data) / 5 * 4] # print len(traindata) testdata = data[len(data) / 5 * 4:] # print len(testdata) # print data[:len(data)/10] attrslist = readARFF.getAttrList(attrs) root = makeTree(traindata, attrslist, attrs, readARFF.computeZeroR(data)) # print '####fold####',k # root.printTree() precision, recall, correct_num = calc_precision_recall_accuracy( root, attrs, testdata) test_precision.append(precision) test_recall.append(recall) test_accuracy.append(float(correct_num) / len(testdata)) precision_train, recall_train, correct_num_train = calc_precision_recall_accuracy( root, attrs, traindata) train_precision.append(precision_train) train_recall.append(recall_train) train_accuracy.append(float(correct_num_train) / len(traindata)) test_precision_average, test_recall_average, test_accuracy_average = calc_average( nfold, test_precision, test_recall, test_accuracy) train_precision_average, train_recall_average, train_accuracy_average = calc_average( nfold, train_precision, train_recall, train_accuracy) print '##### The performance of decision tree #####' print ' test_precision:' print_pr_re(test_precision_average) print ' test_recall:' print_pr_re(test_recall_average) print ' test_accuracy: %f%%' % (test_accuracy_average * 100) print print ' training_precision:' print_pr_re(train_precision_average) print ' training_recall:' print_pr_re(train_recall_average) print ' training_accuracy: %f%%' % (train_accuracy_average * 100)
def evaluation(nfold , attrs, data): train_precision = [] train_recall = [] train_accuracy = [] test_precision = [] test_recall = [] test_accuracy = [] for k in range(nfold): random.seed() random.shuffle(data) traindata = data[:len(data)/5*4] # print len(traindata) testdata = data[len(data)/5*4:] # print len(testdata) # print data[:len(data)/10] attrslist = readARFF.getAttrList(attrs) root = makeTree(traindata, attrslist, attrs, readARFF.computeZeroR(data)) # print '####fold####',k # root.printTree() precision, recall, correct_num = calc_precision_recall_accuracy(root, attrs, testdata) test_precision.append(precision) test_recall.append(recall) test_accuracy.append(float(correct_num) / len(testdata)) precision_train, recall_train, correct_num_train = calc_precision_recall_accuracy(root, attrs, traindata) train_precision.append(precision_train) train_recall.append(recall_train) train_accuracy.append(float(correct_num_train) / len(traindata)) test_precision_average, test_recall_average, test_accuracy_average = calc_average(nfold, test_precision, test_recall, test_accuracy) train_precision_average, train_recall_average, train_accuracy_average = calc_average(nfold, train_precision, train_recall, train_accuracy) print '##### The performance of decision tree #####' print ' test_precision:' print_pr_re(test_precision_average) print ' test_recall:' print_pr_re(test_recall_average) print ' test_accuracy: %f%%' % (test_accuracy_average*100) print print ' training_precision:' print_pr_re(train_precision_average) print ' training_recall:' print_pr_re(train_recall_average) print ' training_accuracy: %f%%' % (train_accuracy_average*100)
def calculateForMultiClass(trainingData, testingData, attributes, defaultValue) : alist = readARFF.getAttrList(attributes) root = makeTree(trainingData, alist, attributes, defaultValue) classes = set([data[-1] for data in trainingData]) Precision = {} Recall = {} correctPredicted = {} timesPredicted = {} exampleLabeled = {} for item in classes : correctPredicted[item] = 0 timesPredicted[item] = 0 exampleLabeled[item] = 0 for element in testingData : classValue = element[-1] result = root.classify(element[0:-1], attributes) #print result timesPredicted[result] += 1 exampleLabeled[classValue] += 1 if result == defaultValue: continue if result == classValue : correctPredicted[result] += 1 allCorrect = 0 for item in classes : #print item #print correctPredicted[item] if correctPredicted[item] == 0: Precision[item] = 0 Recall[item] = 0 else: Precision[item] = float(correctPredicted[item]) / float(timesPredicted[item]) Recall[item] = float(correctPredicted[item]) / float(exampleLabeled[item]) allCorrect += correctPredicted[item] accuracy = float(allCorrect) / len(testingData) print "For multi-class -----------" print "Precision: ", Precision print "Recall: ", Recall print "Accuracy: ", accuracy
def makeTree(dataSet, aList, attributes, defaultValue):
    """Recursively build an ID3-style decision tree over dataSet.

    aList       -- attribute names still available for splitting.
    attributes  -- project attribute table (index -> {name: values}).
    defaultValue-- label to use when no attributes remain.
    Returns a TreeNode; leaves are TreeNode(None, label), internal nodes
    TreeNode(attribute, None).

    Bug fix: the original removed the chosen attribute from aList IN PLACE
    (aList.remove), so the mutation leaked back to the caller and across
    sibling subtrees higher up the recursion. Each recursive call now gets
    a filtered copy instead.
    """
    if entropy([d[-1] for d in dataSet]) == 0:
        # Pure node: every example already has the same label.
        return TreeNode(None, dataSet[0][-1])
    if len(aList) == 0:
        # No attributes left to split on: majority/default leaf.
        return TreeNode(None, defaultValue)
    listAttributes = readARFF.getAttrList(attributes)
    # Mask out already-used attributes so positional indexes still line up.
    for index, item in enumerate(listAttributes):
        if item not in aList:
            listAttributes[index] = None
    attribute = selectAttribute(dataSet, listAttributes)
    index = listAttributes.index(attribute)
    possibleValue = attributes[index][attribute]
    # Copy, do not mutate the caller's list.
    remaining = [a for a in aList if a != attribute]
    node = TreeNode(attribute, None)
    for value in possibleValue:
        subSet = [d for d in dataSet if d[index] == value]
        if len(subSet) == 0:
            # No training example had this value: majority-label leaf.
            node.children[value] = TreeNode(
                None, readARFF.computeZeroR(attributes, dataSet))
        else:
            node.children[value] = makeTree(
                subSet, remaining, attributes,
                readARFF.computeZeroR(attributes, subSet))
    return node
def evaluation(trainingData, testingData, attributes, defaultValue):
    """Binary-classification evaluation: train a tree on trainingData and
    return (precision, recall, accuracy) measured on testingData.

    Examples classified as defaultValue are excluded from all counts.

    Bug fixes: the original left accuracy at 0.0 whenever TP == 0 even if
    there were correct negatives; its `elif (TP + TN) == 0` branch was
    unreachable (TP > 0 implies TP + TN > 0) and would have left
    precision/recall unbound. All three metrics now use explicit
    zero-denominator guards.
    """
    TP = TN = FP = FN = 0
    root = makeTree(trainingData, readARFF.getAttrList(attributes),
                    attributes, defaultValue)
    for item in testingData:
        classValue = item[-1]
        result = root.classify(item[0:-1], attributes)
        if result == defaultValue:
            continue  # unclassified example: excluded from the counts
        if result == classValue:
            if isPositive(result):
                TP += 1
            else:
                TN += 1
        elif isPositive(result):
            FP += 1
        else:
            FN += 1
    total = TP + FP + FN + TN
    precision = float(TP) / (TP + FP) if (TP + FP) else 0
    recall = float(TP) / (TP + FN) if (TP + FN) else 0
    accuracy = float(TP + TN) / total if total else 0.0
    return precision, recall, accuracy
# NOTE(review): this leading block appears to be an orphaned duplicate of the
# tail of calculateForMultiClass (same statements, see that function); at
# module level the names it uses (correctPredicted, item, Precision, ...)
# are undefined and it would raise NameError — confirm and remove.
if correctPredicted[item] == 0:
    Precision[item] = 0
    Recall[item] = 0
else:
    Precision[item] = float(correctPredicted[item]) / float(timesPredicted[item])
    Recall[item] = float(correctPredicted[item]) / float(exampleLabeled[item])
    allCorrect += correctPredicted[item]
accuracy = float(allCorrect) / len(testingData)
print "For multi-class -----------"
print "Precision: ", Precision
print "Recall: ", Recall
print "Accuracy: ", accuracy


### This part need be modified for different dataset
def isPositive(result):
    """Return True when `result` names the positive class.

    Dataset-specific: hard-coded for the breast-cancer labels
    ("no-recurrence-events" is treated as positive)."""
    return "no-recurrence-events" in result


if __name__ == '__main__':
    filename = 'nursery.arff'
    # NOTE(review): the ARFF file is opened and parsed twice; readArff
    # appears to return (attributes, data), so one call would presumably do.
    attributes = readARFF.readArff(open(filename))[0]
    data = readARFF.readArff(open(filename))[1]
    alist = readARFF.getAttrList(attributes)
    trainingData, testingData = createTrainAndTestData(data)
    #runEvaluation5times(data, attributes, None)
    #runZeroREvaluation5times(data, attributes, None)
    # defaultValue=None: unclassifiable examples are skipped downstream.
    calculateForMultiClass(trainingData, testingData, attributes, None)