def makeTree(dataset, attributes, defaultValue):
    """Recursively build an ID3 decision tree.

    dataset      -- list of instances; the last element of each is the class.
    attributes   -- dict mapping column index -> {attribute name: value list}.
    defaultValue -- class label to use when the dataset is empty.

    Returns a TreeNode: either a leaf (attribute None, value set) or an
    internal node keyed on the selected attribute, one child per value.
    """
    # No data left: fall back on the caller-supplied default class.
    if len(dataset) == 0:
        return TreeNode(None, defaultValue)
    # Entropy 0 means every instance has the same class -> pure leaf.
    entropyD = entropy([item[-1] for item in dataset])
    if entropyD == 0:
        return TreeNode(None, dataset[0][-1])
    # No attributes left to split on: answer with the majority class.
    if len(attributes) == 0:
        return TreeNode(None, readARFF.computeZeroR(attributes, dataset))
    copyAttr = copy.copy(attributes)
    # Majority class of this node's data: default for the recursive calls.
    dV = readARFF.computeZeroR(attributes, dataset)
    attrSpread = selectAttribute(dataset, attributes)  # column index
    vlist = attributes[attrSpread].values()[0]
    del copyAttr[attrSpread]
    node = TreeNode(attributes[attrSpread].keys()[0], None)
    for v in vlist:  # for each declared value of the chosen attribute
        # Instances whose value for the chosen attribute equals v.
        subDataset = [item for item in dataset if item[attrSpread] == v]
        if len(subDataset) == 0:
            # Bug fix: the original called computeZeroR(attributes, dV) --
            # passing a class label where a dataset is expected -- and then
            # unconditionally overwrote the leaf on the next line.  An empty
            # branch simply becomes a majority-class leaf.
            node.children[v] = TreeNode(None, dV)
        else:
            node.children[v] = makeTree(subDataset, copyAttr, dV)
    return node
def makeTree(dataset, alist, attributes, defaultValue):
    """Recursively build an ID3 decision tree over dataset.

    dataset      -- list of instances; last element of each is the class.
    alist        -- names of the attributes still available for splitting.
    attributes   -- full attribute table used to look up declared values.
    defaultValue -- class label used for an empty dataset.
    """
    # Base case: nothing to classify -- inherit the caller's default class.
    if len(dataset) == 0:
        leafNode = TreeNode(None, defaultValue)
        leafNode.children = {}
        return leafNode
    # Base case: no attributes remain -- answer with the majority class.
    if len(alist) == 0:
        leafNode = TreeNode(None, readARFF.computeZeroR(attributes, dataset))
        leafNode.children = {}
        return leafNode
    # Base case: zero entropy means every instance shares one class.
    if entropy([data[-1] for data in dataset]) == 0.0:
        leafNode = TreeNode(None, dataset[0][-1])
        leafNode.children = {}
        return leafNode
    # Recursive case: split on the highest-information-gain attribute.
    selectAttr = selectAttribute(dataset, alist)
    chosenName = alist[selectAttr]
    rootAttr = TreeNode(chosenName, None)
    rootAttr.children = {}
    # Scan the full attribute table for the chosen attribute's value list
    # (last matching entry wins, exactly as the original scan behaved).
    for tempIndex in range(len(attributes)):
        if attributes[tempIndex].keys() == [chosenName]:
            possibleValues = attributes[tempIndex][chosenName]
    # This node's majority class is the default for its children.
    defaultValue = readARFF.computeZeroR(attributes, dataset)
    remaining = alist[:selectAttr] + alist[selectAttr + 1:]
    for val in possibleValues:
        subSet = createSubSet(val, dataset, selectAttr)
        rootAttr.children[val] = makeTree(subSet, remaining, attributes,
                                          defaultValue)
    return rootAttr
def evalZeroR(trainDataset, testDataset, classification, attrs):
    """Evaluate the ZeroR (majority-class) baseline on the test set.

    trainDataset   -- instances used to pick the majority class.
    testDataset    -- instances to score; last element is the true class.
    classification -- dict whose single value is the list of class labels.
    attrs          -- attribute table forwarded to computeZeroR.

    Returns {class label: (precision, recall, accuracy)}; also draws a chart.
    """
    classification = classification.values()[0]
    evalResult = {}
    # ZeroR always predicts the training set's majority class.
    zeroR = readARFF.computeZeroR(attrs, trainDataset)
    for c in classification:
        TPCount = 0
        TNCount = 0
        FPCount = 0
        FNCount = 0
        if zeroR == c:
            # Every instance is predicted as c.
            for i in testDataset:
                if i[-1] == c:
                    TPCount += 1
                else:
                    FPCount += 1
        else:
            # No instance is ever predicted as c.
            for i in testDataset:
                if i[-1] == c:
                    # Bug fix: the original counted every non-majority
                    # instance as a false negative; only instances whose
                    # actual class is c are false negatives for c -- the
                    # rest are true negatives.
                    FNCount += 1
                else:
                    TNCount += 1
        p = computePrecision(TPCount, FPCount, TNCount, FNCount)
        r = computeRecall(TPCount, FPCount, TNCount, FNCount)
        a = computeAccuracy(TPCount, FPCount, TNCount, FNCount)
        evalResult[c] = (p, r, a)
    drawChart(evalResult)
    return evalResult
def makeTree(dataset, alist, attributes, defaultValue):
    """Recursively build an ID3 decision tree.

    dataset      -- list of instances; last element of each is the class.
    alist        -- attribute names still available for splitting
                    (positions parallel the remaining dataset columns).
    attributes   -- full list of {name: value list} dicts, used to look up
                    every declared value of the chosen attribute.
    defaultValue -- class label returned for an empty dataset.
    """
    # Empty dataset: fall back on the parent's majority class.
    if len(dataset) == 0:
        return TreeNode(None, defaultValue)
    # Zero entropy: every instance has the same class -> pure leaf.
    elif entropy([item[-1] for item in dataset]) == 0:
        return TreeNode(None, dataset[0][-1])
    # No attributes left: answer with the majority class.
    elif len(alist) == 0:
        return TreeNode(None, readARFF.computeZeroR(dataset))
    else:
        # Split on the attribute with the largest information gain.
        # (Commented-out debug prints from the original removed.)
        i = selectAttribute(dataset, alist)
        current_Treenode = TreeNode(alist[i], None)
        # Remember this node's majority class as the default for children.
        current_Treenode.defaultValue = readARFF.computeZeroR(dataset)
        # alist is a sub-list of the full table, so recover the original
        # column's entry to enumerate every declared attribute value.
        real_index = [
            j for j in range(len(attributes))
            if attributes[j].keys() == [alist[i]]
        ][0]
        del_attribute = attributes[real_index][alist[i]]
        for item in del_attribute:
            # Rows matching this value, with column i removed.
            sub_data = [
                data[:i] + data[i + 1:] for data in dataset
                if data[i] == item
            ]
            current_Treenode.children[item] = makeTree(
                sub_data, alist[:i] + alist[i + 1:], attributes,
                current_Treenode.defaultValue)
        return current_Treenode
def makeTree(dataset, attributes, defaultValue):
    """Recursively build an ID3 decision tree.

    Algorithm:
      * empty dataset         -> leaf holding defaultValue
      * no attributes left    -> leaf holding the majority class
      * dataset has entropy 0 -> leaf holding that single class
      * otherwise select the attribute with the largest information gain,
        partition the dataset by its values, and recurse on each part.

    dataset      -- list of instances; last element of each is the class.
    attributes   -- dict of column index -> {attribute name: value list}.
    defaultValue -- class label used when a branch receives no data.
    """
    node = TreeNode(None, None)
    if len(dataset) == 0:
        node.value = defaultValue
        return node
    if len(attributes) == 0:
        classification = readARFF.computeZeroR(attributes, dataset)
        node.value = classification
        return node
    if entropy([d[-1] for d in dataset]) == 0:
        node.value = dataset[0][-1]
        return node
    selectIndex = selectAttribute(dataset, attributes)
    key = attributes[selectIndex].keys()[0]
    values = attributes[selectIndex][key]
    node.attribute = key
    # Children must not re-split on this attribute, so delete it from a
    # copy; the caller's mapping is left untouched.
    next_attributes = attributes.copy()
    del next_attributes[selectIndex]
    # Majority class of this node's data.  Loop-invariant, so computed once
    # (the original recomputed it for every value of the attribute).
    defaultValue = readARFF.computeZeroR(attributes, dataset)
    for v in values:
        childrenList = [d for d in dataset if d[selectIndex] == v]
        node.children[v] = makeTree(childrenList, next_attributes,
                                    defaultValue)
    return node
def makeTree(dataSet, aList, attributes, defaultValue):
    """Recursively build an ID3 decision tree.

    dataSet      -- list of instances; last element of each is the class.
    aList        -- names of the attributes still available for splitting.
    attributes   -- full attribute table (index -> {name: value list}).
    defaultValue -- class label used when no data reaches a branch.
    """
    # Bug fix: guard the empty dataset before touching dataSet[0]; the
    # original went straight to entropy(...) and dataSet[0][-1], which can
    # raise IndexError on an empty branch.
    if len(dataSet) == 0:
        return TreeNode(None, defaultValue)
    if entropy([d[-1] for d in dataSet]) == 0:
        # All instances share one class -> pure leaf.
        return TreeNode(None, dataSet[0][-1])
    elif len(aList) == 0:
        # No attributes left to test -> default (majority) leaf.
        return TreeNode(None, defaultValue)
    else:
        # Mask out already-used attributes but keep list positions aligned
        # with the dataset columns.
        listAttributes = readARFF.getAttrList(attributes)
        for index, item in enumerate(listAttributes):
            if item not in aList:
                listAttributes[index] = None
        attribute = selectAttribute(dataSet, listAttributes)
        index = listAttributes.index(attribute)
        possibleValue = attributes[index][attribute]
        # Bug fix: the original mutated the caller's list via
        # aList.remove(attribute), so sibling subtrees -- and callers that
        # reuse the list across folds -- saw attributes disappear.  Build a
        # fresh reduced list instead.
        remaining = [a for a in aList if a != attribute]
        node = TreeNode(attribute, None)
        for value in possibleValue:
            subSet = [d for d in dataSet if d[index] == value]
            if len(subSet) == 0:
                # Empty branch: majority class of the parent's data.
                node.children[value] = TreeNode(
                    None, readARFF.computeZeroR(attributes, dataSet))
            else:
                node.children[value] = makeTree(
                    subSet, remaining, attributes,
                    readARFF.computeZeroR(attributes, subSet))
        return node
def evaluation(nfold, attrs, data): train_precision = [] train_recall = [] train_accuracy = [] test_precision = [] test_recall = [] test_accuracy = [] for k in range(nfold): random.seed() random.shuffle(data) traindata = data[:len(data) / 5 * 4] # print len(traindata) testdata = data[len(data) / 5 * 4:] # print len(testdata) # print data[:len(data)/10] attrslist = readARFF.getAttrList(attrs) root = makeTree(traindata, attrslist, attrs, readARFF.computeZeroR(data)) # print '####fold####',k # root.printTree() precision, recall, correct_num = calc_precision_recall_accuracy( root, attrs, testdata) test_precision.append(precision) test_recall.append(recall) test_accuracy.append(float(correct_num) / len(testdata)) precision_train, recall_train, correct_num_train = calc_precision_recall_accuracy( root, attrs, traindata) train_precision.append(precision_train) train_recall.append(recall_train) train_accuracy.append(float(correct_num_train) / len(traindata)) test_precision_average, test_recall_average, test_accuracy_average = calc_average( nfold, test_precision, test_recall, test_accuracy) train_precision_average, train_recall_average, train_accuracy_average = calc_average( nfold, train_precision, train_recall, train_accuracy) print '##### The performance of decision tree #####' print ' test_precision:' print_pr_re(test_precision_average) print ' test_recall:' print_pr_re(test_recall_average) print ' test_accuracy: %f%%' % (test_accuracy_average * 100) print print ' training_precision:' print_pr_re(train_precision_average) print ' training_recall:' print_pr_re(train_recall_average) print ' training_accuracy: %f%%' % (train_accuracy_average * 100)
def makeTree(dataset, alist, attributes, defaultValue) : # you write; See assignment & notes for description of algorithm
    """Recursively build an ID3 decision tree.

    dataset      -- list of instances; the last element of each is the class.
    alist        -- attribute names still available for splitting (parallel
                    to the remaining dataset columns).
    attributes   -- full list of {name: value list} dicts used to look up
                    the declared values of the chosen attribute.
    defaultValue -- class label returned for an empty dataset.
    """
    # if the dataset is empty
    if len(dataset) == 0:
        # print defaultValue
        return TreeNode(None, defaultValue)
    # if the dataset contains zero entropy, that is, all classes are the same.
    # that is, the entropy is zero
    elif entropy([item[-1] for item in dataset]) == 0:
        return TreeNode(None, dataset[0][-1])
    elif len(alist) == 0:
        # No attributes left: fall back on the majority class.
        return TreeNode(None, readARFF.computeZeroR(dataset))
    else:
        # Split on the attribute with the largest information gain.
        i = selectAttribute(dataset, alist)
        # print alist,alist[i], i
        # if alist[i] == 'age':
        # print dataset
        # print [item[-1] for item in dataset].count(dataset[0][-1]) , len(dataset)
        # print [item[-1] for item in dataset].count(dataset[0][-1]) == len(dataset)
        current_Treenode = TreeNode(alist[i], None)
        # This node's majority class becomes the default for its children.
        current_Treenode.defaultValue = readARFF.computeZeroR(dataset)
        # print i
        # print dataset
        # alist is a sub-list of the full table; recover the original column
        # index so every declared value of the attribute gets a branch.
        real_index = [j for j in range(len(attributes)) if attributes[j].keys() == [alist[i]]][0]
        # print dataset,attributes[real_index].keys()[0]
        del_attribute = attributes[real_index][alist[i]]
        for item in del_attribute:
            # Rows matching this value, with column i removed.
            sub_data = [data[:i]+data[i+1:] for data in dataset if data[i] == item]
            # print [data[i] for data in dataset],item
            # print [data[i] for data in dataset].count(item)
            # print data[i][0] == item
            # print 'sub_data', sub_data, item
            # print sub_data,item
            # for eachdata in sub_data:
            #     del eachdata[i]
            current_Treenode.children[item] = makeTree(sub_data, alist[:i]+alist[i+1:], attributes, current_Treenode.defaultValue)
        return current_Treenode
def evaluation(nfold , attrs, data):
    """Run nfold random 80/20 splits of data and print the averaged
    precision, recall and accuracy of the decision tree on both the
    held-out and the training portions.

    nfold -- number of evaluation rounds.
    attrs -- attribute table from readARFF.
    data  -- full instance list; shuffled in place each round.
    """
    train_precision = []
    train_recall = []
    train_accuracy = []
    test_precision = []
    test_recall = []
    test_accuracy = []
    for k in range(nfold):
        # Fresh shuffle per round; first 4/5 trains, the remainder tests.
        random.seed()
        random.shuffle(data)
        traindata = data[:len(data)/5*4]
        # print len(traindata)
        testdata = data[len(data)/5*4:]
        # print len(testdata)
        # print data[:len(data)/10]
        attrslist = readARFF.getAttrList(attrs)
        root = makeTree(traindata, attrslist, attrs, readARFF.computeZeroR(data))
        # print '####fold####',k
        # root.printTree()
        # Score the held-out fold.
        precision, recall, correct_num = calc_precision_recall_accuracy(root, attrs, testdata)
        test_precision.append(precision)
        test_recall.append(recall)
        test_accuracy.append(float(correct_num) / len(testdata))
        # Score the training fold (optimistic estimate).
        precision_train, recall_train, correct_num_train = calc_precision_recall_accuracy(root, attrs, traindata)
        train_precision.append(precision_train)
        train_recall.append(recall_train)
        train_accuracy.append(float(correct_num_train) / len(traindata))
    test_precision_average, test_recall_average, test_accuracy_average = calc_average(nfold, test_precision, test_recall, test_accuracy)
    train_precision_average, train_recall_average, train_accuracy_average = calc_average(nfold, train_precision, train_recall, train_accuracy)
    print '##### The performance of decision tree #####'
    print ' test_precision:'
    print_pr_re(test_precision_average)
    print ' test_recall:'
    print_pr_re(test_recall_average)
    print ' test_accuracy: %f%%' % (test_accuracy_average*100)
    print
    print ' training_precision:'
    print_pr_re(train_precision_average)
    print ' training_recall:'
    print_pr_re(train_recall_average)
    print ' training_accuracy: %f%%' % (train_accuracy_average*100)
def calc_precision_recall_accuracy_zeroR(testdata):
    """Score the ZeroR baseline (always predict the majority class).

    testdata -- list of instances; the last element of each is the class.

    Returns (precision, recall, correct_num): precision and recall map each
    class seen in testdata to its score (0 when the class is never predicted
    correctly); correct_num is the total number of correct predictions.

    NOTE(review): the majority class is computed from testdata itself, not
    from a training split -- confirm that is intended.
    """
    actual_class = {}
    predicted_class = {}
    correct_class = {}
    majority = readARFF.computeZeroR(testdata)
    for row in testdata:
        prediction = majority  # ZeroR: same prediction for every instance
        # Idiomatic dict.get counters replace the original's
        # membership-test-then-index increments.
        actual_class[row[-1]] = actual_class.get(row[-1], 0) + 1
        predicted_class[prediction] = predicted_class.get(prediction, 0) + 1
        if prediction == row[-1]:
            correct_class[row[-1]] = correct_class.get(row[-1], 0) + 1
    correct_num = 0
    precision = {}
    recall = {}
    for cls in correct_class:
        recall[cls] = float(correct_class[cls]) / actual_class[cls]
        precision[cls] = float(correct_class[cls]) / predicted_class[cls]
        correct_num += correct_class[cls]
    # Classes never predicted correctly score 0 for both metrics.
    for cls in actual_class:
        if cls not in precision:
            precision[cls] = 0
        if cls not in recall:
            recall[cls] = 0
    return precision, recall, correct_num
def evaluationWithZeroR(trainingData, testingData, attributes, defaultValue):
    """Evaluate the ZeroR baseline; return (precision, recall, accuracy).

    trainingData -- unused by the current body (kept for interface parity
                    with the tree-based evaluator).
    testingData  -- instances to score; last element is the true class.
    attributes   -- attribute table forwarded to computeZeroR.
    defaultValue -- instances are skipped when the prediction equals this.

    NOTE(review): the majority class is computed from testingData rather
    than trainingData, and when that majority equals defaultValue every
    instance is skipped -- both look suspicious; confirm against the
    assignment before relying on the numbers.
    """
    FP = 0
    FN = 0
    TP = 0
    TN = 0
    # ZeroR predicts the same (majority) class for every instance.
    result = readARFF.computeZeroR(attributes, testingData)
    for item in testingData:
        classValue = item[-1]
        if result == defaultValue:
            continue
        elif result == classValue:
            if isPositive(result):
                TP += 1
            else:
                TN += 1
        else:
            if isPositive(result):
                FP += 1
            else:
                FN += 1
    # Bug fix: the original left accuracy at 0.0 whenever TP == 0 (even
    # with true negatives present) and carried an unreachable
    # "elif (TP + TN) == 0" branch (TP != 0 implies TP + TN > 0) that would
    # have returned unbound precision/recall.
    if TP == 0:
        precision = 0
        recall = 0
    else:
        precision = float(TP) / float(TP + FP)
        recall = float(TP) / float(TP + FN)
    total = TP + FP + FN + TN
    if total == 0:
        accuracy = 0.0
    else:
        accuracy = float(TP + TN) / float(total)
    return precision, recall, accuracy
trainSample = random.sample(index,int(len(data)*0.8)) testSample = [i for i in index if i not in trainSample] trainDataset = [data[i] for i in trainSample] testDataset = [data[i] for i in testSample] print "\nUsing ZeroR:" rz = evalZeroR(trainDataset,testDataset,classification,attrs) for k in rz: if k in resultZeroR: resultZeroR[k] += rz[k] else: resultZeroR[k] = rz[k] alist = [i.keys()[0] for i in attrs.values()] defaultValue = readARFF.computeZeroR(attrs,trainDataset) root = makeTree(trainDataset,attrs,defaultValue) print "\nTest Set: " r1 = evaluate(root,testDataset,alist, classification) for k in r1: if k in resultTest: resultTest[k] += r1[k] else: resultTest[k] = r1[k] print "\nTraining Set:" r2 = evaluate(root,trainDataset,alist, classification) for k in r1: if k in resultTrain: resultTrain[k] += r2[k] else: resultTrain[k] = r2[k]
if len(root.children) != 0: for k in root.children: child = root.children[k] printNode(child) if __name__ == '__main__': fileName = sys.argv[-1] attributes, data = readARFF.readArff(open(fileName)) listAttributes = readARFF.getAttrList(attributes) times = 5 total = zero = 0 precision = recall = precisionZero = recallZero = 0 for i in range(times): trainData = random.sample(data, int(len(data) * 0.8)) defaultValue = readARFF.computeZeroR(attributes, data) zeroRValue = readARFF.computeZeroR(attributes, trainData) root = makeTree(trainData, listAttributes, attributes, defaultValue) #printNode(root) TP = tp = 0 testData = [] for d in data: if d not in trainData: testData.append(d) for d in testData: value = root.classify(d, attributes) if value == d[-1]: TP += 1 if zeroRValue == d[-1]: tp += 1 accuracy = float(TP) / len(testData)