    print(max_attr)
    for value, branch_data in branches_dict.items():
        branches_nodes[value] = buildtree(branch_data, _remaining_attr, level + 1)
    return dtree.TreeNode(max_attr, branches_nodes,
                          dtree.TreeLeaf(dtree.mostCommon(dataset)))

# t = buildtree(m.monk1, m.attributes, 0)
# drawtree_qt5.drawTree(t)

t = dtree.buildTree(m.monk2, m.attributes)
drawtree_qt5.drawTree(t)

# Part 5:
# for name, dset in datasets.items():
#     t = dtree.buildTree(dset, m.attributes)
#     print(name, "train", round(1 - dtree.check(t, dset), 6), end=" ")
#     print(name, "test", round(1 - dtree.check(t, testsets[name]), 6))
# exit()

## PART 6
import random

def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    break_point = int(len(ldata) * fraction)
    return ldata[:break_point], ldata[break_point:]
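# A minimal usage sketch for the partition helper above, assuming the same
# dtree API used earlier in this file (buildTree, check); the 0.6 fraction
# is an arbitrary example value, not prescribed by the lab code here.
monk2train, monk2val = partition(m.monk2, 0.6)
t = dtree.buildTree(monk2train, m.attributes)
print("validation accuracy:", dtree.check(t, monk2val))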
inf_gain.append(asdf)
del asdf

print("Information Gain")
pprint.table(inf_gain)
print("\n")

# 4. Find the attribute that gives the max information gain.
def find_max_gain(gain_matrix):
    best_gain = 0.0
    out = None
    for monk, row in enumerate(gain_matrix):
        for attr, gain in enumerate(row):
            if gain > best_gain:
                best_gain = gain
                out = [monk + 1, attr + 1]
    if out is None:
        return [None, None, None]
    out.append(best_gain)
    return out

max_gain = find_max_gain(inf_gain)
print("Max gain is at [monk, attr, value] = " + str(max_gain))

t1 = dt.buildTree(m.monk1, m.attributes)
print(1 - dt.check(t1, m.monk1test))  # 0.8287037037037037
print(1 - dt.check(t1, m.monk1))      # 1.0
drawT.drawTree(t1)
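# An equivalent, more idiomatic search sketch using the built-in max()
# over flattened (monk, attr, gain) triples from the same inf_gain matrix:
flat = [(monk + 1, attr + 1, gain)
        for monk, row in enumerate(inf_gain)
        for attr, gain in enumerate(row)]
monk_i, attr_i, gain = max(flat, key=lambda t: t[2])
print("Max gain is at [monk, attr, value] = " + str([monk_i, attr_i, gain]))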
print('train set error: ' + str(round((1 - d.check(t2, m.monk2)) * 100, 2)) + '%')
print('test set error: ' + str(round((1 - d.check(t2, m.monk2test)) * 100, 2)) + '%')
print('')

print('decision tree of monk3:')
print(t3)
print('train set error: ' + str(round((1 - d.check(t3, m.monk3)) * 100, 2)) + '%')
print('test set error: ' + str(round((1 - d.check(t3, m.monk3test)) * 100, 2)) + '%')
print('')

# dqt.drawTree(t1)
# dqt.drawTree(t2)
dqt.drawTree(t3)

## Assignment 7: reduced error pruning
fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
err_test1, var_test1 = d.errReducedPruned(m.monk1, m.monk1test, m.attributes, fraction, 500)
err_test3, var_test3 = d.errReducedPruned(m.monk3, m.monk3test, m.attributes, fraction, 500)

# plot results
plt.figure(4)
plt.subplot(2, 1, 1)
plt.title('Mean Error vs. Fraction - 500 runs')
plt.xlabel('Fraction')
plt.ylabel('Mean Error')
line1, = plt.plot(fraction, err_test1, 'bo-', label="monk 1")
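# d.errReducedPruned is a helper added in this author's copy of dtree, not
# part of the stock module. A sketch of one plausible implementation,
# assuming it repeats partition -> buildTree -> greedy reduced-error
# pruning `runs` times per fraction and returns the per-fraction mean test
# error and variance; err_reduced_pruned is a hypothetical name.
import random
import statistics

def err_reduced_pruned(trainset, testset, attributes, fractions, runs):
    means, variances = [], []
    for f in fractions:
        errors = []
        for _ in range(runs):
            ldata = list(trainset)
            random.shuffle(ldata)
            cut = int(len(ldata) * f)
            train, val = ldata[:cut], ldata[cut:]
            tree = d.buildTree(train, attributes)
            # Greedily accept the best pruned candidate as long as
            # validation accuracy does not decrease.
            while True:
                candidates = d.allPruned(tree)
                if not candidates:
                    break
                best = max(candidates, key=lambda t: d.check(t, val))
                if d.check(best, val) >= d.check(tree, val):
                    tree = best
                else:
                    break
            errors.append(1 - d.check(tree, testset))
        means.append(statistics.mean(errors))
        variances.append(statistics.variance(errors))
    return means, variances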
BestAttribute = dtree.bestAttribute(monk3, m.attributes)
print(BestAttribute)

# monk1_A5_1 = dtree.select(monk1, m.attributes[5], 1)
# print(monk1_A5_1)

monk1_tree = dtree.buildTree(monk1, m.attributes)
# graf1 = dt.drawTree(monk1_tree)
monk2_tree = dtree.buildTree(monk2, m.attributes)
# graf2 = dt.drawTree(monk2_tree)
monk3_tree = dtree.buildTree(monk3, m.attributes)
# graf3 = dt.drawTree(monk3_tree)

print(dtree.check(monk1_tree, m.monk1))
print(dtree.check(monk1_tree, m.monk1test))
print(dtree.check(monk2_tree, m.monk2test))
print(dtree.check(monk3_tree, m.monk3test))

def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]

print(dtree.allPruned(monk1_tree))
dt.drawTree(dtree.allPruned(monk1_tree)[1])
# FOR MONK-1
t1 = d.buildTree(m.monk1, m.attributes)
print("The error in the training set is {} and the error in the testing set is {} for MONK-1".format(
    1 - d.check(t1, m.monk1), 1 - d.check(t1, m.monk1test)))

# FOR MONK-2
t2 = d.buildTree(m.monk2, m.attributes)
print("The error in the training set is {} and the error in the testing set is {} for MONK-2".format(
    1 - d.check(t2, m.monk2), 1 - d.check(t2, m.monk2test)))

# FOR MONK-3
t3 = d.buildTree(m.monk3, m.attributes)
print("The error in the training set is {} and the error in the testing set is {} for MONK-3".format(
    1 - d.check(t3, m.monk3), 1 - d.check(t3, m.monk3test)))

# TREE DRAWING
d5.drawTree(t1)
d5.drawTree(t2)
d5.drawTree(t3)

# PRUNING
import random
import monkdata as m
import dtree as d
from dtree import allPruned

def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]
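# A minimal usage sketch for allPruned together with the partition helper
# above: build on the training part, then keep whichever single-step
# pruned candidate scores best on the validation part. The 0.6 fraction
# is an arbitrary example value, not prescribed by the lab code here.
monk1train, monk1val = partition(m.monk1, 0.6)
tree = d.buildTree(monk1train, m.attributes)
best_pruned = max(allPruned(tree), key=lambda t: d.check(t, monk1val))
print(d.check(best_pruned, monk1val))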
for dataset in datasets:
    print("Dataset: ", dataset['name'])
    decisionTree = None
    for maximumDepth in [2, 5, 10000]:
        decisionTree = dtree.buildTree(dataset['ref'], m.attributes, maximumDepth)
        print("Depth", maximumDepth)
        print("  Check Training Set: ", dtree.check(decisionTree, dataset['ref']))
        print("  Check Test Set: ", dtree.check(decisionTree, dataset['test']))
        print("  Decision Tree:\n  ", decisionTree)
    # uncomment if you want to see the tree, or move into the loop if you
    # want to see trees at all levels
    drawLib.drawTree(decisionTree)

# Below is an attempt at a naive implementation of dtree.buildTree
# using mostCommon, select and bestAttribute:
# - get the best attribute in terms of information gain
# - split the dataset based on the values of the selected attribute;
#   we will have as many branches as the attribute has values.
# A sketch of one possible version of this recursion follows below.
# steps = []
# step = 0
# maxDepth = 2
# def split(subset, attribute, value):
#     print("Step: ", step)
#     print("Attribute: ", attribute)
#     print("Value: ", value)
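# A sketch of the naive buildTree recursion described above, assuming the
# stock dtree API (bestAttribute, select, mostCommon) and the
# TreeNode/TreeLeaf construction pattern used elsewhere in this
# collection; naive_build_tree is a hypothetical name, not part of dtree.
def naive_build_tree(dataset, attributes, max_depth=2):
    # Stop splitting: fall back to the majority class.
    if max_depth == 0 or not attributes:
        return dtree.TreeLeaf(dtree.mostCommon(dataset))
    best = dtree.bestAttribute(dataset, attributes)
    remaining = [a for a in attributes if a != best]
    branches = {}
    for value in best.values:
        subset = dtree.select(dataset, best, value)
        if subset:
            branches[value] = naive_build_tree(subset, remaining, max_depth - 1)
        else:
            # Empty branch: predict the parent's majority class.
            branches[value] = dtree.TreeLeaf(dtree.mostCommon(dataset))
    return dtree.TreeNode(best, branches, dtree.TreeLeaf(dtree.mostCommon(dataset)))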
perf, dec_tree = prune(dec_tree, val_data)

# Evaluate performance
# On the test dataset
test_perf = misClasRate(dec_tree, train_data)
# On the training dataset
train_perf = misClasRate(dec_tree, dataset[k])

print("Dataset %d\n\tTest misc rate: %f\n\tTrain misc rate: %f"
      % (k + 1, test_perf, train_perf))

# Plot the tree
if k == 2:
    drawTree(dec_tree)

'''
Accuracy
Dataset 1
    Test perf:  0.828704
    Train perf: 1.000000
Dataset 2
    Test perf:  0.692130
    Train perf: 1.000000
Dataset 3
    Test perf:  0.944444
    Train perf: 1.000000
'''

'''
Misclassification rate
monk1train, monk1val = partition(m.monk3, fraction)
print(d.buildTree(m.monk3, m.attributes))

def prunecont(tree, maxp):
    alt = d.allPruned(tree)
    # Nothing left to prune: keep the current tree.
    if not alt:
        return tree
    prunecheck = [0.0] * len(alt)
    maxprune = 0
    indx = 999
    for x in range(len(alt)):
        prunecheck[x] = d.check(alt[x], monk1val)
        if prunecheck[x] >= maxprune:
            maxprune = prunecheck[x]
            indx = x + 1
    if maxprune >= maxp:
        print(maxprune)
        plt.figure()
        plt.plot(range(1, len(alt) + 1), prunecheck)
        plt.title('MONK-1')
        plt.xlabel('Pruning Alternatives')
        plt.ylabel('Pruning Accuracy')
        plt.show()
        return prunecont(alt[indx - 1], maxprune)
    else:
        return tree

finaltree = prunecont(d.buildTree(monk1train, m.attributes), 0)
print(finaltree)
dt.drawTree(finaltree)
import monkdata as m
import dtree as d
import drawtree_qt5 as dt

dt.drawTree(d.buildTree(m.monk3, m.attributes))
# print("Next level information gains:") # for i in range(6): # gain = dt.averageGain(monk, m.attributes[i]) # print("A" + str(i+1) + ": " + str(gain)) print("") best_atribute = dt.bestAttribute(m.monk1, m.attributes) for value in best_atribute.values: subset = dt.select(m.monk1, best_atribute, value) entropy = dt.entropy(subset) print("Attribute value:" + str(value)) for i in range(6): gain = dt.averageGain(subset, m.attributes[i]) print("A" + str(i+1) + ": " + str(gain)) print("") # Assignment 5 best_atribute = dt.bestAttribute(m.monk1, m.attributes) for value in best_atribute.values: subset = dt.select(m.monk1, best_atribute, value) best_atribute2 = dt.bestAttribute(subset, m.attributes) print(str(best_atribute) + " = " + str(value)) for value2 in best_atribute2.values: subset2 = dt.select(subset, best_atribute2, value2) common = dt.mostCommon(subset2) print(" " + str(best_atribute2) + "=" + str(value2) + ": " + str(common)) tree = dt.buildTree(monk1, m.attributes, 2) draw.drawTree(tree)