def calcNextTreeLevel():
    """Split MONK-1 on a5, record each branch's majority class, then build
    and draw a full decision tree."""
    selectedAttribute = m.attributes[4]  # a5 — the first-split attribute
    # One subset of monk1 per value of a5.
    s1 = dtree.select(m.monk1, selectedAttribute, 1)
    s2 = dtree.select(m.monk1, selectedAttribute, 2)
    s3 = dtree.select(m.monk1, selectedAttribute, 3)
    s4 = dtree.select(m.monk1, selectedAttribute, 4)
    # Calculate information gain of subsets
    #ASSIGNMENT3(s1)
    #ASSIGNMENT3(s2)
    #ASSIGNMENT3(s3)
    #ASSIGNMENT3(s4)
    # Majority class of each a5-branch (computed but only used via the
    # commented-out prints below).
    mc1 = dtree.mostCommon(s1)
    mc2 = dtree.mostCommon(s2)
    mc3 = dtree.mostCommon(s3)
    mc4 = dtree.mostCommon(s4)
    #print(mc1)
    #print(mc2)
    #print(mc3)
    #print(mc4)
    # NOTE(review): the tree is built from m.monk2test although everything
    # above uses m.monk1 — confirm this dataset choice is intentional.
    tree = dtree.buildTree(m.monk2test, m.attributes)
    print(tree)
    draw.drawTree(tree)
def caspersky(dataset):
    """Hand-build a depth-2 decision tree over *dataset* and draw it."""
    print("Assignment 3")
    best = d.bestAttribute(dataset, m.attributes)
    top_branches = []
    for val in best.values:
        subset = d.select(dataset, best, val)
        outcome = d.mostCommon(subset)
        if outcome == True:
            # NOTE(review): the leaf stores the subset itself, not a class —
            # preserved as-is from the original.
            top_branches.append((val, d.TreeLeaf(subset)))
            continue
        # Impure branch: split once more on its locally best attribute.
        second = d.bestAttribute(subset, m.attributes)
        leaves = [(v2, d.TreeLeaf(d.mostCommon(d.select(subset, second, v2))))
                  for v2 in second.values]
        top_branches.append((val, d.TreeNode(second, dict(leaves), outcome)))
    drawtree.drawTree(d.TreeNode(best, dict(top_branches), d.mostCommon(dataset)))
def buildtree(dataset, remaining_attr, level):
    """Recursively build a decision tree limited to two levels.

    dataset        -- training samples for this node
    remaining_attr -- attributes still available for splitting
    level          -- current depth; recursion stops when it reaches 2
    """
    # Depth limit reached: answer with the majority class.
    if level == 2:
        return dtree.TreeLeaf(dtree.mostCommon(dataset))
    # Attribute with the highest information gain among those remaining.
    max_attr, _ = getMaxGain(dataset, remaining_attr)
    # One data subset per value of the chosen attribute.
    branches_dict = dict([(value, dtree.select(dataset, max_attr, value)) for value in max_attr.values])
    # The chosen attribute may not be reused deeper in the tree.
    _remaining_attr = [a for a in remaining_attr if a != max_attr]
    branches_nodes = {}
    print(max_attr)  # trace of the split chosen at this node
    for value, branch_data in branches_dict.items():
        branches_nodes[value] = buildtree(branch_data, _remaining_attr, level + 1)
    # NOTE(review): the default passed to TreeNode is a TreeLeaf, not a bare
    # class label — confirm that dtree.TreeNode expects a leaf here.
    return dtree.TreeNode(max_attr, branches_nodes, dtree.TreeLeaf(dtree.mostCommon(dataset)))
def find_splits(datasets, attributes, depth):
    """Recursively split every dataset on its best attribute; at depth 0,
    print the majority class of each branch."""
    print(len(datasets))
    if depth != 0:
        # Keep splitting each subset on its locally optimal attribute.
        for subset in datasets:
            best = optimal_attr_split(subset, attributes)
            children = get_data_subsets(subset, attributes, best)
            find_splits(children, attributes, depth - 1)
        return
    # Bottom of the recursion: report the majority class per branch.
    for idx, subset in enumerate(datasets):
        print("Class for attribute value " + str(idx + 1) + ": " + str(dtree.mostCommon(subset)))
def buildTreeCustom(dataset, depth):
    """Print a compact textual decision tree for *dataset*, expanding impure
    branches for at most *depth* further levels ('+'/'-' marks a leaf)."""
    if depth <= 0:
        # Depth budget spent: emit the majority class as a leaf.
        print('+' if dt.mostCommon(dataset) else '-', end='')
        return
    bestAttr = dt.bestAttribute(dataset, m.attributes)
    print(str(bestAttr), end='')
    # Walk each value-branch of the best attribute in order.
    for value in bestAttr.values:
        branch = dt.select(dataset, bestAttr, value)
        if dt.entropy(branch) > 0:
            # Impure branch: recurse with one less level available.
            buildTreeCustom(branch, depth - 1)
        else:
            # Pure branch: print its class directly.
            print('+' if dt.mostCommon(branch) else '-', end='')
def makeTree(set, level, attributes):
    """Recursively grow a decision tree from *set* down to the module-level
    `depth` limit, using only the given attributes."""
    # NOTE(review): `depth` is a free (module-level) name — confirm it is
    # defined before this function is called.
    if level >= depth:
        return dtree.TreeLeaf(dtree.mostCommon(set))
    attr = dtree.bestAttribute(set, attributes)
    branches = []
    for val in attr.values:
        subset = dtree.select(set, attr, val)
        if dtree.allPositive(subset):
            child = dtree.TreeLeaf(True)
        elif dtree.allNegative(subset):
            child = dtree.TreeLeaf(False)
        else:
            # Impure branch: recurse without the attribute just used.
            remaining = [a for a in attributes if a != attr]
            child = makeTree(subset, level + 1, remaining)
        branches.append((val, child))
    return dtree.TreeNode(attr, dict(branches), dtree.mostCommon(set))
def getLeaves(dataSet, a1, a2):
    """Print the majority class of every (value-of-a1, value-of-a2) cell of
    *dataSet*; a1 and a2 are attribute indices into m.attributes."""
    for k in m.attributes[a1].values:
        outer = dtree.select(dataSet, m.attributes[a1], k)
        for l in m.attributes[a2].values:
            cell = dtree.select(outer, m.attributes[a2], l)
            majority = dtree.mostCommon(cell)
            print("For " + str(k) + ":" + str(l) + ", " + "most common = " + str(majority))
def PRINT_TREE_AT_LEVEL_2():
    """Print the first two levels of a MONK-1 decision tree rooted at a5,
    then build and draw the full tree."""
    # A5
    print(" ")
    print("LEVEL 1:")
    print(m.attributes[4])
    # One subset of monk1 per value of a5 (a5 has 4 values).
    Att = [None] * 4
    for value in range(1, 5):
        Att[value - 1] = select(m.monk1, m.attributes[4], value)
    print("LEVEL 2:")
    for A in Att:
        tmp = bestAttribute(A, m.attributes)
        print(tmp)
        # The original enumerated every attribute with hard-coded value
        # ranges that duplicated each attribute's domain; iterating
        # tmp.values produces exactly the same output and stays correct if
        # an attribute's domain ever changes.
        for value in tmp.values:
            print(mostCommon(select(A, tmp, value)))
    print(" ")
    t = buildTree(m.monk1, m.attributes)
    drawTree(t)
def split(node):
    """Split MONK-1 on attribute *node* into the value==1 branch and the
    union of the remaining value branches, then print each side's majority
    class and the best second-level information gain."""
    #splitting
    sub_set_A5_value_1_m1 = d.select(m.monk1, node, 1)
    # NOTE(review): selecting value 5 assumes the attribute has 5 values;
    # for a5 (4 values) that last select would be empty — confirm intent.
    sub_set_A5_value_not_1_m1 = d.select(m.monk1, node, 2) + d.select(
        m.monk1, node, 3) + d.select(m.monk1, node, 4) + d.select(
            m.monk1, node, 5)
    #calculating gain to figure out which attribute to use in each of the next nodes
    information_gain_left = find_information_gain(sub_set_A5_value_1_m1,
                                                  m.attributes)
    information_gain_right = find_information_gain(sub_set_A5_value_not_1_m1,
                                                   m.attributes)
    # Best gain achievable in either branch.
    information_gain = max(max(information_gain_left),
                           max(information_gain_right))
    #classifying the most common result in each sub tree
    majority_class_left = d.mostCommon(sub_set_A5_value_1_m1)
    majority_class_right = d.mostCommon(sub_set_A5_value_not_1_m1)
    print('left: ', majority_class_left)
    print('right: ', majority_class_right)
    print('information gain: ', information_gain)
def Tree(dataset, attributes, maxdepth=3):
    """Build a decision tree of at most *maxdepth* levels over *dataset*,
    splitting on the attribute with the best information gain."""

    def grow(subset, fallback, remaining):
        # Empty branch falls back to the parent's majority class.
        if not subset:
            return dtree.TreeLeaf(fallback)
        # Pure branches terminate immediately.
        if dtree.allPositive(subset):
            return dtree.TreeLeaf(True)
        if dtree.allNegative(subset):
            return dtree.TreeLeaf(False)
        return Tree(subset, remaining, maxdepth - 1)

    majority = dtree.mostCommon(dataset)
    if maxdepth < 1:
        return dtree.TreeLeaf(majority)
    best = dtree.bestAttribute(dataset, attributes)
    remaining = [x for x in attributes if x != best]
    children = {}
    for v in best.values:
        children[v] = grow(dtree.select(dataset, best, v), majority, remaining)
    return dtree.TreeNode(best, children, majority)
def buildTreeRec(dataset, attributes, depthtodo):
    """Recursively build a decision tree, spending at most *depthtodo*
    additional split levels."""
    majority = d.mostCommon(dataset)
    # Guard clauses: pure node, or depth budget exhausted.
    if d.allPositive(dataset):
        return d.TreeLeaf(True)
    if d.allNegative(dataset):
        return d.TreeLeaf(False)
    if depthtodo <= 0:
        return d.TreeLeaf(majority)
    # Pick the attribute index with the highest gain.
    gain_tuples = calculateGainTuplesForAllAttributes(dataset, attributes)
    _, best_idx = getTupleWithMaxGainValue(gain_tuples)
    best = attributes[best_idx]
    # One child per attribute value, each built with one less level to spend.
    children = {
        value: buildTreeRec(d.select(dataset, best, value), attributes, depthtodo - 1)
        for value in best.values
    }
    return d.TreeNode(best, children, majority)
def calc_next_level():
    """Print the per-branch average-gain table for each a5 value of MONK-1,
    assemble a textual depth-2 tree, and compare it with dt.buildTree.
    (Python 2 print statements.)"""
    #print "\nAverage gain when a5 is choosen"
    print "\nA5\t a1\t\t a2\t\t a3\t\t a4\t\t a5\t\t a6"
    s = "A5("  # textual rendering of the hand-built tree
    for val in data.attributes[4].values:
        subset = dt.select(data.monk1, data.attributes[4], val)
        # One row of gains per a5-branch.
        t = "\t"
        for attr in data.attributes:
            t = t + "%.6f\t" % (dt.averageGain(subset, attr))
        print val , t
        # Best second-level attribute for this branch.
        best = dt.bestAttribute(subset, data.attributes)
        s = s + best.name + "("
        #print "best attribute: ", best.name
        for value in best.values:
            #print "choose: ", value, "mostCommon: ", dt.mostCommon(dt.select(subset, best, value))
            # Leaf symbol from the majority class of the grand-child subset.
            if(dt.mostCommon(dt.select(subset, best, value))):
                s = s + "+"
            else:
                s = s + "-"
        s = s + ")"
    s = s + ")"
    print "\nOur tree:\t", s
    print "Build tree:\t", dt.buildTree(data.monk1, data.attributes, 2)
print("subset a5_4 IG") for i in range(6): print(d.averageGain(a5_4, m.attributes[i])) a5_2_a4_1 = d.select(a5_2, m.attributes[3], 1) a5_2_a4_2 = d.select(a5_2, m.attributes[3], 2) a5_2_a4_3 = d.select(a5_2, m.attributes[3], 3) a5_3_a6_1 = d.select(a5_3, m.attributes[5], 1) a5_3_a6_2 = d.select(a5_3, m.attributes[5], 2) a5_4_a1_1 = d.select(a5_4, m.attributes[0], 1) a5_4_a1_2 = d.select(a5_4, m.attributes[0], 2) a5_4_a1_3 = d.select(a5_4, m.attributes[0], 3) print(d.mostCommon(a5_1)) print(d.mostCommon(a5_2)) print(d.mostCommon(a5_3)) print(d.mostCommon(a5_4)) print(" ") print(d.mostCommon(a5_2_a4_1)) print(d.mostCommon(a5_2_a4_2)) print(d.mostCommon(a5_2_a4_3)) print(" ") print(d.mostCommon(a5_3_a6_1)) print(d.mostCommon(a5_3_a6_2)) print(" ") print(d.mostCommon(a5_4_a1_1)) print(d.mostCommon(a5_4_a1_2)) print(d.mostCommon(a5_4_a1_3))
print("3", dt.averageGain(list2, m.attributes[3])) print("5", dt.averageGain(list2, m.attributes[5])) print("list3") print("0", dt.averageGain(list3, m.attributes[0])) print("1", dt.averageGain(list3, m.attributes[1])) print("2", dt.averageGain(list3, m.attributes[2])) print("3", dt.averageGain(list3, m.attributes[3])) print("5", dt.averageGain(list3, m.attributes[5])) print("list4") print("0", dt.averageGain(list4, m.attributes[0])) print("1", dt.averageGain(list4, m.attributes[1])) print("2", dt.averageGain(list4, m.attributes[2])) print("3", dt.averageGain(list4, m.attributes[3])) print("5", dt.averageGain(list4, m.attributes[5])) common = dt.mostCommon(dt.select(list4, m.attributes[0], 1)) common2 = dt.mostCommon(dt.select(list4, m.attributes[0], 2)) common3 = dt.mostCommon(dt.select(list4, m.attributes[0], 3)) print("attribute val 1 is mostly ", common) print("attribute val 2 is mostly ", common2) print("attribute val 3 is mostly ", common3) t1 = dt.buildTree(dataset=m.monk1, attributes=m.attributes) #t1 = dt.buildTree(dataset=m.monk1, attributes=m.attributes, maxdepth=5) #draw.drawTree(t1) t1 = dt.buildTree(dataset=m.monk1, attributes=m.attributes) t2 = dt.buildTree(dataset=m.monk2, attributes=m.attributes) t3 = dt.buildTree(dataset=m.monk3, attributes=m.attributes)
def main():
    """Run all lab assignments: entropies, information gains, second-level
    splits, full-tree errors, and pruning statistics with plots."""
    # Assignement 1
    print("Assignement 1")
    monks = [monkdata.monk1, monkdata.monk2, monkdata.monk3]
    monk_tests = [monkdata.monk1test, monkdata.monk2test, monkdata.monk3test]
    entropies = [dtree.entropy(monk) for monk in monks]
    print("*** Monk1 entropy: ", entropies[0])
    print("*** Monk2 entropy: ", entropies[1])
    print("*** Monk3 entropy: ", entropies[2])
    # Assignement 3
    print(" ")
    print("Assignement 3")
    attributes = monkdata.attributes
    info_gain1 = info_gain(monks[0], attributes)
    info_gain2 = info_gain(monks[1], attributes)
    info_gain3 = info_gain(monks[2], attributes)
    print("*** Monk1 information gain for attribute:",
          ['%.5f' % x for x in info_gain1])
    print("*** Monk2 information gain for attribute:",
          ['%.5f' % x for x in info_gain2])
    print("*** Monk3 information gain for attribute:",
          ['%.5f' % x for x in info_gain3])
    # Assignement 5
    print("")
    print("Assignement 5")
    print("*** Attribute:", np.argmax(info_gain1) + 1,
          "maximizes info gain for MONK1 dataset")
    print("*** Attribute:", np.argmax(info_gain2) + 1,
          "maximizes info gain for MONK2 dataset")
    print("*** Attribute:", np.argmax(info_gain3) + 1,
          "maximizes info gain for MONK3 dataset")
    print("***")
    max0 = np.argmax(info_gain1)  # attribute of first split
    attributes_left = [
        attrib for attrib in attributes if attrib != attributes[max0]
    ]
    print("*** 1) Attributes the next nodes should be tested on: ",
          attributes_left)
    # Attributes to split on in second step
    splits = [
        np.argmax(
            info_gain(dtree.select(monks[0], attributes[max0], value),
                      attributes)) + 1 for value in attributes[max0].values
    ]
    print("*** 2) Second split is on the attriburtes: ", splits)
    # Decision after second split
    # NOTE(review): each `split` is an attribute *index* from `splits`, yet it
    # is passed to dtree.select as a *value* of attributes[max0] — looks
    # suspicious, confirm intent.
    subsets = [
        dtree.select(monks[0], attributes[max0], split) for split in splits
    ]
    print("*** 3) Assignement after second split: ",
          [dtree.mostCommon(subset) for subset in subsets])
    print("***")
    print("*** Train and test set errors")
    t1 = dtree.buildTree(monkdata.monk1, monkdata.attributes)
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monkdata.monk1),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    t2 = dtree.buildTree(monkdata.monk2, monkdata.attributes)
    print("*** Monk2:", "Etrain=", 1 - dtree.check(t2, monkdata.monk2),
          " Etest=", 1 - dtree.check(t2, monkdata.monk2test))
    t3 = dtree.buildTree(monkdata.monk3, monkdata.attributes)
    print("*** Monk3:", "Etrain=", 1 - dtree.check(t3, monkdata.monk3),
          " Etest=", 1 - dtree.check(t3, monkdata.monk3test))
    import drawtree_qt5
    #print(t1) # tree in text form(weird)
    #drawtree_qt5.drawTree(t1) # uncoment to visualize the decision tree
    # Assignement 7
    print("")
    print("Assignement 7")
    # The prunning for the exanple of monk1
    monk1train, monk1val = partition(monkdata.monk1, 0.9)
    t1 = dtree.buildTree(monk1train, monkdata.attributes)  # tree trained from monk1train
    t11 = prune(t1, monk1val)  # prunned tree
    # NOTE(review): the "Etrain" column below is evaluated on the validation
    # split, not the training split — confirm the label.
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monk1val), " Etest=",
          1 - dtree.check(t1, monkdata.monk1test))
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t11, monk1val), " Etest=",
          1 - dtree.check(t11, monkdata.monk1test))
    # Statistic information for different fraction for monk1 and monk3
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    # Evaluation of Monk1
    eval1 = [
        evaluate_fraction(monkdata.monk1, frac, monkdata.monk1test)
        for frac in fraction
    ]
    means1 = [np.mean(x) for x in eval1]
    vars1 = [np.var(x) for x in eval1]
    plt.figure(1)
    plt.subplot(121)
    plt.plot(fraction, means1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk1')
    # Evaluation of Monk2
    # NOTE(review): this comment says Monk2 but monk3 data is used, while the
    # suptitle below says 'Monk2' — confirm which dataset was intended.
    eval3 = [
        evaluate_fraction(monkdata.monk3, frac, monkdata.monk3test)
        for frac in fraction
    ]
    means3 = [np.mean(x) for x in eval3]
    vars3 = [np.var(x) for x in eval3]
    plt.figure(2)
    plt.subplot(121)
    plt.plot(fraction, means3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk2')
    plt.show()
uncertainty the most. Thus, it should be used for splitting at the root node. """
"5 BUILDING DECISION TREES"
# NOTE(review): the first line above closes a docstring that starts before
# this chunk; indentation below reconstructed from a collapsed chunk.
# One subset of monk1 per value of a5.
sel = []
for i in range(4):
    # splits data into subset according to attr a5
    sel.append(t.select(m.monk1, m.attributes[4], m.attributes[4].values[i]))
# print(sel)
sub = []
mC = []
# For each a5-branch: gains of all attributes except a5, and majority class.
for subset in sel:
    for i in [0, 1, 2, 3, 5]:
        sub.append(t.averageGain(subset, m.attributes[i]))
    mC.append(t.mostCommon(subset))
    # print(sub)
    sub = []  # reset the gain list for the next branch
"Highest information gain on second level of the tree # 2 - A4 , 3 - A6 , 4 - A1 #"
"""Assignment 3"""
tree1 = t.buildTree(m.monk1, m.attributes)
tree2 = t.buildTree(m.monk2, m.attributes)
tree3 = t.buildTree(m.monk3, m.attributes)
draw.drawTree(tree1)
# draw.drawTree(tree2)
# draw.drawTree(tree3)
# ATTRIBUTE A2 IN MONK-3
# A2 HAS VALUES {1,2,3}
# BUG FIX: these subsets are labelled MONK-3 (names and comments) but were
# selected from m.monk1; select from m.monk3 as documented.
monk3_1 = select(m.monk3, m.attributes[1], 1)  # MONK-3 dataset where a2=1
monk3_2 = select(m.monk3, m.attributes[1], 2)  # MONK-3 dataset where a2=2
monk3_3 = select(m.monk3, m.attributes[1], 3)  # MONK-3 dataset where a2=3
# INFORMATION GAIN CALCULATION AFTER SPLITTING
# FOR MONK-1 SPLITTINGS (monk1_1..monk1_3 are defined earlier in the file)
info_gain1_1 = [0] * 6
for i in range(6):
    info_gain1_1[i] = averageGain(monk1_1, m.attributes[i])
    print("Information gain in MONK-1 tree for a5=1 for a{} is {}".format(i + 1, info_gain1_1[i]))
m1_1 = mostCommon(monk1_1)
print("The most common output in MONK-1 for a5=1 is {}".format(m1_1))
info_gain1_2 = [0] * 6
for i in range(6):
    info_gain1_2[i] = averageGain(monk1_2, m.attributes[i])
    print("Information gain in MONK-1 tree for a5=2 for a{} is {}".format(i + 1, info_gain1_2[i]))
m1_2 = mostCommon(monk1_2)
print("The most common output in MONK-1 for a5=2 is {}".format(m1_2))
info_gain1_3 = [0] * 6
for i in range(6):
    info_gain1_3[i] = averageGain(monk1_3, m.attributes[i])
    print("Information gain in MONK-1 tree for a5=3 for a{} is {}".format(i + 1, info_gain1_3[i]))
m1_3 = mostCommon(monk1_3)
print("The most common output in MONK-1 for a5=3 is {}".format(m1_3))
print("Entropy " + str(value) + ": " + str(entropy)) # print("Next level information gains:") # for i in range(6): # gain = dt.averageGain(monk, m.attributes[i]) # print("A" + str(i+1) + ": " + str(gain)) print("") best_atribute = dt.bestAttribute(m.monk1, m.attributes) for value in best_atribute.values: subset = dt.select(m.monk1, best_atribute, value) entropy = dt.entropy(subset) print("Attribute value:" + str(value)) for i in range(6): gain = dt.averageGain(subset, m.attributes[i]) print("A" + str(i+1) + ": " + str(gain)) print("") # Assignment 5 best_atribute = dt.bestAttribute(m.monk1, m.attributes) for value in best_atribute.values: subset = dt.select(m.monk1, best_atribute, value) best_atribute2 = dt.bestAttribute(subset, m.attributes) print(str(best_atribute) + " = " + str(value)) for value2 in best_atribute2.values: subset2 = dt.select(subset, best_atribute2, value2) common = dt.mostCommon(subset2) print(" " + str(best_atribute2) + "=" + str(value2) + ": " + str(common)) tree = dt.buildTree(monk1, m.attributes, 2) draw.drawTree(tree)
import monkdata as m
import dtree as dtree

# Information gain of each attribute inside the MONK-1 branch where a5 = 3.
# (Python 2 print statements; a5 itself is skipped.)
foo = dtree.select(m.monk1, m.attributes[4], 3)
print '-- information gain of monk-1 dataset: --'
print 'a_1: ' + str(dtree.averageGain(foo, m.attributes[0]))
print 'a_2: ' + str(dtree.averageGain(foo, m.attributes[1]))
print 'a_3: ' + str(dtree.averageGain(foo, m.attributes[2]))
print 'a_4: ' + str(dtree.averageGain(foo, m.attributes[3]))
print 'a_6: ' + str(dtree.averageGain(foo, m.attributes[5]))
# Majority class of the branch where a5 = 1.
foo = dtree.select(m.monk1, m.attributes[4], 1)
print '-- is a_5 with value = 1 a majority class? --'
print dtree.mostCommon(foo)
print("Average gain in dataset monk2 and attribute " + str(x.name) + " is %.6f" % ag2) print("Average gain in dataset monk3 and attribute " + str(x.name) + " is %.6f" % ag3) print("\n") for x in range(1, 5): highest_avg = 0 highest_attribute = 0 s = dtree.select(mdata.monk1, mdata.attributes[4], x) for y in mdata.attributes: avg_g = dtree.averageGain(s, y) print("Average gain in dataset monk1 and subset s" + str(x) + " and attribute " + str(y.name) + " is %.6f. Majority: " % avg_g + str(dtree.mostCommon(s))) if (avg_g > highest_avg): highest_avg = avg_g highest_attribute = int(y.name[1]) print("Highest avg: %.6f in attr: " % highest_avg + str(highest_attribute)) for z in range( 1, len(mdata.attributes[int(highest_attribute - 1)].values) + 1): s2 = dtree.select(s, mdata.attributes[int(highest_attribute - 1)], z) print(dtree.mostCommon(s2)) print("\n") t1 = dtree.buildTree(mdata.monk1, mdata.attributes) print("Test data check: %.6f\n" % dtree.check(t1, mdata.monk1test))
import monkdata as m
import dtree as dt
import drawtree as draw

# Entropy of the full MONK-1 training set.
entropy = dt.entropy(m.monk1)

# Attribute with the highest average information gain. Pre-seeding
# best_attribute fixes the latent NameError the original had when every
# gain is exactly 0 (it was only bound inside `if gain > best_gain`).
best_gain = 0
best_attribute = m.attributes[0]
for attribute in m.attributes:
    gain = dt.averageGain(m.monk1, attribute)
    if gain > best_gain:
        best_gain = gain
        best_attribute = attribute

# Majority class per value of the chosen attribute. (The original also ran a
# loop computing the same subsets/majorities into throwaway locals; that dead
# code is removed.)
values = {
    v: dt.mostCommon(dt.select(m.monk1, best_attribute, v))
    for v in best_attribute.values
}
print(best_attribute, values)

draw.drawTree(dt.buildTree(m.monk1, m.attributes, 2))
def printNumTrueFalse(datasets):
    """For a list of datasets, print total size and true/false counts."""
    for i in range(0, len(datasets)):
        print("Monk"+str(i+1)+" "+
              "[#tot="+str(len(datasets[i]))+"] "+
              "[#true="+str(getNumTrue(datasets[i]))+"] "+
              "[#false="+str(getNumFalse(datasets[i]))+"]")

#Main
dataset = m.monk2
available = [True]*len(m.attributes)
firstSplit = getBestAttribute(dataset, m.attributes, available)
print("Firstsplit = "+str(firstSplit))
print("-----")
available[firstSplit] = False
# One subset per value of the first-split attribute.
sets = []
for i in range(0, len(m.attributes[firstSplit].values)):
    sets.append(d.select(dataset, m.attributes[firstSplit],
                         m.attributes[firstSplit].values[i]))
for i in range(0, len(sets)):
    subSets = []
    splitOn = getBestAttribute(sets[i], m.attributes, available)
    print("Second split = "+str(splitOn))
    # BUG FIX: the original indexed m.attributes with the branch counter `i`
    # instead of the chosen attribute `splitOn`, splitting each branch on the
    # wrong attribute.
    for j in range(0, len(m.attributes[splitOn].values)):
        subSets.append(d.select(sets[i], m.attributes[splitOn],
                                m.attributes[splitOn].values[j]))
    for s in subSets:
        print(d.mostCommon(s))
    print("----")
# Table of a5-branches of MONK-1 and the attribute index ('branch') chosen
# for each second-level split; monk1_1..monk1_4 are defined earlier in the
# file.
monkData = {
    'monk1_2': {
        'data': monk1_2,
        'branch': 3
    },
    'monk1_3': {
        'data': monk1_3,
        'branch': 5
    },
    'monk1_4': {
        'data': monk1_4,
        'branch': 0
    }
}
monk1_maj = d.mostCommon(m.monk1)
monk1_1_maj = d.mostCommon(monk1_1)
print('majority class for monk1: ' + str(monk1_maj))
print('majority class for monk1_1: ' + str(monk1_1_maj))
# For each branch: its majority class, then the majority class of every
# partition produced by its second-level attribute.
for mo in monkData:
    moName = mo  # unused alias, kept as-is
    attrNo = monkData[mo]['branch']
    monk_maj = d.mostCommon(monkData[mo]['data'])
    print('majority class for ' + mo + ': ' + str(monk_maj))
    for attrVal in m.attributes[attrNo].values:
        monkTmp = d.select(monkData[mo]['data'], m.attributes[attrNo], attrVal)
        monk_maj = d.mostCommon(monkTmp)
        print('majority class for ' + mo + ', partition ' + str(attrVal) + ': ' + str(monk_maj))
import dtree as d
#import drawtree

print("3. Building decision tree: \n")
print("Subset division of MONK-1 at attribute 5: \n")

# One subset of MONK-1 per value of a5.
subsets = []
for x in range(0, len(m.attributes[4].values)):
    subsets.append(d.select(m.monk1, m.attributes[4], x + 1))

# For each subset: majority class and the attribute with the best gain.
# FIXED: iterate with enumerate instead of subsets.index(set) — index() is an
# O(n) scan that returns the wrong position when two subsets compare equal,
# and the loop variable shadowed the built-in `set`.
for idx, subset in enumerate(subsets):
    maxgain = 0
    bestatr = 0
    print("Value: %d" % (idx + 1))
    print("Most common: " + str(d.mostCommon(subset)))
    for x in range(0, len(m.attributes)):
        gain = d.averageGain(subset, m.attributes[x])
        print("Attribute A%d: %f" % (x + 1, gain))
        if (gain > maxgain):
            maxgain = gain
            bestatr = x
    print("Attribute with best information gain: A%d \n" % (bestatr + 1))

# Full MONK-1 tree: error on the test and training sets.
print("MONK-1:")
t = d.buildTree(m.monk1, m.attributes)
print("Testing set error %f: " % (1 - d.check(t, m.monk1test)))
print("Training set error %f: \n" % (1 - d.check(t, m.monk1)))
# Average information gain per attribute for each training set (train/test
# are defined earlier in the file).
for i, dataset in enumerate(train):
    print("")
    print("Average gain for monk{} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        print("a{} = {}".format(j + 1, dt.averageGain(dataset, attribute)))

# MONK-1 split on each value of a5 (values 1-4).
monk1a5 = [dt.select(monk.monk1, monk.attributes[4], 1),
           dt.select(monk.monk1, monk.attributes[4], 2),
           dt.select(monk.monk1, monk.attributes[4], 3),
           dt.select(monk.monk1, monk.attributes[4], 4)]
for i, monk1 in enumerate(monk1a5):
    print("")
    print("Average gain for monk1 where a5 = {} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        if j != 4:  # skip a5 itself — it was already used for the split
            print("a{} = {}".format(j + 1, dt.averageGain(monk1, attribute)))
    print("Majority class = {}".format(dt.mostCommon(monk1)))

# Building the decision tree.
tree1 = dt.buildTree(monk.monk1, monk.attributes)
tree2 = dt.buildTree(monk.monk2, monk.attributes)
tree3 = dt.buildTree(monk.monk3, monk.attributes)
trees = [tree1, tree2, tree3]
# Drawing the decision tree.
#drawtree.drawTree(tree)
print("")
# BUG FIX: the label was off by one (printed "Monk0" first) and dt.check
# returns the fraction of correct classifications (accuracy — see its use as
# `1 - check` elsewhere in this file), so the "Error" columns showed accuracy;
# report i + 1 and 1 - check(...) instead.
for i, (dataset1, dataset2, tree) in enumerate(zip(train, test, trees)):
    print("Error for Monk{} on train = {} and on test {}.".format(
        i + 1, 1 - dt.check(tree, dataset1), 1 - dt.check(tree, dataset2)))
# Ass. 5
# Information gain of every attribute inside each a5-branch of MONK-1.
infoGains2 = numpy.zeros((4, 6))
subsets = [0 for x in range(0, 4)]
for i in range(1, 5):
    subsets[i - 1] = dtree.select(m.monk1, m.attributes[4], i)
    for j in range(0, 6):
        infoGains2[i - 1, j] = dtree.averageGain(subsets[i - 1], m.attributes[j])
print("Information gains for each subset:\n", infoGains2)

# BUG FIX: the three second-level splits below selected from the full m.monk1
# instead of the a5-branch (subsets[1..3]) that the comments describe; split
# the branch subsets as intended.
# subset[1] (a5==2) split on a4
subsets1 = [0 for x in range(0, 3)]
for i in range(1, 4):
    subsets1[i - 1] = dtree.select(subsets[1], m.attributes[3], i)
    print(dtree.mostCommon(subsets1[i - 1]))
# subset[2] (a5==3) split on a6
subsets2 = [0 for x in range(0, 2)]
for i in range(1, 3):
    subsets2[i - 1] = dtree.select(subsets[2], m.attributes[5], i)
    print(dtree.mostCommon(subsets2[i - 1]))
# subset[3] (a5==4) split on a1
subsets3 = [0 for x in range(0, 3)]
for i in range(1, 4):
    subsets3[i - 1] = dtree.select(subsets[3], m.attributes[0], i)
    print(dtree.mostCommon(subsets3[i - 1]))

print(dtree.buildTree(m.monk1, m.attributes, 2))  # problem! inconsistent
#drawtree_qt5.drawTree(dtree.buildTree(m.monk1, m.attributes, 2))
t = dtree.buildTree(m.monk1, m.attributes)
print "Gain Monk1 a5(3) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[0])) print "Gain Monk1 a5(3) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[1])) print "Gain Monk1 a5(3) - a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[2])) print "Gain Monk1 a5(3) - a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[3])) print "Gain Monk1 a5(3) - a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[4])) print "Gain Monk1 a5(3) - a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[5])) print "Gain Monk1 a5(4) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[0])) print "Gain Monk1 a5(4) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[1])) print "Gain Monk1 a5(4) - a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[2])) print "Gain Monk1 a5(4) - a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[3])) print "Gain Monk1 a5(4) - a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[4])) print "Gain Monk1 a5(4) - a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[5])) selec1 = tree.select(m.monk1, m.attributes[4], 4) print "Most Common Level2 Monk1(1): " + str(tree.mostCommon(tree.select(selec1,m.attributes[1],1))) print "Most Common Level2 Monk1(2): " + str(tree.mostCommon(tree.select(selec1,m.attributes[1],2))) print "Most Common Level2 Monk1(3): " + str(tree.mostCommon(tree.select(selec1,m.attributes[1],3))) print "Monk 1 Etrain : " + str(tree.check(tree.buildTree(m.monk1, m.attributes), m.monk1)) print "Monk 1 Etest : " + str(tree.check(tree.buildTree(m.monk1, m.attributes), m.monk1test)) print "Monk 2 Etrain : " + str(tree.check(tree.buildTree(m.monk2, m.attributes), m.monk2)) print 
"Monk 2 Etest : " + str(tree.check(tree.buildTree(m.monk2, m.attributes), m.monk2test)) print "Monk 3 Etrain : " + str(tree.check(tree.buildTree(m.monk3, m.attributes), m.monk3)) print "Monk 3 Etest : " + str(tree.check(tree.buildTree(m.monk3, m.attributes), m.monk3test)) print "ID3 built tree : \n" tree1 = tree.buildTree(m.monk1,m.attributes,2) #d.drawTree(tree1) #x = [0.3,0.4,0.5,0.6,0.7,0.8]
# NOTE(review): partition1..4 and the gain_partition* lists are defined
# earlier in the file (Python 2 print statements).
# Gain of every attribute within each a5-partition of MONK-1.
for x in range(0, 6):
    gain_partition1.append(dt.averageGain(partition1,m.attributes[x]))
    gain_partition2.append(dt.averageGain(partition2,m.attributes[x]))
    gain_partition3.append(dt.averageGain(partition3,m.attributes[x]))
    gain_partition4.append(dt.averageGain(partition4,m.attributes[x]))
print "Dataset\tA1\t\tA2\t\tA3\t\tA4\t\tA5\t\tA6"
print "Part 1: ","\t".join(["%.7f"%y for y in gain_partition1])
print "Part 2: ","\t".join(["%.7f"%y for y in gain_partition2])
print "Part 3: ","\t".join(["%.7f"%y for y in gain_partition3])
print "Part 4: ","\t".join(["%.7f"%y for y in gain_partition4])
print
# Hand-written depth-2 tree versus the library's buildTree output.
print "Own tree"
print "A5(",dt.mostCommon(partition1),"A4(",dt.mostCommon(partition2),")","A6",dt.mostCommon(partition3),")","A1(",dt.mostCommon(partition4), "))"
print
print "BuildTree function"
print dt.buildTree(m.monk1,m.attributes,2)
#draw.drawTree(dt.buildTree(m.monk1,m.attributes,2))
print
print "Building Trees"
t1 = dt.buildTree(m.monk1,m.attributes)
t2 = dt.buildTree(m.monk2,m.attributes)
t3 = dt.buildTree(m.monk3,m.attributes)
print "Checking Full Tree"
# NOTE(review): columns say "E train / E test" but dt.check values (fractions
# correct) are printed without 1 - ... — confirm intended.
print "Dataset\tE train\t\tE test"
print "Monk1\t","%.7f"%dt.check(t1,m.monk1), "\t%.7f"%dt.check(t1,m.monk1test)
print("monk-2: %f %f %f %f %f %f" % ( d.averageGain(m.monk2, m.attributes[0]), d.averageGain(m.monk2, m.attributes[1]), d.averageGain(m.monk2, m.attributes[2]), d.averageGain(m.monk2, m.attributes[3]), d.averageGain(m.monk2, m.attributes[4]), d.averageGain(m.monk2, m.attributes[5]) )) print("monk-3: %f %f %f %f %f %f" % ( d.averageGain(m.monk3, m.attributes[0]), d.averageGain(m.monk3, m.attributes[1]), d.averageGain(m.monk3, m.attributes[2]), d.averageGain(m.monk3, m.attributes[3]), d.averageGain(m.monk3, m.attributes[4]), d.averageGain(m.monk3, m.attributes[5]) )) monk1_subset = d.select(m.monk1, m.attributes[4], 3) print len(monk1_subset) print(d.mostCommon(monk1_subset)) monk1_subset_tree = d.buildTree(monk1_subset, m.attributes, 5) print(monk1_subset_tree) t1 = d.buildTree(m.monk1, m.attributes); print(d.check(t1, m.monk1test)) print(d.check(t1, m.monk1)) t2 = d.buildTree(m.monk2, m.attributes); print(d.check(t2, m.monk2test)) print(d.check(t2, m.monk2)) t3 = d.buildTree(m.monk3, m.attributes); print(d.check(t3, m.monk3test)) print(d.check(t3, m.monk3))
def mostCommonAfterSplit(dataset, attributeNumber):
    """Return the majority class of each subset produced by splitting
    *dataset* on the attribute with the given index."""
    return list(map(dtree.mostCommon, splitDataset(dataset, attributeNumber)))
print("Best attribute for split:", max(gains, key=gains.get)) print() gains = dict( zip(md.attributes, [dt.averageGain(md.monk3, a) for a in md.attributes])) print("Information gains MONK-3:", gains) print("Best attribute for split:", max(gains, key=gains.get)) print() # building tree print("\n----------DECISION TREE MONK-1 DEPTH 2----------\n") for v in selected.values: print(selected, "=", v) subset = dt.select(md.monk1, selected, v) if dt.allPositive(subset) or dt.allNegative(subset): print(selected, "=", v, "->", dt.mostCommon(subset)) else: attributes_left = [a for a in md.attributes if a != selected] gains = dict( zip(attributes_left, [dt.averageGain(subset, a) for a in attributes_left])) print("Information gains:", gains) best = max(gains, key=gains.get) print("Best attribute for split:", best) for v2 in best.values: print(best, "=", v2, "->", dt.mostCommon(dt.select(subset, best, v2))) print() # dr.drawTree(dt.buildTree(md.monk1, md.attributes, 2)) print(dt.buildTree(md.monk1, md.attributes, 2))
import monkdata as m
import dtree as dt
import drawtree as draw

# Entropy of the full MONK-1 training set.
entropy = dt.entropy(m.monk1)

# Attribute with the highest average information gain. Pre-seeding
# best_attribute fixes the latent NameError the original had when every gain
# is exactly 0 (it was only assigned inside `if gain > best_gain`).
best_gain = 0
best_attribute = m.attributes[0]
for attribute in m.attributes:
    gain = dt.averageGain(m.monk1, attribute)
    if gain > best_gain:
        best_gain = gain
        best_attribute = attribute

# Majority class per value of the chosen attribute. The original also ran a
# loop computing the same subsets and majorities into throwaway locals; that
# dead code is removed.
values = {v: dt.mostCommon(dt.select(m.monk1, best_attribute, v))
          for v in best_attribute.values}
print(best_attribute, values)

draw.drawTree(dt.buildTree(m.monk1, m.attributes, 2))
# NOTE(review): this first loop references `datasets`, `getNumTrue` and
# `getNumFalse` — it appears to be the body of a function whose `def` line is
# outside this chunk; indentation reconstructed.
for i in range(0, len(datasets)):
    print("Monk" + str(i + 1) + " " + "[#tot=" + str(len(datasets[i])) +
          "] " + "[#true=" + str(getNumTrue(datasets[i])) + "] " +
          "[#false=" + str(getNumFalse(datasets[i])) + "]")

#Main
dataset = m.monk2
available = [True] * len(m.attributes)
firstSplit = getBestAttribute(dataset, m.attributes, available)
print("Firstsplit = " + str(firstSplit))
print("-----")
available[firstSplit] = False
# One subset per value of the first-split attribute.
sets = []
for i in range(0, len(m.attributes[firstSplit].values)):
    sets.append(
        d.select(dataset, m.attributes[firstSplit],
                 m.attributes[firstSplit].values[i]))
for i in range(0, len(sets)):
    subSets = []
    splitOn = getBestAttribute(sets[i], m.attributes, available)
    print("Second split = " + str(splitOn))
    # BUG FIX: the inner select used m.attributes[i] (the branch counter)
    # instead of m.attributes[splitOn] (the attribute actually chosen for
    # the second split), so each branch was split on the wrong attribute.
    for j in range(0, len(m.attributes[splitOn].values)):
        subSets.append(
            d.select(sets[i], m.attributes[splitOn],
                     m.attributes[splitOn].values[j]))
    for s in subSets:
        print(d.mostCommon(s))
    print("----")