def calcNextTreeLevel():
    selectedAttribute = m.attributes[4]
    s1 = dtree.select(m.monk1, selectedAttribute, 1)
    s2 = dtree.select(m.monk1, selectedAttribute, 2)
    s3 = dtree.select(m.monk1, selectedAttribute, 3)
    s4 = dtree.select(m.monk1, selectedAttribute, 4)

    # Calculate information gain of subsets
    #ASSIGNMENT3(s1)
    #ASSIGNMENT3(s2)
    #ASSIGNMENT3(s3)
    #ASSIGNMENT3(s4)

    mc1 = dtree.mostCommon(s1)
    mc2 = dtree.mostCommon(s2)
    mc3 = dtree.mostCommon(s3)
    mc4 = dtree.mostCommon(s4)
    #print(mc1)
    #print(mc2)
    #print(mc3)
    #print(mc4)

    # Build the tree from the monk1 training data (not from test data)
    tree = dtree.buildTree(m.monk1, m.attributes)
    print(tree)
    draw.drawTree(tree)
def getLeaves(dataSet, a1, a2):
    a1_domain = m.attributes[a1].values
    a2_domain = m.attributes[a2].values
    for k in a1_domain:
        x = dtree.select(dataSet, m.attributes[a1], k)
        for l in a2_domain:
            y = dtree.select(x, m.attributes[a2], l)
            z = dtree.mostCommon(y)
            print("For " + str(k) + ":" + str(l) + ", most common = " + str(z))
def getSubsets(dataset, n):
    # Split the given dataset on attribute n (the original hard-coded
    # m.monk1 and m.attributes[4], ignoring both parameters)
    values = m.attributes[n].values
    subsets = []
    for val in values:
        subsets.append(dtree.select(dataset, m.attributes[n], val))
        # print(dtree.select(dataset, m.attributes[n], val))
    return subsets
def entropy_matrix(datasets, attribute_index, max_att_list):
    entropy_matrix = np.zeros(
        (len(datasets), len(m.attributes[attribute_index].values)))
    for idx, dataset in enumerate(datasets):
        att = m.attributes[max_att_list[idx]]
        for j, v in enumerate(att.values):
            entropy_matrix[idx, j] = d.entropy(d.select(dataset, att, v))
    print(entropy_matrix)
def split_tree_by_attribute_and_value(dataset, attribute_idx):
    attribute_values = m.attributes[attribute_idx].values
    attribute_values_list = [[i] for i in list(attribute_values)]
    dataset_by_attribute_and_value = []
    for value in attribute_values:
        dataset_by_attribute_and_value.append(
            d.select(dataset, m.attributes[attribute_idx], value))
    return dataset_by_attribute_and_value, attribute_values_list
def get_data_subsets(data, attributes, split_attribute):
    subsets = []
    attr = mdata.attributes[split_attribute]  # extract the key for the given attribute
    for val in attributes[split_attribute].values:
        subsets.append(dtree.select(data, attr, val))
    # list() is needed under Python 3, where map returns a lazy iterator
    print("Subset sizes: " + str(list(map(len, subsets))))
    return subsets
def PRINT_TREE_AT_LEVEL_2():  # A5
    print(" ")
    print("LEVEL 1:")
    print(m.attributes[4])
    Att = [None] * 4
    for value in range(1, 5):
        Att[value - 1] = select(m.monk1, m.attributes[4], value)
    print("LEVEL 2:")
    for A in Att:
        tmp = bestAttribute(A, m.attributes)
        print(tmp)
        # One loop over the attribute's own values replaces six copy-pasted
        # if-blocks that each iterated over that attribute's value range
        for value in tmp.values:
            print(mostCommon(select(A, tmp, value)))
    print(" ")
    t = buildTree(m.monk1, m.attributes)
    drawTree(t)
def splitOnA5AndComputeInformationGainsOfSubsets():
    """
    Assignment 3: Split on attribute 5 (A5) and compute the gains of the subsets.
    """
    a5 = m.attributes[4]
    for trainingSet in trainingSets:
        for attributeValue in a5.values:
            subset = d.select(trainingSet.dataset, a5, attributeValue)
            printInformationGainOfDataset(
                subset, trainingSet.name + " split on A5 = " + str(attributeValue))
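# The helper printInformationGainOfDataset is not defined in this snippet.
# A minimal sketch of what it presumably does (an assumption, not the
# original implementation): print the average gain of every attribute.
def printInformationGainOfDataset(dataset, label):
    print(label)
    for attribute in m.attributes:
        print("  gain(%s) = %.6f" % (attribute.name, d.averageGain(dataset, attribute)))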
def caspersky(dataset):
    print("Assignment 3")
    a = d.bestAttribute(dataset, m.attributes)
    branches = []
    for v in a.values:
        s = d.select(dataset, a, v)
        tf = d.mostCommon(s)
        if tf:
            # The leaf holds the classification, not the subset itself
            branches.append((v, d.TreeLeaf(tf)))
        else:
            a2 = d.bestAttribute(s, m.attributes)
            branches2 = []
            for v2 in a2.values:
                s2 = d.select(s, a2, v2)
                branches2.append((v2, d.TreeLeaf(d.mostCommon(s2))))
            branches.append((v, d.TreeNode(a2, dict(branches2), d.mostCommon(s))))
    drawtree.drawTree(d.TreeNode(a, dict(branches), d.mostCommon(dataset)))
def split(node):
    # Splitting: a5 = 1 versus a5 != 1 (a5 only takes the values 1-4)
    sub_set_A5_value_1_m1 = d.select(m.monk1, node, 1)
    sub_set_A5_value_not_1_m1 = (d.select(m.monk1, node, 2)
                                 + d.select(m.monk1, node, 3)
                                 + d.select(m.monk1, node, 4))

    # Calculate gain to figure out which attribute to use in each of the next nodes
    information_gain_left = find_information_gain(sub_set_A5_value_1_m1,
                                                  m.attributes)
    information_gain_right = find_information_gain(sub_set_A5_value_not_1_m1,
                                                   m.attributes)
    information_gain = max(max(information_gain_left),
                           max(information_gain_right))

    # Classify the most common result in each subtree
    majority_class_left = d.mostCommon(sub_set_A5_value_1_m1)
    majority_class_right = d.mostCommon(sub_set_A5_value_not_1_m1)
    print('left: ', majority_class_left)
    print('right: ', majority_class_right)
    print('information gain: ', information_gain)
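# find_information_gain is not shown in this snippet; a minimal sketch of a
# compatible implementation (an assumption): one average gain per attribute.
def find_information_gain(dataset, attributes):
    return [d.averageGain(dataset, a) for a in attributes]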
def buildTree(subset, attrs):
    global tree
    if isLeaf(subset):
        tree = (tree + '+') if d.allPositive(subset) else (tree + '-')
        return
    else:
        root = d.bestAttribute(subset, attrs)
        tree = tree + str(root) + "("
        for value in root.values:
            nextSubset = d.select(subset, root, value)
            nextAttrs = attrs - set([root])
            buildTree(nextSubset, nextAttrs)
        tree = tree + ")"
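# isLeaf is not shown in this snippet; a plausible minimal version (an
# assumption): a subset is a leaf once all of its samples share one class.
def isLeaf(subset):
    return d.allPositive(subset) or d.allNegative(subset)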
def calc_next_level():
    # Average gain within each subset when a5 is chosen for the root
    print("\nA5\t a1\t\t a2\t\t a3\t\t a4\t\t a5\t\t a6")
    s = "A5("
    for val in data.attributes[4].values:
        subset = dt.select(data.monk1, data.attributes[4], val)
        t = "\t"
        for attr in data.attributes:
            t = t + "%.6f\t" % (dt.averageGain(subset, attr))
        print(val, t)
        best = dt.bestAttribute(subset, data.attributes)
        s = s + best.name + "("
        #print("best attribute: ", best.name)
        for value in best.values:
            if dt.mostCommon(dt.select(subset, best, value)):
                s = s + "+"
            else:
                s = s + "-"
        s = s + ")"
    s = s + ")"
    print("\nOur tree:\t", s)
    print("Build tree:\t", dt.buildTree(data.monk1, data.attributes, 2))
def buildTreeCustom(dataset, depth):
    if depth > 0:
        bestAttr = dt.bestAttribute(dataset, m.attributes)
        print(str(bestAttr), end='')
        # Select a dataset split for each value of bestAttr
        splits = []
        for value in bestAttr.values:
            splits.append(dt.select(dataset, bestAttr, value))
        for split in splits:
            # If the entropy of the split is > 0 it is impure and can be
            # split further; recurse with reduced depth
            if dt.entropy(split) > 0:
                buildTreeCustom(split, depth - 1)
            else:
                print('+' if dt.mostCommon(split) else '-', end='')
    else:
        print('+' if dt.mostCommon(dataset) else '-', end='')
def Tree(dataset, attributes, maxdepth=3):
    def Branch(dataset, default, attributes):
        if not dataset:
            return dtree.TreeLeaf(default)
        if dtree.allPositive(dataset):
            return dtree.TreeLeaf(True)
        if dtree.allNegative(dataset):
            return dtree.TreeLeaf(False)
        return Tree(dataset, attributes, maxdepth - 1)

    default = dtree.mostCommon(dataset)
    if maxdepth < 1:
        return dtree.TreeLeaf(default)
    a = dtree.bestAttribute(dataset, attributes)
    attributesLeft = [x for x in attributes if x != a]
    branches = [(v, Branch(dtree.select(dataset, a, v), default, attributesLeft))
                for v in a.values]
    return dtree.TreeNode(a, dict(branches), default)
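# Sketch of how this depth-limited Tree can be sanity-checked against the
# library builder (dtree.buildTree also accepts a depth limit, as other
# snippets here use it):
if __name__ == '__main__':
    import monkdata as m
    t_custom = Tree(m.monk1, m.attributes, maxdepth=2)
    t_lib = dtree.buildTree(m.monk1, m.attributes, 2)
    print(t_custom)
    print(t_lib)
    print(dtree.check(t_custom, m.monk1test), dtree.check(t_lib, m.monk1test))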
def makeTree(dataset, level, attributes):
    # 'depth' is expected to be a module-level limit on the tree depth
    if level >= depth:
        return dtree.TreeLeaf(dtree.mostCommon(dataset))
    attr = dtree.bestAttribute(dataset, attributes)
    branches = []
    for val in attr.values:
        subset = dtree.select(dataset, attr, val)
        attributes_left = [a for a in attributes if a != attr]
        if dtree.allPositive(subset):
            node = dtree.TreeLeaf(True)
        elif dtree.allNegative(subset):
            node = dtree.TreeLeaf(False)
        else:
            node = makeTree(subset, level + 1, attributes_left)
        branches.append((val, node))
    return dtree.TreeNode(attr, dict(branches), dtree.mostCommon(dataset))
def buildtree(dataset, remaining_attr, level):
    if level == 2:
        return dtree.TreeLeaf(dtree.mostCommon(dataset))
    max_attr, _ = getMaxGain(dataset, remaining_attr)
    branches_dict = dict([(value, dtree.select(dataset, max_attr, value))
                          for value in max_attr.values])
    _remaining_attr = [a for a in remaining_attr if a != max_attr]
    branches_nodes = {}
    print(max_attr)
    for value, branch_data in branches_dict.items():
        branches_nodes[value] = buildtree(branch_data, _remaining_attr, level + 1)
    # The default of a TreeNode is a classification, not a TreeLeaf
    return dtree.TreeNode(max_attr, branches_nodes, dtree.mostCommon(dataset))
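# getMaxGain is not defined in this snippet; a compatible sketch (an
# assumption): return the attribute with the highest average gain, plus the gain.
def getMaxGain(dataset, attributes):
    best_attr = max(attributes, key=lambda a: dtree.averageGain(dataset, a))
    return best_attr, dtree.averageGain(dataset, best_attr)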
def buildTreeRec(dataset, attributes, depthtodo):
    defaultvalue = d.mostCommon(dataset)
    if d.allPositive(dataset):
        return d.TreeLeaf(True)
    elif d.allNegative(dataset):
        return d.TreeLeaf(False)
    elif depthtodo <= 0:
        return d.TreeLeaf(defaultvalue)
    else:
        gainziplist = calculateGainTuplesForAllAttributes(dataset, attributes)
        maxgain, maxgainattribute = getTupleWithMaxGainValue(gainziplist)
        subnodes = []
        for attributevalue in attributes[maxgainattribute].values:
            newdataset = d.select(dataset, attributes[maxgainattribute],
                                  attributevalue)
            subnode = buildTreeRec(newdataset, attributes, depthtodo - 1)
            subnodes.append((attributevalue, subnode))
        return d.TreeNode(attributes[maxgainattribute], dict(subnodes),
                          defaultvalue)
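# The two helpers used above are not shown; minimal compatible sketches
# (assumptions): pair each attribute index with its gain, then take the max.
def calculateGainTuplesForAllAttributes(dataset, attributes):
    return [(d.averageGain(dataset, a), i) for i, a in enumerate(attributes)]

def getTupleWithMaxGainValue(gainziplist):
    return max(gainziplist, key=lambda t: t[0])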
def assignment4():
    datasets = [
        (m.monk1, 'monk1', m.attributes[0]),
        (m.monk1, 'monk1', m.attributes[1]),
        (m.monk1, 'monk1', m.attributes[2]),
        (m.monk1, 'monk1', m.attributes[3]),
        (m.monk1, 'monk1 max', m.attributes[4]),
    ]
    for data, name, attribute in datasets:
        summ = 0
        for value in attribute.values:
            subset = dtree.select(data, attribute, value)
            print(f'Entropy of S{value} for {name}:\t{dtree.entropy(subset)}')
            summ += len(subset) / len(data) * dtree.entropy(subset)
        print(dtree.entropy(data) - summ)  # information gain of this attribute
        print()
print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0])) print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1])) print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2])) print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3])) print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4])) print "Gain Monk2 a6: " + str(tree.averageGain(m.monk2,m.attributes[5])) print "Gain Monk3 a1: " + str(tree.averageGain(m.monk3,m.attributes[0])) print "Gain Monk3 a2: " + str(tree.averageGain(m.monk3,m.attributes[1])) print "Gain Monk3 a3: " + str(tree.averageGain(m.monk3,m.attributes[2])) print "Gain Monk3 a4: " + str(tree.averageGain(m.monk3,m.attributes[3])) print "Gain Monk3 a5: " + str(tree.averageGain(m.monk3,m.attributes[4])) print "Gain Monk3 a6: " + str(tree.averageGain(m.monk3,m.attributes[5])) print "Gain Monk1 a5(1) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[0])) print "Gain Monk1 a5(1) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[1])) print "Gain Monk1 a5(1) - a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[2])) print "Gain Monk1 a5(1) - a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[3])) print "Gain Monk1 a5(1) - a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[4])) print "Gain Monk1 a5(1) - a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[5])) print "Gain Monk1 a5(2) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[0])) print "Gain Monk1 a5(2) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[1])) print "Gain Monk1 a5(2) - a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[2])) print "Gain Monk1 a5(2) - a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[3])) print "Gain Monk1 a5(2) - a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[4])) print "Gain Monk1 a5(2) - a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[5])) print "Gain Monk1 a5(3) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[0])) print "Gain Monk1 a5(3) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[1]))
print(d.check(t, m.monk1))
print(d.check(t, m.monk1test))
t = d.buildTree(m.monk2, m.attributes)
print(d.check(t, m.monk2))
print(d.check(t, m.monk2test))
t = d.buildTree(m.monk3, m.attributes)
print(d.check(t, m.monk3))
print(d.check(t, m.monk3test))

print("First Node IG")
for i in range(6):
    print(d.averageGain(m.monk1, m.attributes[i]))

a5_1 = d.select(m.monk1, m.attributes[4], 1)
a5_2 = d.select(m.monk1, m.attributes[4], 2)
a5_3 = d.select(m.monk1, m.attributes[4], 3)
a5_4 = d.select(m.monk1, m.attributes[4], 4)

print("subset a5_1 IG")
for i in range(6):
    print(d.averageGain(a5_1, m.attributes[i]))
print("subset a5_2 IG")
for i in range(6):
    print(d.averageGain(a5_2, m.attributes[i]))
print("subset a5_3 IG")
for i in range(6):
    print(d.averageGain(a5_3, m.attributes[i]))
print("0", dt.averageGain(m.monk2, m.attributes[0])) print("1", dt.averageGain(m.monk2, m.attributes[1])) print("2", dt.averageGain(m.monk2, m.attributes[2])) print("3", dt.averageGain(m.monk2, m.attributes[3])) print("4", dt.averageGain(m.monk2, m.attributes[4])) print("5", dt.averageGain(m.monk2, m.attributes[5])) print("monk3") print("0", dt.averageGain(m.monk3, m.attributes[0])) print("1", dt.averageGain(m.monk3, m.attributes[1])) print("2", dt.averageGain(m.monk3, m.attributes[2])) print("3", dt.averageGain(m.monk3, m.attributes[3])) print("4", dt.averageGain(m.monk3, m.attributes[4])) print("5", dt.averageGain(m.monk3, m.attributes[5])) list1 = dt.select(m.monk1, m.attributes[4], 1) list2 = dt.select(m.monk1, m.attributes[4], 2) list3 = dt.select(m.monk1, m.attributes[4], 3) list4 = dt.select(m.monk1, m.attributes[4], 4) print("gains") print("list1") print("0", dt.averageGain(list1, m.attributes[0])) print("1", dt.averageGain(list1, m.attributes[1])) print("2", dt.averageGain(list1, m.attributes[2])) print("3", dt.averageGain(list1, m.attributes[3])) print("5", dt.averageGain(list1, m.attributes[5])) print("list2") print("0", dt.averageGain(list2, m.attributes[0])) print("1", dt.averageGain(list2, m.attributes[1])) print("2", dt.averageGain(list2, m.attributes[2]))
def main():
    # Assignment 1
    print("Assignment 1")
    monks = [monkdata.monk1, monkdata.monk2, monkdata.monk3]
    monk_tests = [monkdata.monk1test, monkdata.monk2test, monkdata.monk3test]
    entropies = [dtree.entropy(monk) for monk in monks]
    print("*** Monk1 entropy: ", entropies[0])
    print("*** Monk2 entropy: ", entropies[1])
    print("*** Monk3 entropy: ", entropies[2])

    # Assignment 3
    print(" ")
    print("Assignment 3")
    attributes = monkdata.attributes
    info_gain1 = info_gain(monks[0], attributes)
    info_gain2 = info_gain(monks[1], attributes)
    info_gain3 = info_gain(monks[2], attributes)
    print("*** Monk1 information gain for attribute:", ['%.5f' % x for x in info_gain1])
    print("*** Monk2 information gain for attribute:", ['%.5f' % x for x in info_gain2])
    print("*** Monk3 information gain for attribute:", ['%.5f' % x for x in info_gain3])

    # Assignment 5
    print("")
    print("Assignment 5")
    print("*** Attribute:", np.argmax(info_gain1) + 1, "maximizes info gain for MONK1 dataset")
    print("*** Attribute:", np.argmax(info_gain2) + 1, "maximizes info gain for MONK2 dataset")
    print("*** Attribute:", np.argmax(info_gain3) + 1, "maximizes info gain for MONK3 dataset")
    print("***")
    max0 = np.argmax(info_gain1)  # attribute of first split
    attributes_left = [attrib for attrib in attributes if attrib != attributes[max0]]
    print("*** 1) Attributes the next nodes should be tested on: ", attributes_left)

    # Attributes to split on in the second step
    splits = [
        np.argmax(info_gain(dtree.select(monks[0], attributes[max0], value),
                            attributes)) + 1
        for value in attributes[max0].values
    ]
    print("*** 2) Second split is on the attributes: ", splits)

    # Decision after the second split: majority class of each first-level
    # subset (select by the values of the split attribute, not by 'splits')
    subsets = [dtree.select(monks[0], attributes[max0], value)
               for value in attributes[max0].values]
    print("*** 3) Assignment after second split: ",
          [dtree.mostCommon(subset) for subset in subsets])
    print("***")

    print("*** Train and test set errors")
    t1 = dtree.buildTree(monkdata.monk1, monkdata.attributes)
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monkdata.monk1),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    t2 = dtree.buildTree(monkdata.monk2, monkdata.attributes)
    print("*** Monk2:", "Etrain=", 1 - dtree.check(t2, monkdata.monk2),
          " Etest=", 1 - dtree.check(t2, monkdata.monk2test))
    t3 = dtree.buildTree(monkdata.monk3, monkdata.attributes)
    print("*** Monk3:", "Etrain=", 1 - dtree.check(t3, monkdata.monk3),
          " Etest=", 1 - dtree.check(t3, monkdata.monk3test))

    import drawtree_qt5
    #print(t1)                     # tree in text form
    #drawtree_qt5.drawTree(t1)     # uncomment to visualize the decision tree

    # Assignment 7
    print("")
    print("Assignment 7")
    # Pruning, using monk1 as the example
    monk1train, monk1val = partition(monkdata.monk1, 0.9)
    t1 = dtree.buildTree(monk1train, monkdata.attributes)  # tree trained on monk1train
    t11 = prune(t1, monk1val)                              # pruned tree
    print("*** Monk1:", "Eval=", 1 - dtree.check(t1, monk1val),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    print("*** Monk1:", "Eval=", 1 - dtree.check(t11, monk1val),
          " Etest=", 1 - dtree.check(t11, monkdata.monk1test))

    # Statistics for different fractions for monk1 and monk3
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    # Evaluation of Monk1
    eval1 = [evaluate_fraction(monkdata.monk1, frac, monkdata.monk1test)
             for frac in fraction]
    means1 = [np.mean(x) for x in eval1]
    vars1 = [np.var(x) for x in eval1]
    plt.figure(1)
    plt.subplot(121)
    plt.plot(fraction, means1, 'ro')
    plt.xlabel('fraction')
    plt.title("Mean of error for different fractions")
    plt.subplot(122)
    plt.plot(fraction, vars1, 'ro')
    plt.xlabel('fraction')
    plt.title("Variance of error for different fractions")
    plt.suptitle('Monk1')

    # Evaluation of Monk3
    eval3 = [evaluate_fraction(monkdata.monk3, frac, monkdata.monk3test)
             for frac in fraction]
    means3 = [np.mean(x) for x in eval3]
    vars3 = [np.var(x) for x in eval3]
    plt.figure(2)
    plt.subplot(121)
    plt.plot(fraction, means3, 'ro')
    plt.xlabel('fraction')
    plt.title("Mean of error for different fractions")
    plt.subplot(122)
    plt.plot(fraction, vars3, 'ro')
    plt.xlabel('fraction')
    plt.title("Variance of error for different fractions")
    plt.suptitle('Monk3')
    plt.show()
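# main() relies on info_gain, partition, prune and evaluate_fraction, which
# are not shown above. Sketches of compatible implementations: partition
# follows the lab instructions, while the greedy pruning loop and the
# 100-round evaluation are assumptions about what the originals did.
import random

def info_gain(dataset, attributes):
    return [dtree.averageGain(dataset, a) for a in attributes]

def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]

def prune(tree, validation):
    # Greedy reduced-error pruning: keep taking the best single-prune
    # candidate while it performs at least as well on the validation set
    best, best_score = tree, dtree.check(tree, validation)
    while True:
        candidates = [(dtree.check(t, validation), t) for t in dtree.allPruned(best)]
        if not candidates:
            return best
        score, candidate = max(candidates, key=lambda c: c[0])
        if score >= best_score:
            best, best_score = candidate, score
        else:
            return best

def evaluate_fraction(data, frac, testdata, rounds=100):
    errors = []
    for _ in range(rounds):
        train, val = partition(data, frac)
        pruned = prune(dtree.buildTree(train, monkdata.attributes), val)
        errors.append(1 - dtree.check(pruned, testdata))
    return errors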
ag1 = dtree.averageGain(mdata.monk1, x)
ag2 = dtree.averageGain(mdata.monk2, x)
ag3 = dtree.averageGain(mdata.monk3, x)
print('Average gain in dataset monk1 and attribute ' + str(x.name) + ' is %.6f' % ag1)
print("Average gain in dataset monk2 and attribute " + str(x.name) + " is %.6f" % ag2)
print("Average gain in dataset monk3 and attribute " + str(x.name) + " is %.6f" % ag3)
print("\n")

for x in range(1, 5):
    highest_avg = 0
    highest_attribute = 0
    s = dtree.select(mdata.monk1, mdata.attributes[4], x)
    for y in mdata.attributes:
        avg_g = dtree.averageGain(s, y)
        print("Average gain in dataset monk1 and subset s" + str(x) +
              " and attribute " + str(y.name) + " is %.6f. Majority: " % avg_g +
              str(dtree.mostCommon(s)))
        if avg_g > highest_avg:
            highest_avg = avg_g
            highest_attribute = int(y.name[1])
    print("Highest avg: %.6f in attr: " % highest_avg + str(highest_attribute))
    for z in range(1, len(mdata.attributes[highest_attribute - 1].values) + 1):
        s2 = dtree.select(s, mdata.attributes[highest_attribute - 1], z)
        print(dtree.mostCommon(s2))
# Datasets
train = [monk.monk1, monk.monk2, monk.monk3]
test = [monk.monk1test, monk.monk2test, monk.monk3test]

print("Entropy for monk1 dataset is {}".format(dt.entropy(monk.monk1)))
print("Entropy for monk2 dataset is {}".format(dt.entropy(monk.monk2)))
print("Entropy for monk3 dataset is {}".format(dt.entropy(monk.monk3)))

for i, dataset in enumerate(train):
    print("")
    print("Average gain for monk{} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        print("a{} = {}".format(j + 1, dt.averageGain(dataset, attribute)))

monk1a5 = [dt.select(monk.monk1, monk.attributes[4], 1),
           dt.select(monk.monk1, monk.attributes[4], 2),
           dt.select(monk.monk1, monk.attributes[4], 3),
           dt.select(monk.monk1, monk.attributes[4], 4)]

for i, monk1 in enumerate(monk1a5):
    print("")
    print("Average gain for monk1 where a5 = {} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        if j != 4:
            print("a{} = {}".format(j + 1, dt.averageGain(monk1, attribute)))
    print("Majority class = {}".format(dt.mostCommon(monk1)))

# Building the decision trees
tree1 = dt.buildTree(monk.monk1, monk.attributes)
tree2 = dt.buildTree(monk.monk2, monk.attributes)
tree3 = dt.buildTree(monk.monk3, monk.attributes)
trees = [tree1, tree2, tree3]
def bestAttribute(dataset, attributes):
    result = 0
    best = attributes[0]
    for a in attributes:
        value = dt.averageGain(dataset, a)
        if value > result:
            result = value
            best = a
    return best

# Splitting the data
a = bestAttribute(m.monk1, m.attributes)
data = []
for v in a.values:
    data.append(dt.select(m.monk1, a, v))

# Calculating the average information gain for the next level
for subset in data:
    for attr in m.attributes:
        print(dt.averageGain(subset, attr))
    print('\n')
print('\n')

# Comparison with the tree from the predefined function
tree = dt.buildTree(m.monk1, m.attributes, 2)
#draw.drawTree(tree)

# Building the trees for all the monks datasets
# assignment 3
info_gain_m2 = []
info_gain_m3 = []
attribute = []

# starting counter
i = 0

# iterating over all the training sets
for sets in [info_gain_m1, info_gain_m2, info_gain_m3]:
    # for every attribute, append the average information gain to the list
    for k in range(6):
        attribute.append(dtree.averageGain(data_sets[i], m.attributes[k]))
    sets.append(attribute)
    attribute = []
    i += 1

#print(info_gain_m1)
#print(info_gain_m2)
#print(info_gain_m3)

# Assignment 3
# ################
selected = dtree.select(m.monk1, m.attributes[4], 1)
t = dtree.buildTree(m.monk1, m.attributes)
print(dtree.check(t, m.monk1test))
print(t)
    d.averageGain(m.monk1, m.attributes[4]),
    d.averageGain(m.monk1, m.attributes[5])
))
print("monk-2: %f %f %f %f %f %f" % (
    d.averageGain(m.monk2, m.attributes[0]),
    d.averageGain(m.monk2, m.attributes[1]),
    d.averageGain(m.monk2, m.attributes[2]),
    d.averageGain(m.monk2, m.attributes[3]),
    d.averageGain(m.monk2, m.attributes[4]),
    d.averageGain(m.monk2, m.attributes[5])
))
print("monk-3: %f %f %f %f %f %f" % (
    d.averageGain(m.monk3, m.attributes[0]),
    d.averageGain(m.monk3, m.attributes[1]),
    d.averageGain(m.monk3, m.attributes[2]),
    d.averageGain(m.monk3, m.attributes[3]),
    d.averageGain(m.monk3, m.attributes[4]),
    d.averageGain(m.monk3, m.attributes[5])
))

monk1_subset = d.select(m.monk1, m.attributes[4], 3)
print(len(monk1_subset))
print(d.mostCommon(monk1_subset))
monk1_subset_tree = d.buildTree(monk1_subset, m.attributes, 5)
print(monk1_subset_tree)

t1 = d.buildTree(m.monk1, m.attributes)
print(d.check(t1, m.monk1test))
print(d.check(t1, m.monk1))
t2 = d.buildTree(m.monk2, m.attributes)
print(d.check(t2, m.monk2test))
print(d.check(t2, m.monk2))
t3 = d.buildTree(m.monk3, m.attributes)
def splitDataset(dataset, attributeNumber):
    "Split a dataset into one subset per value of the given attribute"
    return [dtree.select(dataset, m.attributes[attributeNumber], x)
            for x in m.attributes[attributeNumber].values]
def splitOnAttribute(dataset, attribute, doneSplits):
    # doneSplits is unused here; it is kept for the caller's signature
    sets = []
    for value in attribute.values:
        sets.append(d.select(dataset, attribute, value))
    return sets
print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0])) print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1])) print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2])) print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3])) print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4])) print "Gain Monk2 a6: " + str(tree.averageGain(m.monk2,m.attributes[5])) print "Gain Monk3 a1: " + str(tree.averageGain(m.monk3,m.attributes[0])) print "Gain Monk3 a2: " + str(tree.averageGain(m.monk3,m.attributes[1])) print "Gain Monk3 a3: " + str(tree.averageGain(m.monk3,m.attributes[2])) print "Gain Monk3 a4: " + str(tree.averageGain(m.monk3,m.attributes[3])) print "Gain Monk3 a5: " + str(tree.averageGain(m.monk3,m.attributes[4])) print "Gain Monk3 a6: " + str(tree.averageGain(m.monk3,m.attributes[5])) print "Gain Level1 Monk1 a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[0], 1),m.attributes[0])) print "Gain Level1 Monk1 a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[1], 1),m.attributes[1])) print "Gain Level1 Monk1 a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[2], 1),m.attributes[2])) print "Gain Level1 Monk1 a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[3], 1),m.attributes[3])) print "Gain Level1 Monk1 a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[4])) print "Gain Level1 Monk1 a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[5], 1),m.attributes[5])) print "Gain Level2 Monk1 a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[0], 2),m.attributes[0])) print "Gain Level2 Monk1 a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[1], 2),m.attributes[1])) print "Gain Level2 Monk1 a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[2], 2),m.attributes[2])) print "Gain Level2 Monk1 a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[3], 2),m.attributes[3])) print "Gain Level2 Monk1 a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[4])) print "Gain Level2 Monk1 a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[5], 2),m.attributes[5])) print "Gain Level3 Monk1 a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[0], 3),m.attributes[0])) print "Gain Level3 Monk1 a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[1], 3),m.attributes[1]))
print("Monk3, attribute a3 has information gain: ", dt.averageGain(m.monk3, m.attributes[2])) print("Monk3, attribute a4 has information gain: ", dt.averageGain(m.monk3, m.attributes[3])) print("Monk3, attribute a5 has information gain: ", dt.averageGain(m.monk3, m.attributes[4])) print("Monk3, attribute a6 has information gain: ", dt.averageGain(m.monk3, m.attributes[5])) print("Monk3's best attribute is: ", dt.bestAttribute(m.monk3, m.attributes)) print("\n") #Calculate information gain for 2nd level in tree #Monk1 - a5 - 1 a5 = m.attributes[4] print("Monk1 - a5 - 1, a1 has info gain: ", dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[0])) print("Monk1 - a5 - 1, a2 has info gain: ", dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[1])) print("Monk1 - a5 - 1, a3 has info gain: ", dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[2])) print("Monk1 - a5 - 1, a4 has info gain: ", dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[3])) print("Monk1 - a5 - 1, a6 has info gain: ", dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[5])) print("\n") #Monk1 - a5 - 2 print("Monk1 - a5 - 2, a1 has info gain: ", dt.averageGain(dt.select(m.monk1, a5, a5.values[1]), m.attributes[0])) print("Monk1 - a5 - 2, a2 has info gain: ", dt.averageGain(dt.select(m.monk1, a5, a5.values[1]), m.attributes[1]))
    gain_monk1.append(dt.averageGain(m.monk1, m.attributes[x]))
    gain_monk2.append(dt.averageGain(m.monk2, m.attributes[x]))
    gain_monk3.append(dt.averageGain(m.monk3, m.attributes[x]))

print("Dataset\tA1\t\tA2\t\tA3\t\tA4\t\tA5\t\tA6")
print("Monk1: ", "\t".join(["%.7f" % y for y in gain_monk1]))
print("Monk2: ", "\t".join(["%.7f" % y for y in gain_monk2]))
print("Monk3: ", "\t".join(["%.7f" % y for y in gain_monk3]))
print()
print("------------------------------")
print("-------- Assignment 3 --------")
print()

partition1 = dt.select(m.monk1, m.attributes[4], 1)
partition2 = dt.select(m.monk1, m.attributes[4], 2)
partition3 = dt.select(m.monk1, m.attributes[4], 3)
partition4 = dt.select(m.monk1, m.attributes[4], 4)

gain_partition1 = []
gain_partition2 = []
gain_partition3 = []
gain_partition4 = []
for x in range(6):
    gain_partition1.append(dt.averageGain(partition1, m.attributes[x]))
    gain_partition2.append(dt.averageGain(partition2, m.attributes[x]))
    gain_partition3.append(dt.averageGain(partition3, m.attributes[x]))
    gain_partition4.append(dt.averageGain(partition4, m.attributes[x]))
def printNumTrueFalse(datasets):
    # For a list of datasets, print the number of true and false samples
    for i in range(len(datasets)):
        print("Monk" + str(i + 1) + " " +
              "[#tot=" + str(len(datasets[i])) + "] " +
              "[#true=" + str(getNumTrue(datasets[i])) + "] " +
              "[#false=" + str(getNumFalse(datasets[i])) + "]")

# Main
dataset = m.monk2
available = [True] * len(m.attributes)
firstSplit = getBestAttribute(dataset, m.attributes, available)
print("Firstsplit = " + str(firstSplit))
print("-----")
available[firstSplit] = False

sets = []
for i in range(len(m.attributes[firstSplit].values)):
    sets.append(d.select(dataset, m.attributes[firstSplit],
                         m.attributes[firstSplit].values[i]))

for i in range(len(sets)):
    subSets = []
    splitOn = getBestAttribute(sets[i], m.attributes, available)
    print("Second split = " + str(splitOn))
    # Select on the second-split attribute (not on index i)
    for j in range(len(m.attributes[splitOn].values)):
        subSets.append(d.select(sets[i], m.attributes[splitOn],
                                m.attributes[splitOn].values[j]))
    for s in subSets:
        print(d.mostCommon(s))
    print("----")
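# getNumTrue, getNumFalse and getBestAttribute are not shown; minimal
# compatible sketches (assumptions):
def getNumTrue(dataset):
    return sum(1 for sample in dataset if sample.positive)

def getNumFalse(dataset):
    return len(dataset) - getNumTrue(dataset)

def getBestAttribute(dataset, attributes, available):
    # Index of the still-available attribute with the highest average gain
    gains = [d.averageGain(dataset, a) if available[i] else -1.0
             for i, a in enumerate(attributes)]
    return gains.index(max(gains))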
def main(): print ("Entropy monk1") entropy1 = tree.entropy(data.monk1) print (entropy1) print ("\n") print ("Entropy monk2") entropy2 = tree.entropy(data.monk2) print (entropy2) print ("\n") print ("Entropy monk3") entropy3 = tree.entropy(data.monk3) print (entropy3) print ("\n") informationGain(data) #COMPUTING ENTROPY FOR SUBSET, WhY 0?! monk1Tree = tree.buildTree(data.monk1, data.attributes) #draw.drawTree(monk1Tree) #print(tree.bestAttribute(data.monk3, data.attributes)) subSet = tree.select(data.monk1, data.attributes[4], 1) # newEntropy = tree.entropy(subSet) # print ("SubSet") # print (newEntropy) #END n = 0 sumList = np.array([0.0] * 6) l1 = [] l2 = [] l3 = [] l4 = [] l5 = [] l6 = [] for x in range(100): errorList = np.array(pruneTree(data.monk1, data.monk1test)) sumList += errorList l1.append(errorList[0]) l2.append(errorList[1]) l3.append(errorList[2]) l4.append(errorList[3]) l5.append(errorList[4]) l6.append(errorList[5]) finalList = sumList/100 stdDevList = [np.std(l1),np.std(l2),np.std(l3),np.std(l4), np.std(l5),np.std(l6)] print(finalList) print(stdDevList) line1, = plt.plot(finalList, label="Monk1 means", marker='o') # Create a legend for the first line. first_legend = plt.legend(handles=[line1], loc=1) x = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8] # create an index for each tick position xi = [i for i in range(0, len(x))] plt.xticks(xi, x) plt.ylabel('Mean Errors') plt.xlabel('Fractions') plt.show()
"--Answer to Assignment 2" print(informationGain[2], "\n") # print(t.bestAttribute(m.monk1, m.attributes)) """ Attribute a5 has the largest information gain meaning that it reduces the uncertainty the most. Thus, it should be used for splitting at the root node. """ "5 BUILDING DECISION TREES" sel = [] for i in range(4): # splits data into subset according to attr a5 sel.append(t.select(m.monk1, m.attributes[4], m.attributes[4].values[i])) # print(sel) sub = [] mC = [] for subset in sel: for i in [0, 1, 2, 3, 5]: sub.append(t.averageGain(subset, m.attributes[i])) mC.append(t.mostCommon(subset)) # print(sub) sub = [] "Highest information gain on second level of the tree # 2 - A4 , 3 - A6 , 4 - A1 #" """Assignment 3"""
import monkdata as m
import dtree as dtree

foo = dtree.select(m.monk1, m.attributes[4], 3)
print('-- information gain within the monk-1 subset where a_5 = 3: --')
print('a_1: ' + str(dtree.averageGain(foo, m.attributes[0])))
print('a_2: ' + str(dtree.averageGain(foo, m.attributes[1])))
print('a_3: ' + str(dtree.averageGain(foo, m.attributes[2])))
print('a_4: ' + str(dtree.averageGain(foo, m.attributes[3])))
print('a_6: ' + str(dtree.averageGain(foo, m.attributes[5])))

foo = dtree.select(m.monk1, m.attributes[4], 1)
print('-- is a_5 with value = 1 a majority class? --')
print(dtree.mostCommon(foo))
        # entropy of a subset: how predictable the label of a record with
        # this attribute value is; 'weighted' accumulates the subset
        # entropies, weighted by subset size
        weighted += entropy(subset) * len(subset)
    # the lower the subset entropies, the better the attribute separates
    # the data, so a higher information gain means a better split attribute
    return entropy(dataset) - weighted / len(dataset)


def select(dataset, attribute, value):
    "Return subset of data samples where the attribute has the given value"
    return [x for x in dataset if x.attribute[attribute] == value]


# Test how the select function works
for v in m.attributes[0].values:
    subset = dtree.select(m.monk1, m.attributes[0], v)

# Information gain calculation (dtype=object because the datasets differ in length)
Data = np.array([np.array(m.monk1), np.array(m.monk2), np.array(m.monk3)],
                dtype=object)

# Dictionary where keys are the IG values and values are the attribute names.
# Note that gains from all three datasets share one dict, so the maximum
# below is taken across monk1, monk2 and monk3 together.
IG = {}
for i in range(3):
    for j in range(len(m.attributes)):
        IG[dtree.averageGain(Data[i], m.attributes[j])] = m.attributes[j].name

# max IG
maxIGKey = max(IG.keys())
# the attribute with the highest IG, such as A5 in monk1
maxIGValue = IG[maxIGKey]
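# The gain computed above is the standard ID3 information gain,
# Gain(S, A) = Entropy(S) - sum_v |S_v|/|S| * Entropy(S_v).
# A quick numeric cross-check against the library (a sketch):
a5 = m.attributes[4]
manual_gain = dtree.entropy(m.monk1) - sum(
    len(sv) / len(m.monk1) * dtree.entropy(sv)
    for sv in (dtree.select(m.monk1, a5, v) for v in a5.values))
print(abs(manual_gain - dtree.averageGain(m.monk1, a5)) < 1e-12)  # True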
__author__ = 'swebo_000'

import monkdata as m
import dtree as d
#import drawtree

print("3. Building decision tree: \n")
print("Subset division of MONK-1 at attribute 5: \n")

subsets = []
for x in range(len(m.attributes[4].values)):
    subsets.append(d.select(m.monk1, m.attributes[4], x + 1))

for i, subset in enumerate(subsets):
    gain = 0
    maxgain = 0
    bestatr = 0
    print("Value: %d" % (i + 1))
    print("Most common: " + str(d.mostCommon(subset)))
    for x in range(len(m.attributes)):
        gain = d.averageGain(subset, m.attributes[x])
        print("Attribute A%d: %f" % (x + 1, gain))
        if gain > maxgain:
            maxgain = gain
            bestatr = x
    print("Attribute with best information gain: A%d \n" % (bestatr + 1))

maxgain = 0
bestatr = 0
print("MONK-1:")
def splittingHighestAttribute():
    a5 = m.attributes[4]
    for trainingSet in trainingset:
        for attributeValue in a5.values:
            subset = d.select(trainingSet.dataset, a5, attributeValue)
            getAverageInformationGain(
                subset, trainingSet.name + " on A5 = " + str(attributeValue))
#     ag3.append(gain3)
# print(ag1)  # a5
# print(ag2)  # a5
# print(ag3)  # a2, a5

#**********************************
# Assignment 5
a = dtree.bestAttribute(mdata.monk1, mdata.attributes)
attributesLeft = [x for x in mdata.attributes if x != a]
#print(a, attributesLeft)  # a5

subsets = []
for v in a.values:
    temp = dtree.select(mdata.monk1, a, v)
    subsets.append(temp)

ag_in2level = []
subsets_ag = []
#print(len(a.values))
for subset in subsets:
    for i in range(len(attributesLeft)):
        gain1 = dtree.averageGain(subset, attributesLeft[i])
        ag_in2level.append(gain1)
    subsets_ag.append(ag_in2level)
    ag_in2level = []
#print(subsets_ag)

def Tree(dataset, attributes, maxdepth=3):
# output
print()
print('-------- Assignment 3 --------')
print('information gain:')
print('monk 1 (a1->a6): ' + str(np.transpose(Ga_m1)))
print('monk 2 (a1->a6): ' + str(np.transpose(Ga_m2)))
print('monk 3 (a1->a6): ' + str(np.transpose(Ga_m3)))
print('')

## Assignment 5: build decision tree
Ga_m11 = np.empty([6, 4], dtype=float)
en_m11 = np.empty([1, 4], dtype=float)
for i in range(4):
    en_m11[0, i] = d.entropy(d.select(m.monk1, m.attributes[4], i + 1))
    for j in range(6):
        Ga_m11[j, i] = d.averageGain(
            d.select(m.monk1, m.attributes[4], i + 1), m.attributes[j])

# majority class
#mc = d.mostCommon(d.select(m.monk1, m.attributes[4], 1))

t1 = d.buildTree(m.monk1, m.attributes)
t2 = d.buildTree(m.monk2, m.attributes)
t3 = d.buildTree(m.monk3, m.attributes)

# output
print()
print('-------- Assignment 5 --------')
print('decision tree of monk1:')
print(t1)
print('train set error: ' + str(round((1 - d.check(t1, m.monk1)) * 100, 2)) + '%')
print(" ") print("Information gain for the MONK3 dataset") for i in range(0, 6): print(" Info Gain ", m.attributes[i], ":", d.averageGain(monk3, m.attributes[i])) print(" ") print("#----------------Assignment 5 ----------------#") print(" ") # Splitting the tree for MONK1 data print("#---- For MONK1 dataset -----#") for i in (1, 2, 3, 4): for j in range(0, 6): print("Information gain for split for A5 at value ", i, " at", m.attributes[j], " :", d.averageGain(d.select(monk1, A5, i), m.attributes[j])) print(" ") print(" ") # Build Tree using PyQT graph # MONK1 Tree #draw.drawTree(d.buildTree(monk1, m.attributes)) print("Classification error for dataset [in fraction]") print("MONK1 with train data ", (1 - d.check(d.buildTree(monk1, m.attributes), monk1))) print("MONK1 with test data ", (1 - d.check(d.buildTree(monk1, m.attributes), monktest1))) # MONK2 Tree #draw.drawTree(d.buildTree(monk2, m.attributes)) print("MONK2 with train data ",
    for i in range(len(m.attributes)):
        if i not in exclude:
            attribute_key = "A" + str(i + 1)
            attribute = m.attributes[i]
            avg_gain = d.averageGain(dataset, attribute)
            avg_gain_dict[attribute_key] = avg_gain
    return avg_gain_dict


d.selected_attribute = "A5"
print("\nAssignment 5.1 a) - Split monk1 into subsets according to selected attribute {}\n"
      .format(d.selected_attribute))
idx = int(d.selected_attribute[-1]) - 1
# a5 takes the values 1-4; select the a5 == 1 subset (the original passed
# True, which only works because True == 1 in Python)
subset_A5_true = d.select(m.monk1, m.attributes[idx], 1)
subset_A12346 = [x for x in m.monk1 if x not in subset_A5_true]

print("\nAssignment 5.1 b) - Where do we find the highest average gain?")
IG_dict_A12346 = get_avg_gain_dict_exclude(subset_A12346, exclude=[idx])
IG_dict_A12346 = sorted(IG_dict_A12346.items(), key=lambda kv: kv[1], reverse=True)
# print("\n", IG_dict_A12346)
for key, gain in IG_dict_A12346:
    print(key, ": ", gain)
print()

d.selected_attribute = "A1"
print("\nAssignment 5.1 c) - Split into further subsets according to selected attribute {}\n"
      .format(d.selected_attribute))
import monkdata as m
import dtree as dt
import drawtree as draw

entropy = dt.entropy(m.monk1)

# Find the attribute with the highest information gain
best_gain = 0
best_attribute = m.attributes[0]
for attribute in m.attributes:
    gain = dt.averageGain(m.monk1, attribute)
    if gain > best_gain:
        best_gain = gain
        best_attribute = attribute

# Majority class of each subset produced by splitting on the best attribute
values = {v: dt.mostCommon(dt.select(m.monk1, best_attribute, v))
          for v in best_attribute.values}
print(best_attribute, values)

draw.drawTree(dt.buildTree(m.monk1, m.attributes, 2))
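# A follow-up sketch: compare the depth-limited tree with a fully grown one
# (dtree.check returns the fraction of correctly classified samples):
t2 = dt.buildTree(m.monk1, m.attributes, 2)
t_full = dt.buildTree(m.monk1, m.attributes)
print("depth-2 test error:  ", 1 - dt.check(t2, m.monk1test))
print("full tree test error:", 1 - dt.check(t_full, m.monk1test))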