def A2():
    # Assignment 2: print the information gain of every attribute for all
    # three MONK training sets (Python 2 print statements).
    # NOTE(review): near-duplicate of the following A2 definition; a later
    # definition with the same name shadows an earlier one at import time.
    for att in m.attributes:
        #print att[ 0 ]
        print "Gain in Monk1 is : ", dT.averageGain( m.monk1, att )
        print "Gain in Monk2 is : ", dT.averageGain( m.monk2, att )
        print "Gain in Monk3 is : ", dT.averageGain( m.monk3, att )
        print '\n'
def A2(): for att in m.attributes: #print att[ 0 ] print "Gain in Monk1 is : ", dT.averageGain(m.monk1, att) print "Gain in Monk2 is : ", dT.averageGain(m.monk2, att) print "Gain in Monk3 is : ", dT.averageGain(m.monk3, att) print '\n'
def assignment2():
    """Assignment 2: for each MONK training set, report the attribute with
    the highest information gain (the best root split), 1-based as "aN"."""
    print("Assignment 2")
    gain_rows = []
    for dataset in (m.monk1, m.monk2, m.monk3):
        gain_rows.append([d.averageGain(dataset, attr) for attr in m.attributes])
    for row in gain_rows:
        # index() returns the FIRST maximal position, matching the original
        # max(enumerate(...)) tie-breaking.
        best = row.index(max(row))
        print("Split on: a%d" % (best + 1))
def assignment3():
    """Assignment 3: print an org-mode table of the information gain of
    attributes a1-a6 for each MONK training set (5-decimal rounding)."""
    rows = []
    for label, dataset in (('monk1', m.monk1), ('monk2', m.monk2), ('monk3', m.monk3)):
        rows.append([label] + [round(tree.averageGain(dataset, m.attributes[j]), 5)
                               for j in range(0, 6)])
    print(tabulate(rows,
                   headers=['Dataset', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6'],
                   tablefmt='orgtbl'))
def getBestAttribute(dataset, attributes, available):
    """Return the index (into ``attributes``) of the available attribute with
    the highest information gain on ``dataset``, or None if nothing beats -1.

    ``available`` is a parallel sequence of booleans marking attributes that
    may still be split on.
    """
    mostGain = -1
    bestAttribute = None
    for i in range(len(attributes)):
        if not available[i]:
            continue
        # Fix: compute the gain once per attribute — the original called
        # averageGain twice (once in the test, once for the assignment).
        gain = d.averageGain(dataset, attributes[i])
        if gain > mostGain:
            mostGain = gain
            bestAttribute = i
    return bestAttribute
def getBestAttribute(dataset, attributes, available):
    """Pick the best split attribute.

    Returns the index into ``attributes`` of the available attribute whose
    information gain on ``dataset`` is highest, or None when no attribute is
    available. ``available`` is a parallel boolean sequence.
    """
    mostGain = -1
    bestAttribute = None
    for i in range(len(attributes)):
        if not available[i]:
            continue
        # Fix: hoist the gain into a local so averageGain runs once per
        # attribute instead of twice for every new best candidate.
        gain = d.averageGain(dataset, attributes[i])
        if gain > mostGain:
            mostGain = gain
            bestAttribute = i
    return bestAttribute
def gain():
    """Return three lists (monk1-3), each holding (attribute, averageGain)
    pairs for the first six attributes of that training set."""
    results = []
    for dataset in (mdata.monk1, mdata.monk2, mdata.monk3):
        pairs = [(mdata.attributes[i], dtree.averageGain(dataset, mdata.attributes[i]))
                 for i in range(0, 6)]
        results.append(pairs)
    return results
def assignment3():
    """Print the per-attribute entropy gains for the three MONK sets."""
    labeled = (("first", m.monk1), ("second", m.monk2), ("third", m.monk3))
    for label, dataset in labeled:
        gains = [d.averageGain(dataset, m.attributes[i]) for i in range(0, 6)]
        print("The gains of entropy on the %s dataset per attribute is %s"
              % (label, gains))
def calc_gain():
    # Assignment 2 (Python 2): print a tab-separated table of the
    # information gain of attributes a1-a6 for the Monk1-3 training sets.
    print "\n------------------------------\nAssignment 2 - Average gain\n------------------------------"
    i = 1  # NOTE(review): unused counter
    print "Dataset\t a1\t\t a2\t\t a3\t\t a4\t\t a5\t\t a6"
    s = "Monk1\t"
    for attr in data.attributes:
        s = s + "%.6f\t" % (dt.averageGain(data.monk1, attr))
    print s
    s = "Monk2\t"
    for attr in data.attributes:
        s = s + "%.6f\t" % (dt.averageGain(data.monk2, attr))
    print s
    s = "Monk3\t"
    for attr in data.attributes:
        s = s + "%.6f\t" % (dt.averageGain(data.monk3, attr))
    print s
def printGains(datasets, attributes):
    """Print the information gain of every attribute for every dataset,
    with a dashed separator line after each dataset."""
    for set_idx, dataset in enumerate(datasets):
        for attr_idx, attribute in enumerate(attributes):
            value = d.averageGain(dataset, attribute)
            print("Gain monk" + str(set_idx + 1) + ", a" + str(attr_idx + 1)
                  + ": " + str(value))
        print("---------------")
def infoGainOnSubsets(subsets):
    """Print the per-attribute information gain summed over all ``subsets``.

    Each subset corresponds to one branch of the previous split; summing the
    gains over the level-two nodes shows which attribute is best overall
    (attribute 1 in the lab's MONK-1 example).
    """
    # Fix (idiom): the original used manual while-loops with hand-maintained
    # indices; for-loops/comprehensions are equivalent and clearer.
    per_subset = [[dtree.averageGain(sset, attr) for attr in m.attributes]
                  for sset in subsets]
    # Element-wise accumulation; starts from zeros so an empty `subsets`
    # still yields [0, 0, ...] exactly like the original.
    result = [0] * len(m.attributes)
    for gains in per_subset:
        result = [acc + g for acc, g in zip(result, gains)]
    print(result)
    return
def get_avg_gain_dict(dataset):
    """Map attribute labels "A1".."A6" to their information gain on
    ``dataset``."""
    return {"A" + str(idx + 1): d.averageGain(dataset, attr)
            for idx, attr in enumerate(m.attributes)}
def information_gain(datasets):
    """Return a (len(datasets) x len(m.attributes)) matrix of information
    gains, each rounded to 4 decimals."""
    attribute_labels = ('A1', 'A2', 'A3', 'A4', 'A5', 'A6')
    matrix = np.zeros((len(datasets), len(m.attributes)))
    for row, dataset in enumerate(datasets):
        for col in range(len(attribute_labels)):
            matrix[row, col] = round(d.averageGain(dataset, m.attributes[col]), 4)
    return matrix
def print_average_gain(datasets, attributes):
    """Print the information gain of each attribute for each dataset.

    Both arguments are mappings: name -> dataset and name -> attribute.
    """
    print("Information gain")
    for set_name in datasets:
        print("For " + set_name + " the:")
        for attr_name in attributes:
            value = d.averageGain(datasets[set_name], attributes[attr_name])
            print("information gain for " + attr_name + " was: " + str(value))
def informationGainCalculation():
    """Print, for each MONK training set, the information gain of every
    attribute and the best attribute to split on."""
    print("Information gain results ", "\n")
    # Fix (DRY): the original repeated the identical loop verbatim for each
    # dataset; iterate over (label, dataset) pairs instead. Output is
    # byte-identical.
    for label, dataset in (("Monk1", m.monk1), ("Monk2", m.monk2), ("Monk3", m.monk3)):
        for attributeIndex in range(0, 6):
            result = d.averageGain(dataset, m.attributes[attributeIndex])
            print(label + "| ", attributeIndex + 1, ": ", result, " ")
        print("Best attribute: ", d.bestAttribute(dataset, m.attributes), "\n")
def bestAttribute(dataset, attributes):
    """Return the attribute with the highest information gain on ``dataset``.

    Ties (including the all-zero-gain case) resolve to the earliest
    attribute, since only a strictly greater gain replaces the current best.
    """
    best = attributes[0]
    bestGain = 0
    for candidate in attributes:
        candidateGain = dt.averageGain(dataset, candidate)
        if candidateGain > bestGain:
            bestGain = candidateGain
            best = candidate
    return best
def chosenNode(DataSet, attributes):
    """Return the name of the attribute with the highest information gain.

    Fix: the original built a dict keyed by the (float) gain value, so two
    attributes with identical gain silently overwrote each other; compare
    (gain, attribute) pairs directly instead. On exact ties the LAST
    attribute wins, matching the original dict-overwrite behaviour.
    """
    bestGain = None
    bestName = None
    for attribute in attributes:
        g = dtree.averageGain(DataSet, attribute)
        if bestGain is None or g >= bestGain:
            bestGain = g
            bestName = attribute.name
    #print('chosen node: %s'%(bestName))
    return bestName
def get_avg_gain_dict_exclude(dataset, exclude=()):
    """Map labels "A1".."A6" to their information gain on ``dataset``,
    skipping the attribute indices listed in ``exclude``.

    Fix: the original used a mutable default argument (``exclude=[]``);
    an empty tuple gives identical membership behaviour without the
    shared-default hazard.
    """
    return {"A" + str(i + 1): d.averageGain(dataset, m.attributes[i])
            for i in range(len(m.attributes)) if i not in exclude}
def assignment_3():
    """Print each attribute's (rounded) gain on monk1 plus the total gain.

    NOTE(review): ``rnd`` is not defined in this chunk — presumably an alias
    for round() imported elsewhere; confirm.
    """
    total = 0
    for index in range(6):
        g = dtree.averageGain(m.monk1, m.attributes[index])
        total += g
        print(index + 1, rnd(g, 8))
    print(total)
def assignment2():
    # Assignment 2 (Python 2): print a Texttable of the information gain of
    # attributes a1-a6 for the three MONK training sets, used to select the
    # root of the decision tree.
    print "--- Assignment 2 ---"
    print "Selecting the root of the decision tree"
    table = Texttable(max_width=100)
    table.add_row(["Dataset", "a1", "a2", "a3", "a4", "a5", "a6"])
    for i in range(3):
        # Python 2 map() returns a list, so it can be concatenated below.
        gains = map(lambda att: d.averageGain(monkdata[i],att), m.attributes)
        table.add_row(["Monk-" + str(i+1)] + gains)
    print table.draw()
    print
def optimal_attr_split(data, attributes):
    """Print each attribute's gain on ``data`` and return the index (into
    ``attributes``) of the attribute with the highest gain."""
    bestGain = -1
    bestIndex = 0
    for index, attribute in enumerate(attributes):
        current = dtree.averageGain(data, attribute)
        print("Attribute " + str(index + 1) + " information gain: " + str(current))
        if current > bestGain:
            bestGain = current
            bestIndex = index
    print("Best split attribute is attribute " + str(bestIndex + 1))
    return bestIndex
def informationGain(data):
    """Compute the information gain of attributes a1-a6 on data.monk3.

    Fix: the original assigned the six gains to locals and discarded them
    (implicitly returning None); return them as a list instead. Backward
    compatible for callers that ignored the result.
    """
    return [tree.averageGain(data.monk3, data.attributes[i]) for i in range(6)]
def getMaxGain(dataset, attributes):
    """Print all gains ('&'-joined, 5 decimals, LaTeX-table friendly), then
    print and return the (attribute, gain) pair with the highest gain."""
    gains = []
    for attribute in attributes:
        gains.append((attribute, dtree.averageGain(dataset, attribute)))
    print(" & ".join(str(round(value, 5)) for _, value in gains))
    # max() keeps the first maximal pair on ties, like the original.
    attribute_max = max(gains, key=lambda pair: pair[1])
    print("Max gain on attribute", attribute_max)
    return attribute_max
def compute_gain():
    """Print a PrettyTable of information gains (rounded to 10 decimals)
    for the three MONK training sets."""
    print ("Compute information gain of attributes in training datasets:")
    ig_table = PrettyTable(['Dataset', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6'])
    for set_idx in range(3):
        row = ["MONK-{0}".format(set_idx + 1)]
        row.extend(round(dt.averageGain(monks[set_idx], m.attributes[a]), 10)
                   for a in range(6))
        ig_table.add_row(row)
    print(ig_table)
    print ()
def find_information_gain(data_set, attributes):
    """Return the information gain of the first six attributes on
    ``data_set``, in attribute order."""
    return [d.averageGain(data_set, attributes[index]) for index in range(6)]
def calc_next_level():
    # Expand the tree one level below the root split on a5 (Python 2):
    # for every value of a5, print each attribute's gain on the selected
    # subset, pick the best attribute, label its branches '+'/'-' by the
    # branch's majority class, and finally compare the hand-built tree
    # string with dt.buildTree at depth 2.
    #print "\nAverage gain when a5 is choosen"
    print "\nA5\t a1\t\t a2\t\t a3\t\t a4\t\t a5\t\t a6"
    s = "A5("
    for val in data.attributes[4].values:
        subset = dt.select(data.monk1, data.attributes[4], val)
        t = "\t"
        for attr in data.attributes:
            t = t + "%.6f\t" % (dt.averageGain(subset, attr))
        print val , t
        best = dt.bestAttribute(subset, data.attributes)
        s = s + best.name + "("
        #print "best attribute: ", best.name
        for value in best.values:
            #print "choose: ", value, "mostCommon: ", dt.mostCommon(dt.select(subset, best, value))
            # '+' if the majority class of this branch is positive, else '-'
            if(dt.mostCommon(dt.select(subset, best, value))):
                s = s + "+"
            else:
                s = s + "-"
        s = s + ")"
    s = s + ")"
    print "\nOur tree:\t", s
    print "Build tree:\t", dt.buildTree(data.monk1, data.attributes, 2)
# Assignment 3 script: split MONK-1 on attribute a5, inspect each resulting
# subset (majority class + per-attribute gain + best attribute), then build
# full trees and report error rates.
print("3. Building decision tree: \n")
print("Subset division of MONK-1 at attribute 5: \n")
subsets = [];
for x in range(0, len(m.attributes[4].values)):
    # attribute values are 1-based, hence x+1
    subsets.append(d.select(m.monk1, m.attributes[4], x+1))
for set in subsets:
    gain = 0;
    maxgain = 0;
    bestatr = 0;
    print("Value: %d" % (subsets.index(set) + 1))
    print("Most common: " + str(d.mostCommon(set)))
    for x in range(0, len(m.attributes)):
        gain = d.averageGain(set, m.attributes[x]);
        print("Attribute A%d: %f" % (x+1, gain))
        if(gain > maxgain):
            maxgain = gain;
            bestatr = x;
    print("Attribute with best information gain: A%d \n" % (bestatr + 1));
    # reset for the next subset (redundant with the per-iteration
    # initialisation above, but harmless)
    maxgain = 0
    bestatr = 0;
print("MONK-1:")
t = d.buildTree(m.monk1, m.attributes)
# 1 - check(...) is the error rate (check returns the score on a set).
print("Testing set error %f: " % (1 - d.check(t, m.monk1test)))
print("Training set error %f: \n" % (1 - d.check(t, m.monk1)))
print("MONK-2:")
# Lab script: entropy (Assignment 1) and information gain (Assignment 2)
# for the MONK datasets.
# NOTE(review): `m` (monkdata) is not imported in this chunk — presumably
# imported elsewhere in the file; confirm.
import dtree
import drawtree_qt5 as draw
import numpy as np
import matplotlib.pyplot as plt
import random

entropyMonk1 = dtree.entropy(m.monk1)
entropyMonk2 = dtree.entropy(m.monk2)
entropyMonk3 = dtree.entropy(m.monk3)
print(f'Entropy for monk1: {entropyMonk1}')
print(f'Entropy for monk2: {entropyMonk2}')
print(f'Entropy for monk3: {entropyMonk3}')

# Information gain of every attribute, one list per training set.
informationGainMonk1 = list(
    map(lambda x: dtree.averageGain(m.monk1, x), m.attributes))
informationGainMonk2 = list(
    map(lambda x: dtree.averageGain(m.monk2, x), m.attributes))
informationGainMonk3 = list(
    map(lambda x: dtree.averageGain(m.monk3, x), m.attributes))
print(
    f'Information gain for all 6 attuributes for monk1: {informationGainMonk1}'
)
print(
    f'Information gain for all 6 attuributes for monk2: {informationGainMonk2}'
)
print(
    f'Information gain for all 6 attuributes for monk3: {informationGainMonk3}'
)
def ASSIGNMENT3(dataset):
    """Print the average information gain of every attribute (a1-a6) on
    ``dataset``."""
    for position, attr in enumerate(m.attributes, start=1):
        value = dtree.averageGain(dataset, attr)
        print("Average gain of a{:d}: {:f}".format(position, value))
# Needed import for drawing the decision tree. #import drawtree as drawtree # Datasets train = [monk.monk1, monk.monk2, monk.monk3] test = [monk.monk1test, monk.monk2test, monk.monk3test] print("Entropy for monk1 dataset is {}".format(dt.entropy(monk.monk1))) print("Entropy for monk2 dataset is {}".format(dt.entropy(monk.monk2))) print("Entropy for monk3 dataset is {}".format(dt.entropy(monk.monk3))) for i, dataset in enumerate(train): print("") print("Average gain for monk{} for each attribute".format(i + 1)) for j, attribute in enumerate(monk.attributes): print("a{} = {}".format(j + 1, dt.averageGain(dataset, attribute))) monk1a5 = [dt.select(monk.monk1, monk.attributes[4], 1), dt.select(monk.monk1, monk.attributes[4], 2), dt.select(monk.monk1, monk.attributes[4], 3), dt.select(monk.monk1, monk.attributes[4], 4)] for i, monk1 in enumerate(monk1a5): print("") print("Average gain for monk1 where a5 = {} for each attribute".format(i + 1)) for j, attribute in enumerate(monk.attributes): if j != 4: print("a{} = {}".format(j + 1, dt.averageGain(monk1, attribute))) print("Majority class = {}".format(dt.mostCommon(monk1))) # Building the decision tree. tree1 = dt.buildTree(monk.monk1, monk.attributes) tree2 = dt.buildTree(monk.monk2, monk.attributes)
def avgGain(datasets):
    """Return one row per dataset: the information gain of every attribute
    in m.attributes on that dataset."""
    rows = []
    for ds in datasets:
        rows.append([dtree.averageGain(ds, attr) for attr in m.attributes])
    return rows
def infGain(s): for x in range(0, 6): print "a", x + 1, " ", d.averageGain(s, m.attributes[x])
import monkdata as m
import dtree as dtree

# Gains within the monk1 subset where a5 == 3 (Python 2). a5's own gain is
# omitted since its value is already fixed by the selection.
foo = dtree.select(m.monk1, m.attributes[4], 3)
print '-- information gain of monk-1 dataset: --'
print 'a_1: ' + str(dtree.averageGain(foo, m.attributes[0]))
print 'a_2: ' + str(dtree.averageGain(foo, m.attributes[1]))
print 'a_3: ' + str(dtree.averageGain(foo, m.attributes[2]))
print 'a_4: ' + str(dtree.averageGain(foo, m.attributes[3]))
print 'a_6: ' + str(dtree.averageGain(foo, m.attributes[5]))

# Majority class of the subset where a5 == 1.
foo = dtree.select(m.monk1, m.attributes[4], 1)
print '-- is a_5 with value = 1 a majority class? --'
print dtree.mostCommon(foo)
def printGain(dataset, attributes):
    """Print the information gain of each attribute on ``dataset``,
    labelled a1..aN."""
    for position, attribute in enumerate(attributes, start=1):
        print("Gain a" + str(position) + ": "
              + str(d.averageGain(dataset, attribute)))
# Assignment 1: entropy of the three MONK training sets (5-decimal rounding).
monkEntropy = [round(t.entropy(m.monk1), 5),
               round(t.entropy(m.monk2), 5),
               round(t.entropy(m.monk3), 5)]
# --Answer to Assignment 1
# (Fix: the original used bare string literals as section markers; they are
# no-op expression statements, so real comments are behaviourally identical.)
print(monkEntropy, "\n")

# 4 INFORMATION GAIN -- Assignment 2
monkTrainingSets = [m.monk1, m.monk2, m.monk3]
informationGain = []
print("Assignment 2: Expected information gains")
att = []  # collects the gain values for one dataset's attributes
for monk in monkTrainingSets:  # for each data set
    for attribute in m.attributes:  # for every attribute
        # gain of splitting by the attribute, rounded to 5 decimals
        att.append(round(t.averageGain(monk, attribute), 5))
    informationGain.append(att)  # save a "row vector"
    att = []

# --Answer to Assignment 2
print(informationGain[2], "\n")
# print(t.bestAttribute(m.monk1, m.attributes))

# Attribute a5 has the largest information gain, meaning that it reduces
# the uncertainty the most. Thus, it should be used for splitting at the
# root node.
def assignment2():
    """Print the information gain of attributes a1-a6 for Monk-1..3.

    Fix (DRY): the original spelled out all 18 averageGain calls verbatim;
    loop over the datasets and star-unpack the gain list into print(), which
    produces identical space-separated output.
    """
    for label, dataset in (("Monk-1", m.monk1), ("Monk-2", m.monk2), ("Monk-3", m.monk3)):
        print("Average Gain for ", label)
        print(*[d.averageGain(dataset, m.attributes[i]) for i in range(6)])
def myBuildTree(dataset, levels):
    # Hand-rolled, level-by-level tree construction for inspection:
    # at each level, print every attribute's gain, split the (sub)set on the
    # best attribute, and record the chosen attributes in `splits`.
    # Returns the list of split attributes.
    # NOTE(review): reconstructed from a collapsed line — the exact nesting
    # of some statements (e.g. the datasubsetsAvgGains appends and the
    # trailing newdatasubsets[0] handling) is ambiguous; confirm against the
    # original file. Only newdatasubsets[0] is carried to the next level, so
    # branches beyond the first subset are dropped.
    treeLevels = []          # snapshot of the data at each level
    splits = []              # attributes chosen for splitting, in order
    treeLevels.append(dataset)
    datasubsets = dataset
    datasubsetsAvgGains = []
    for level in range(0, levels):
        print("\n===Level #: ", level)
        if level == 0:
            attribAvgGains = []
            largestGain = 0
            largestAttribIndex = 0
            # only split reasonably large sets (> 5 samples)
            if len(datasubsets) > 5:
                for attribute in range(0, len(m.attributes)):
                    avgGain = d.averageGain(datasubsets, m.attributes[attribute])
                    if avgGain > largestGain:
                        largestGain = avgGain
                        largestAttribIndex = attribute
                    attribAvgGains.append(avgGain)
                    print("Attribute: ", attribute, "\t\tAverage gain: ", avgGain)
                datasubsetsAvgGains.append(attribAvgGains)
                print("---Splitting at attribute: ", m.attributes[largestAttribIndex])
                datasubsets = split(datasubsets, m.attributes[largestAttribIndex])
                splits.append(m.attributes[largestAttribIndex])
                treeLevels.append(datasubsets)
        elif level > 0:
            print("---No. of datasets: ", len(datasubsets))
            newdatasubsets = []
            for i in range(0, len(datasubsets)):
                print("\n---Datasubset: ", i, "\t\tEntropy: ", d.entropy(datasubsets[i]))
                attribAvgGains = []
                newdatasubsets = []
                largestGain = 0
                largestAttribIndex = 0
                if len(datasubsets[i]) > 5:
                    for attribute in range(0, len(m.attributes)):
                        avgGain = d.averageGain(datasubsets[i], m.attributes[attribute])
                        if avgGain > largestGain:
                            largestGain = avgGain
                            largestAttribIndex = attribute
                        attribAvgGains.append(avgGain)
                        print("Attribute: ", attribute, "\t\tAverage gain: ", avgGain)
                    # only split when the last computed gain is positive
                    # (NOTE(review): tests avgGain, not largestGain)
                    if avgGain > 0:
                        print("---Splitting at attribute: ", m.attributes[largestAttribIndex].name)
                        newdatasubsets.append(split(datasubsets[i], m.attributes[largestAttribIndex]))
                        splits.append(m.attributes[largestAttribIndex])
                    else:
                        print(
                            "---Skipping splitting at attribute: ",
                            m.attributes[largestAttribIndex].name,
                            "Dataset #",
                            i,
                        )
                    datasubsetsAvgGains.append(attribAvgGains)
            if len(newdatasubsets[0]) > 1:
                datasubsets = newdatasubsets[0]
                print("---No. of New datasets: ", len(datasubsets))
                treeLevels.append(datasubsets)
    return splits
__author__ = 'swebo_000'

import monkdata as m
import dtree as d
#import drawtree

# Entropy (Assignment 1) and information gain (Assignment 2) for MONK-1..3.
monkset = [m.monk1, m.monk2, m.monk3]
print("1. Entropy of the MONK datasets:")
for x in range(0, len(monkset)):
    print("MONK-%d: %f" % (x+1, d.entropy(monkset[x])))
print();

print("2. Information gain from attributes:")
for set in monkset:
    # index() works because the three datasets are distinct list objects
    print("MONK-%d" % (monkset.index(set) + 1))
    for x in range(0, len(m.attributes)):
        print("Attribute %d: %f" %(x+1, d.averageGain(set, m.attributes[x])))
    print()
pruned_trees_performance = [0 for x in range(len(pruned_trees))] for candidate in pruned_trees: index = pruned_trees.index(candidate) pruned_trees_performance[index] = d.check(candidate, validation) if d.check(tree, validation) <= max(pruned_trees_performance): tree = pruned_trees[pruned_trees_performance.index(max(pruned_trees_performance))] tree = prune_tree(tree, validation) return tree print(d.entropy(m.monk1)) print(d.entropy(m.monk2)) print(d.entropy(m.monk3)) print("\n") print("monk-1: %f %f %f %f %f %f" % ( d.averageGain(m.monk1, m.attributes[0]), d.averageGain(m.monk1, m.attributes[1]), d.averageGain(m.monk1, m.attributes[2]), d.averageGain(m.monk1, m.attributes[3]), d.averageGain(m.monk1, m.attributes[4]), d.averageGain(m.monk1, m.attributes[5]) )) print("monk-2: %f %f %f %f %f %f" % ( d.averageGain(m.monk2, m.attributes[0]), d.averageGain(m.monk2, m.attributes[1]), d.averageGain(m.monk2, m.attributes[2]), d.averageGain(m.monk2, m.attributes[3]), d.averageGain(m.monk2, m.attributes[4]), d.averageGain(m.monk2, m.attributes[5]) )) print("monk-3: %f %f %f %f %f %f" % ( d.averageGain(m.monk3, m.attributes[0]), d.averageGain(m.monk3, m.attributes[1]), d.averageGain(m.monk3, m.attributes[2]), d.averageGain(m.monk3, m.attributes[3]), d.averageGain(m.monk3, m.attributes[4]), d.averageGain(m.monk3, m.attributes[5]) ))
def getClasification(dataset,fraction):
    # Split `dataset` with partition(), build a tree, prune it, and return
    # the best pruned tree's score and the tree itself.
    # NOTE(review): the tree is built on `monk1val` and scored on
    # `monk1train` — the names suggest train/validation roles are swapped
    # relative to the usual pruning procedure; confirm partition()'s
    # return order.
    # NOTE(review): if no pruned tree scores above 0, `bestTree` is never
    # bound and the return raises UnboundLocalError.
    monk1train, monk1val = partition(dataset,fraction)
    testTree = tree.buildTree(monk1val,m.attributes)
    prunedTrees = tree.allPruned(testTree)
    pValue = 0
    for pruned in prunedTrees:
        if(tree.check(pruned,monk1train) > pValue):
            bestTree = pruned
            pValue = tree.check(pruned,monk1train)
    return pValue, bestTree

# Assignment 1 & 2 output (Python 2 prints): entropies, then per-attribute
# information gains for the three training sets.
print "Entropy Monk1: " + str(tree.entropy(m.monk1))
print "Entropy Monk2: " + str(tree.entropy(m.monk2))
print "Entropy Monk3: " + str(tree.entropy(m.monk3))
print "Gain Monk1 a1: " + str(tree.averageGain(m.monk1,m.attributes[0]))
print "Gain Monk1 a2: " + str(tree.averageGain(m.monk1,m.attributes[1]))
print "Gain Monk1 a3: " + str(tree.averageGain(m.monk1,m.attributes[2]))
print "Gain Monk1 a4: " + str(tree.averageGain(m.monk1,m.attributes[3]))
print "Gain Monk1 a5: " + str(tree.averageGain(m.monk1,m.attributes[4]))
print "Gain Monk1 a6: " + str(tree.averageGain(m.monk1,m.attributes[5]))
print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0]))
print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1]))
print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2]))
print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3]))
print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4]))
print "Gain Monk2 a6: " + str(tree.averageGain(m.monk2,m.attributes[5]))
print "Gain Monk3 a1: " + str(tree.averageGain(m.monk3,m.attributes[0]))
print "Gain Monk3 a2: " + str(tree.averageGain(m.monk3,m.attributes[1]))
#!/usr/bin/env python import dtree as d import monkdata as m monkset = [m.monk1, m.monk2, m.monk3] mtrain = [m.monk1test, m.monk2test, m.monk3test] #Assignement 1 print 'Entropy for monk1-3' j = 1 for monk in monkset: #s = '\ta' + str(j++) + ': ' + str(d.entropy(monk)) print d.entropy(monk) #Assignement 2 attributes = [0, 0, 0] print '\nInformation gain for attributes a1 to a6' for i in range(0, len(monkset)): print 'Monk', i+1 s = "" greatest = 0 for x in range(0, 6): averageGain = d.averageGain(monkset[i], m.attributes[x]) if averageGain > greatest: greatest = averageGain s = s + str(averageGain)+ ' ' print s attributes[i] = greatest
def printGains(datasets, attributes):
    """Print every dataset/attribute information gain, with a dashed
    separator after each dataset."""
    dataset_count = len(datasets)
    attribute_count = len(attributes)
    for i in range(dataset_count):
        for j in range(attribute_count):
            gain_value = d.averageGain(datasets[i], attributes[j])
            print("Gain monk" + str(i + 1) + ", a" + str(j + 1) + ": "
                  + str(gain_value))
        print("---------------")
#Entorpy #calling the predefined function that calculates the entropy for all the three datasets #assignment 1 print dt.entropy(m.monk1) print dt.entropy(m.monk2) print dt.entropy(m.monk3) print '\n' ############################################################################## #Information Gain #cycles for calling average gains for all the three datasets and for every attribute #assignment 2 for atr in m.attributes: gain = dt.averageGain(m.monk1, atr) print gain print '\n' for atr in m.attributes: print dt.averageGain(m.monk2, atr) print '\n' for atr in m.attributes: print dt.averageGain(m.monk3, atr) print '\n' ############################################################################# #Building decision trees
def calcgainforset(dataset, name):
    """Print 'Information Gain <set> <attr> <gain>' for every attribute on
    ``dataset``, followed by a blank line."""
    for attribute in m.attributes:
        value = d.averageGain(dataset, attribute)
        print("Information Gain %s %s %f" % (name, attribute.name, value))
    print()
#Assignment 1 calculatee enropy print("") m1e = dt.entropy(m.monk1) m2e = dt.entropy(m.monk2) m3e = dt.entropy(m.monk3) print("Monk1 entropy: ", m1e) print("Monk2 entropy: ", m2e) print("Monk3 entropy: ", m3e) print("") M1 = [] M2 = [None]*6 M3 = [None]*6 for i in range(6): M1.append(dt.averageGain(m.monk1, m.attributes[i])) M2[i] = dt.averageGain(m.monk2, m.attributes[i]) M3[i] = dt.averageGain(m.monk3, m.attributes[i]) print("\ta1: \ta2: \ta3: \ta4: \ta5: \ta6:") print_average_gains(M1, "M1") print_average_gains(M2, "M2") print_average_gains(M3, "M3") #t = dt.buildTree(m.monk1, m.attributes) #dw.drawtree(dt.check(t, m.monk1test)) #print(dt.check(t, m.monk1test))
def calculateGainTuplesForAllAttributes(dataset, attributes):
    """Return (gain, attribute_index) pairs for ``dataset``.

    NOTE(review): gains are computed from the global ``m.attributes`` — the
    ``attributes`` parameter only supplies the count; confirm callers always
    pass m.attributes.
    """
    # Fix (idiom): len() instead of calling the __len__ dunder directly.
    count = len(attributes)
    gainlist = [d.averageGain(dataset, m.attributes[i]) for i in range(count)]
    return zip(gainlist, range(count))
import matplotlib.pyplot as plot

# Entropy (Assignment 1) and expected information gain (Assignment 2) for
# the three MONK training sets.
sets = [monkdata.monk1, monkdata.monk2, monkdata.monk3]
entropies = [dtree.entropy(s) for s in sets]

def printlines(values):
    # Print each row as one comma-separated line.
    for line in values:
        print(', '.join(map(str, line)))

print("Initial entropies:")
print(entropies)
print("")

gain = [[dtree.averageGain(s, attr) for attr in monkdata.attributes] for s in sets]
print("Expected gain:")
printlines(gain)
print("")

def tests(pair):
    # Build a tree on pair[0] (train) and return
    # [label, score on train, score on pair[1] (test)].
    tree=dtree.buildTree(pair[0], monkdata.attributes)
    return [ pair[2], dtree.check(tree,pair[0]), dtree.check(tree,pair[1]) ]

setpairs = [
# (list literal continues beyond this chunk)
# Build full trees for monk1-3 and print the check() score on the training
# and test sets.
t = d.buildTree(m.monk1, m.attributes)
print(d.check(t, m.monk1))
print(d.check(t, m.monk1test))
t = d.buildTree(m.monk2, m.attributes)
print(d.check(t, m.monk2))
print(d.check(t, m.monk2test))
t = d.buildTree(m.monk3, m.attributes)
print(d.check(t, m.monk3))
print(d.check(t, m.monk3test))

# Root split: information gain of every attribute on monk1.
print("First Node IG")
for i in range(0, 6):
    print(d.averageGain(m.monk1, m.attributes[i]))

# Level-two splits: gains within each subset selected by a5's value (1-4).
a5_1 = d.select(m.monk1, m.attributes[4], 1)
a5_2 = d.select(m.monk1, m.attributes[4], 2)
a5_3 = d.select(m.monk1, m.attributes[4], 3)
a5_4 = d.select(m.monk1, m.attributes[4], 4)
print("subset a5_1 IG")
for i in range(6):
    print(d.averageGain(a5_1, m.attributes[i]))
print("subset a5_2 IG")
for i in range(6):
    print(d.averageGain(a5_2, m.attributes[i]))
print("subset a5_3 IG")
accuracy = d.check(final_tree, test_data) #print("Accuracy for Monk1.test", accuracy) return accuracy print(d.entropy(m.monk1)) print(d.entropy(m.monk2)) print(d.entropy(m.monk3)) #Printout the entropy of all datasets. a = list() b = list() c = list() for i in range(0, 6, 1): a.append(d.averageGain(m.monk1, m.attributes[i])) for i in range(0, 6, 1): b.append(d.averageGain(m.monk2, m.attributes[i])) for i in range(0, 6, 1): c.append(d.averageGain(m.monk3, m.attributes[i])) print(a) print(b) print(c) # #Calculate and printout the information get for all properties and datasets. # #r = d.select(m.monk1, m.attributes[1], 2) #for x in r: # print(x.attribute, "Positive:", x.positive)
################ #Setting up lists info_gain_m1 = [] info_gain_m2 = [] info_gain_m3 = [] attribute = [] #starting counter i = 0; #iterating over all the test sets for sets in [info_gain_m1, info_gain_m2, info_gain_m3]: #for all attributes in the sets, the average information gain is added to the list for k in range(6): attribute.append(dtree.averageGain(data_sets[i], m.attributes[k])); sets.append(attribute) attribute = [] i += 1; #print(info_gain_m1) #print(info_gain_m2) #print(info_gain_m3) # Assignment 3 # ################ selected = dtree.select(m.monk1, m.attributes[4], 1) t=dtree.buildTree(m.monk1, m.attributes);
# Assignment 1 output (Python 2): entropies computed earlier in this file
# (init_entropy_monk1..3 are defined above this chunk — confirm).
print "Monk1 entropy: ", init_entropy_monk1
print "Monk2 entropy: ", init_entropy_monk2
print "Monk3 entropy: ", init_entropy_monk3
print
print "------------------------------"
print "-------- Assignment 2 --------"
print

# Information gain of attributes a1-a6 for each training set.
gain_monk1 = []
gain_monk2 = []
gain_monk3 = []
for x in range(0, 6):
    gain_monk1.append(dt.averageGain(m.monk1,m.attributes[x]))
    gain_monk2.append(dt.averageGain(m.monk2,m.attributes[x]))
    gain_monk3.append(dt.averageGain(m.monk3,m.attributes[x]))

print "Dataset\tA1\t\tA2\t\tA3\t\tA4\t\tA5\t\tA6"
print "Monk1: ","\t".join(["%.7f"%y for y in gain_monk1])
print "Monk2: ","\t".join(["%.7f"%y for y in gain_monk2])
print "Monk3: ","\t".join(["%.7f"%y for y in gain_monk3])
print
print "------------------------------"
print "-------- Assignment 3 --------"
print

# Subset of monk1 where a5 == 1.
partition1 = dt.select(m.monk1,m.attributes[4],1)
# Level-two analysis: take the best root attribute for monk1, then compute
# the gains of the remaining attributes within each subset produced by the
# root split.
a = dtree.bestAttribute(mdata.monk1, mdata.attributes)
attributesLeft = [x for x in mdata.attributes if x != a]
#print(a,attributesLeft)
#a5

subsets = []
for v in a.values:
    temp = dtree.select(mdata.monk1, a, v)
    subsets.append(temp)

ag_in2level = []
subsets_ag = []
#print(len(a.values))
for subset in subsets:
    for i in range(len(attributesLeft)):
        gain1 = dtree.averageGain(subset, attributesLeft[i])
        ag_in2level.append(gain1)
    subsets_ag.append(ag_in2level)
    ag_in2level = []
#print(subsets_ag)

def Tree(dataset, attributes, maxdepth=3):
    # NOTE(review): reconstructed from a collapsed line — the placement of
    # the final return (inside Branch vs. Tree) is ambiguous, Branch is
    # never called in this chunk, and as written Tree recurses with no
    # visible base case on maxdepth. Confirm against the original file.
    def Branch(dataset, default, attributes):
        if not dataset:
            return dtree.TreeLeaf(default)
        if dtree.allPositive(dataset):
            return dtree.TreeLeaf(True)
        if dtree.allNegative(dataset):
            return dtree.TreeLeaf(False)
    return Tree(dataset, attributes, maxdepth - 1)
def avgForDataset(dataset):
    """Return the gains of all attributes on ``dataset`` as one string:
    each value formatted to 5 decimals and followed by a space."""
    # Fix (idiom/perf): str.join replaces the original quadratic `+=`
    # concatenation; every value is still trailed by one space, so the
    # output is byte-identical (including the trailing space).
    return "".join("{:.5f} ".format(d.averageGain(dataset, attr))
                   for attr in m.attributes)
def printAverageGain(s, dataset):
    """Print ``s`` followed by the gains of attributes a1-a6 on ``dataset``,
    space-separated with a trailing space, on one line."""
    # Fix (idiom/perf): build with join instead of repeated string
    # concatenation; the printed line is byte-identical to the += loop.
    gains = "".join(str(dt.averageGain(dataset, m.attributes[x])) + " "
                    for x in range(0, 6))
    print(s + gains)

print("Average gain\n")
import monkdata as m
from dtree import entropy
from dtree import averageGain

# Assignment 1: entropy of the three MONK training sets.
print(entropy(m.monk1), 'monk1')
print(entropy(m.monk2), 'monk2')
print(entropy(m.monk3), 'monk3')

# Assignment 2: information gain of each attribute on all three sets.
for i in range(6):
    print("\nattribute ", i)
    print(averageGain(m.monk1, m.attributes[i]))
    print(averageGain(m.monk2, m.attributes[i]))
    print(averageGain(m.monk3, m.attributes[i]))
import monkdata as m
import dtree as dtree

# The three MONK training sets, tagged with display names (Python 2 script).
datasets = [{ 'name': 'monk1', 'ref': m.monk1 }, { 'name': 'monk2', 'ref': m.monk2 }, { 'name': 'monk3', 'ref': m.monk3 }]

print "Sample class: " + m.Sample.__doc__

# TODO:
# print dtree.averageGain(m.monk1, m.attributes)

# Print the information gain of every attribute for each dataset.
# NOTE(review): attribute labels start at a0 here (i starts at 0), unlike
# the 1-based labelling used in the other snippets.
for dataset in datasets:
    print " Dataset: " + dataset['name']
    i = 0
    for attribute in m.attributes:
        print "a%s - %s" % (i, dtree.averageGain(dataset['ref'], attribute))
        i = i + 1
print "monk3 "+ str(d.entropy(m.monk3)) print "" print "" #finding information average gain print " Assignment2" att =[x for x in m.attributes] monkar = [m.monk1, m.monk2, m.monk3] for j in monkar: entropyGain=[] for i in att : entropyGain.append(d.averageGain(j,i)) for i in entropyGain: print i print "The attribute used for splitting in data set MONK%d is A%d which has an entropy of %f"%(monkar.index(j)+1,entropyGain.index(max(entropyGain))+1, max(entropyGain)) print "" print "" #Building decision Trees print "Assignment 3" k=0 print " "+ "e-train " +"e-test" onkar=[m.monk1test, m.monk2test, m.monk3test] for j, i in zip(monkar, onkar): k=k+1