def calc_entropy():
    # Assignment 1: entropy of the three MONK training sets (Python 2 print syntax).
    ent1 = dt.entropy(data.monk1)
    ent2 = dt.entropy(data.monk2)
    ent3 = dt.entropy(data.monk3)
    print "\n------------------------------\nAssignment 1 - Entropy\n------------------------------"
    print "Dataset\tEntropy"
    print "Monk1\t%.6f\nMonk2\t%.6f\nMonk3\t%.6f" % (ent1, ent2, ent3)
def calc_entropy():
    """Print the entropy of each MONK training set."""
    results = [entropy(ds) for ds in (monk1, monk2, monk3)]
    for idx, value in enumerate(results, start=1):
        print('entropy of monk%d' % idx, value)
def assignment1():
    """Print an org-mode table of the entropy of the three MONK training sets."""
    rows = []
    for label, dataset in zip(['MONK1', 'MONK2', 'MONK3'],
                              [m.monk1, m.monk2, m.monk3]):
        rows.append([label, round(tree.entropy(dataset), 5)])
    print(tabulate(rows, headers=['Dataset', 'Entropy'], tablefmt='orgtbl'), '\n')
def ASSIGNMENT1():
    """Print the entropy of each MONK training set."""
    entropies = [dtree.entropy(ds) for ds in (m.monk1, m.monk2, m.monk3)]
    for index, value in enumerate(entropies, start=1):
        print("Entropy of MONK-%d Training Set:" % index, value)
def assignment1():
    """Print the entropy of the three MONK training datasets."""
    labels = ("first", "second", "third")
    values = (d.entropy(m.monk1), d.entropy(m.monk2), d.entropy(m.monk3))
    for label, value in zip(labels, values):
        print("Entropy of %s dataset is %.6f" % (label, value))
def calculateEntropy():
    """Compute, print and return the entropies of the three MONK *test* sets."""
    entropies = [d.entropy(ds) for ds in (m.monk1test, m.monk2test, m.monk3test)]
    print(entropies)
    return entropies
def entropyCalculation():
    """Print the entropy of each MONK training set, one per labelled line."""
    print()
    print("Entropy Results", "\n")
    labelled = ((m.monk1, "MONK1: "), (m.monk2, "MONK2: "), (m.monk3, "MONK3: "))
    for dataset, label in labelled:
        value = d.entropy(dataset)
        print(label, value, "\n")
    print()
def monkEntropyAndInfoGain(monks):
    """Print the entropy of each MONK dataset and the information gain of
    every attribute on that dataset.

    Bug fix: the information-gain lines previously re-printed
    dtree.entropy(monks[i]) instead of the gain of attribute j.
    """
    i = 0
    while (i < len(monks)):
        print("Entropy of dataset for Monk-" + str(i+1) + " = " + str(dtree.entropy(monks[i])))
        j = 0
        while (j < len(m.attributes)):
            # Fixed: report averageGain of attribute j, not the dataset entropy.
            print("For Monk-" + str(i+1) +". Information gain on attribute " + str(j+1) + " = " + str(dtree.averageGain(monks[i], m.attributes[j])))
            j+=1
        i+=1
    print("\n")
    return
def assignment1():
    """Print the entropy of each MONK training set plus a note about MONK-1."""
    print("Assignment 1")
    entropies = tuple(d.entropy(dataset) for dataset in (m.monk1, m.monk2, m.monk3))
    print("MONK-1: %f\nMONK-2: %f\nMONK-3: %f" % entropies)
    print(
        "Note: The impurity of MONK-1 is 1. This can only happen when there is an equal amount of true and false samples in the data."
    )
def entropy_matrix(datasets, attribute_index, max_att_list):
    # Build and print a (dataset x attribute-value) matrix of subset entropies:
    # each dataset is split on its own best attribute (max_att_list[idx]) and
    # the entropy of every value-subset is recorded.
    # NOTE(review): attribute_index is only used to size the columns —
    # presumably all attributes involved have the same number of values; verify.
    entropy_matrix = np.zeros(
        (len(datasets), len(m.attributes[attribute_index].values)))
    for idx, dataset in enumerate(datasets):
        att = m.attributes[max_att_list[idx]]
        for j, v in enumerate(att.values):
            entropy_matrix[idx, j] = d.entropy(d.select(dataset, att, v))
    print(entropy_matrix)
def assignment1():
    # Print a Texttable of entropies for the three MONK training sets
    # (Python 2 print-statement syntax).
    print "--- Assignment 1 ---"
    print "Initial entropy of the datasets"
    table = Texttable(max_width=100)
    table.add_row(["Dataset","Entropy"])
    for i in range(3):
        table.add_row(["Monk-" + str(i+1), d.entropy(monkdata[i])])
    print table.draw()
    print
def assignment4():
    """For each (dataset, attribute) pair, print every value-subset's entropy,
    then the information gain (dataset entropy minus weighted subset entropy)."""
    datasets = [
        (m.monk1, 'monk1', m.attributes[0]),
        (m.monk1, 'monk1', m.attributes[1]),
        (m.monk1, 'monk1', m.attributes[2]),
        (m.monk1, 'monk1', m.attributes[3]),
        (m.monk1, 'monk1 max', m.attributes[4]),
    ]
    for data, name, attribute in datasets:
        weighted_total = 0
        for value in attribute.values:
            subset = dtree.select(data, attribute, value)
            subset_entropy = dtree.entropy(subset)
            print(f'Entropy of S{value} for {name}:\t{subset_entropy}')
            weighted_total += len(subset) / len(data) * subset_entropy
        print(dtree.entropy(data) - weighted_total)
        print()
def main(argv):
    # Assignment 1: entropy of the three MONK training sets (Python 2 prints).
    print "Entropy Monk1: " + str(tree.entropy(m.monk1))
    print "Entropy Monk2: " + str(tree.entropy(m.monk2))
    print "Entropy Monk3: " + str(tree.entropy(m.monk3))
    # Assignment 3: average information gain of every attribute on each set.
    print "Average Gain Monk1(a1): " + str(tree.averageGain(m.monk1, m.attributes[0]))
    print "Average Gain Monk1(a2): " + str(tree.averageGain(m.monk1, m.attributes[1]))
    print "Average Gain Monk1(a3): " + str(tree.averageGain(m.monk1, m.attributes[2]))
    print "Average Gain Monk1(a4): " + str(tree.averageGain(m.monk1, m.attributes[3]))
    print "Average Gain Monk1(a5): " + str(tree.averageGain(m.monk1, m.attributes[4]))
    print "Average Gain Monk1(a6): " + str(tree.averageGain(m.monk1, m.attributes[5]))
    print "Average Gain Monk2(a1): " + str(tree.averageGain(m.monk2, m.attributes[0]))
    print "Average Gain Monk2(a2): " + str(tree.averageGain(m.monk2, m.attributes[1]))
    print "Average Gain Monk2(a3): " + str(tree.averageGain(m.monk2, m.attributes[2]))
    print "Average Gain Monk2(a4): " + str(tree.averageGain(m.monk2, m.attributes[3]))
    print "Average Gain Monk2(a5): " + str(tree.averageGain(m.monk2, m.attributes[4]))
    print "Average Gain Monk2(a6): " + str(tree.averageGain(m.monk2, m.attributes[5]))
    print "Average Gain Monk3(a1): " + str(tree.averageGain(m.monk3, m.attributes[0]))
    print "Average Gain Monk3(a2): " + str(tree.averageGain(m.monk3, m.attributes[1]))
    print "Average Gain Monk3(a3): " + str(tree.averageGain(m.monk3, m.attributes[2]))
    print "Average Gain Monk3(a4): " + str(tree.averageGain(m.monk3, m.attributes[3]))
    print "Average Gain Monk3(a5): " + str(tree.averageGain(m.monk3, m.attributes[4]))
    print "Average Gain Monk3(a6): " + str(tree.averageGain(m.monk3, m.attributes[5]))
    #print "Average Gain Level 2 Monk1(a1): " + str(tree.averageGain(tree.select(m.monk1, m.attributes[0], value), m.attributes[0]))
    #draw.drawTree(tree.buildTree(m.monk1, m.attributes, 2))
    # Build a full tree per dataset and report accuracy on test, then train data.
    t=tree.buildTree(m.monk1,m.attributes);
    print(tree.check(t, m.monk1test))
    print(tree.check(t, m.monk1))
    t2=tree.buildTree(m.monk2,m.attributes);
    print(tree.check(t2, m.monk2test))
    print(tree.check(t2, m.monk2))
    t3=tree.buildTree(m.monk3,m.attributes);
    print(tree.check(t3, m.monk3test))
    print(tree.check(t3, m.monk3))
def compute_entropy():
    """Print a PrettyTable with the entropy of each MONK training set."""
    print ("Compute entropy of training datasets:")
    ent_table = PrettyTable(['Dataset', 'Entropy'])
    for i in range(3):
        row = ["MONK-{0}".format(i+1), round(dt.entropy(monks[i]), 10)]
        ent_table.add_row(row)
    print(ent_table)
    print ()
def assignment1():
    """Print the entropy of every MONK training and test set."""
    names = ("monk1", "monk1Test", "monk2", "monk2test", "monk3", "monk3test")
    datasets = (m.monk1, m.monk1test, m.monk2, m.monk2test, m.monk3, m.monk3test)
    for name, dataset in zip(names, datasets):
        print(name + " entropy: ", d.entropy(dataset))
def buildTreeCustom(dataset, depth):
    # Recursively build (and print, preorder, on one line) a decision tree of
    # at most `depth` levels; leaves print '+'/'-' by majority class.
    if (depth > 0):
        bestAttr = dt.bestAttribute(dataset, m.attributes)
        print(str(bestAttr), end='')
        # Select datasets splits for each value of the bestAttr
        splits = []
        for value in bestAttr.values:
            splits.append(dt.select(dataset, bestAttr, value))
        for split in splits:
            # If entropy of the split > 0, the split is impure and we can
            # further split it. Recursive call with reduced depth.
            if (dt.entropy(split) > 0):
                buildTreeCustom(split, depth - 1)
            else:
                print('+' if dt.mostCommon(split) else '-', end='')
    else:
        # Depth exhausted: emit the majority class of the remaining data.
        print('+' if dt.mostCommon(dataset) else '-', end='')
def ass1():
    """Print the entropy of each MONK training set."""
    datasets = [mdata.monk1, mdata.monk2, mdata.monk3]
    for dataset in datasets:
        print("-------- New dataset --------")
        print("Entropy: " + str(dtree.entropy(dataset)))
import monkdata as m
import dtree as dt
import drawtree as dw

def print_average_gains(av_gain, av_gain_name = "", print_range = 6):
    # Print a row label followed by the first `print_range` gains,
    # tab-separated and rounded to 4 decimals.
    print(av_gain_name, end = "\t")
    for i in range(print_range):
        print(round(av_gain[i], 4), end = "\t")
    print("")

# Assignment 1: calculate entropy of each training set.
print("")
m1e = dt.entropy(m.monk1)
m2e = dt.entropy(m.monk2)
m3e = dt.entropy(m.monk3)
print("Monk1 entropy: ", m1e)
print("Monk2 entropy: ", m2e)
print("Monk3 entropy: ", m3e)
print("")
# Average information gain of each of the six attributes per dataset.
M1 = []
M2 = [None]*6
M3 = [None]*6
for i in range(6):
    M1.append(dt.averageGain(m.monk1, m.attributes[i]))
    M2[i] = dt.averageGain(m.monk2, m.attributes[i])
    M3[i] = dt.averageGain(m.monk3, m.attributes[i])
# Compute the entropy of monk datasets. import monkdata as m import dtree as d monk1 = d.entropy(m.monk1) print "MONK-1 entropy:", monk1 monk2 = d.entropy(m.monk2) print "MONK-2 entropy:", monk2 monk3 = d.entropy(m.monk3) print "MONK-3 entropy:", monk3
import monkdata as m
from dtree import entropy
from dtree import averageGain

# Entropy of each MONK training set.
for label, dataset in (('monk1', m.monk1), ('monk2', m.monk2), ('monk3', m.monk3)):
    print(entropy(dataset), label)

# Average information gain of every attribute, across all three datasets.
for i in range(6):
    print("\nattribute ", i)
    for dataset in (m.monk1, m.monk2, m.monk3):
        print(averageGain(dataset, m.attributes[i]))
import monkdata as m
import dtree as dt
import math as math
import random as r
#import drawtree as draw

# Assignment 1: entropy of each training set.
init_entropy_monk1 = dt.entropy(m.monk1)
init_entropy_monk2 = dt.entropy(m.monk2)
init_entropy_monk3 = dt.entropy(m.monk3)

# Printing results (Python 2 print syntax).
print "-------- Assignment 1 --------"
print
print "Monk1 entropy: ", init_entropy_monk1
print "Monk2 entropy: ", init_entropy_monk2
print "Monk3 entropy: ", init_entropy_monk3
print
print "------------------------------"
print "-------- Assignment 2 --------"
print

# Assignment 2: average gain per attribute.
# NOTE: this chunk is truncated — the loop body continues beyond this view.
gain_monk1 = []
gain_monk2 = []
gain_monk3 = []
for x in range(0, 6):
    gain_monk1.append(dt.averageGain(m.monk1,m.attributes[x]))
import random
import matplotlib.pyplot as plt

# Importing lab specific packages.
sys.path.append('dectrees-py/')
import monkdata as monk
import dtree as dt
# Needed import for drawing the decision tree.
#import drawtree as drawtree

# Datasets
# NOTE(review): `sys` is used above but not imported in this chunk —
# presumably imported earlier in the file; verify.
train = [monk.monk1, monk.monk2, monk.monk3]
test = [monk.monk1test, monk.monk2test, monk.monk3test]

# Entropy of each training set.
print("Entropy for monk1 dataset is {}".format(dt.entropy(monk.monk1)))
print("Entropy for monk2 dataset is {}".format(dt.entropy(monk.monk2)))
print("Entropy for monk3 dataset is {}".format(dt.entropy(monk.monk3)))

# Average gain of every attribute, per dataset.
for i, dataset in enumerate(train):
    print("")
    print("Average gain for monk{} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        print("a{} = {}".format(j + 1, dt.averageGain(dataset, attribute)))

# Subsets of monk1 for each value of attribute a5.
monk1a5 = [dt.select(monk.monk1, monk.attributes[4], 1),
           dt.select(monk.monk1, monk.attributes[4], 2),
           dt.select(monk.monk1, monk.attributes[4], 3),
           dt.select(monk.monk1, monk.attributes[4], 4)]

# NOTE: this chunk is truncated — the inner loop body continues beyond this view.
for i, monk1 in enumerate(monk1a5):
    print("")
    print("Average gain for monk1 where a5 = {} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
__author__ = 'jonas'

import monkdata as m
import dtree

if __name__ == "__main__":
    # Entropy of each MONK training set, kept alongside its display name.
    data = {
        'monk1': {'name': 'MONK-1', 'data': m.monk1, 'entropy': 'NA'},
        'monk2': {'name': "MONK-2", 'data': m.monk2, 'entropy': 'NA'},
        'monk3': {'name': 'MONK-3', 'data': m.monk3, 'entropy': 'NA'}
    }
    # Fixed: loop variable renamed from `set`, which shadowed the builtin.
    for key in data:
        data[key]['entropy'] = dtree.entropy(data[key]['data'])
        print(data[key]['name'] + " entropy: ", data[key]['entropy'])
A4 = m.attributes[3]
A5 = m.attributes[4]
A6 = m.attributes[5]

## DATASET
monk1 = m.monk1
monk2 = m.monk2
monk3 = m.monk3
monktest1 = m.monk1test
monktest2 = m.monk2test
monktest3 = m.monk3test

print("#---------------- Assignment 1 and 2 ----------------#")
print(" ")
# Entropy Calculation from Monk dataset on training variables
print("Entropy Monk1 dataset: ", d.entropy(monk1))
# Fixed typo in the output message: "dataser" -> "dataset".
print("Entropy Monk2 dataset: ", d.entropy(monk2))
print("Entropy Monk3 dataset: ", d.entropy(monk3))
print(" ")
print("#---------------- Assignment 3 and 4 ----------------#")
print(" ")
# Information gain of each of the six attributes, per dataset.
print("Information gain for the MONK1 dataset")
for i in range(0, 6):
    print(" Info Gain ", m.attributes[i], ":", d.averageGain(monk1, m.attributes[i]))
print(" ")
print("Information gain for the MONK2 dataset")
for i in range(0, 6):
    print(" Info Gain ", m.attributes[i], ":", d.averageGain(monk2, m.attributes[i]))
import monkdata as m
import dtree as d
import drawtree as l
import random
from matplotlib import pyplot
from numpy import arange

# Finding entropy of each training set (Python 2 print syntax).
print "Assignment 1"
print "dataset "+"entropy"
print "monk1 "+ str(d.entropy(m.monk1))
print "monk2 "+ str(d.entropy(m.monk2))
print "monk3 "+ str(d.entropy(m.monk3))
print ""
print ""

# Finding average information gain per attribute, per dataset.
# NOTE: this chunk is truncated — the final loop body continues beyond this view.
print " Assignment2"
att =[x for x in m.attributes]
monkar = [m.monk1, m.monk2, m.monk3]
for j in monkar:
    entropyGain=[]
    for i in att :
        entropyGain.append(d.averageGain(j,i))
    for i in entropyGain:
#!/usr/bin/env python import dtree as d import monkdata as m monkset = [m.monk1, m.monk2, m.monk3] mtrain = [m.monk1test, m.monk2test, m.monk3test] #Assignement 1 print 'Entropy for monk1-3' j = 1 for monk in monkset: #s = '\ta' + str(j++) + ': ' + str(d.entropy(monk)) print d.entropy(monk) #Assignement 2 attributes = [0, 0, 0] print '\nInformation gain for attributes a1 to a6' for i in range(0, len(monkset)): print 'Monk', i+1 s = "" greatest = 0 for x in range(0, 6): averageGain = d.averageGain(monkset[i], m.attributes[x]) if averageGain > greatest: greatest = averageGain s = s + str(averageGain)+ ' ' print s attributes[i] = greatest
__author__ = 'swebo_000'

import monkdata as m
import dtree as d
#import drawtree

monkset = [m.monk1, m.monk2, m.monk3]

# Assignment 1: entropy of each MONK training set.
print("1. Entropy of the MONK datasets:")
for x in range(0, len(monkset)):
    print("MONK-%d: %f" % (x+1, d.entropy(monkset[x])))
print()

# Assignment 2: information gain of every attribute, per dataset.
# Fixed: the loop variable no longer shadows the builtin `set`, and
# enumerate() replaces the linear monkset.index() lookup per iteration.
print("2. Information gain from attributes:")
for index, dataset in enumerate(monkset, start=1):
    print("MONK-%d" % index)
    for x in range(0, len(m.attributes)):
        print("Attribute %d: %f" %(x+1, d.averageGain(dataset, m.attributes[x])))
    print()
import monkdata as m
import dtree as dt

# Print the entropy of each MONK training set.
print("Entropy\n")
for index, dataset in enumerate((m.monk1, m.monk2, m.monk3), start=1):
    print("Monk" + str(index) + ": " + str(dt.entropy(dataset)))
def printEntropy(dataset, nr):
    """Print the entropy of `dataset`, labelled as monk<nr+1>."""
    label = "Entropy(monk" + str(nr + 1) + "): "
    print(label + str(d.entropy(dataset)))
    # NOTE: this chunk starts mid-function — the enclosing def is outside
    # this view. Score each pruned tree on the validation part and return
    # the best-scoring one.
    for temp in prun_set:
        s_dict[temp] = (d.check(temp, data_set.Test))
    return key_with_maxval(s_dict)

def test_pruning_algo(train_data, test_data, ratio):
    # Split train_data at `ratio`, prune on the split, and return the
    # pruned tree's accuracy on test_data.
    monk_set = SplitDataSet()
    # Here some uncertainty occurs.
    monk_set.Train, monk_set.Test = partition(train_data, ratio)
    final_tree = check_pruning(monk_set)
    accuracy = d.check(final_tree, test_data)
    #print("Accuracy for Monk1.test", accuracy)
    return accuracy

print(d.entropy(m.monk1))
print(d.entropy(m.monk2))
print(d.entropy(m.monk3))
#Printout the entropy of all datasets.

# Information gain of each attribute, per dataset.
a = list()
b = list()
c = list()
for i in range(0, 6, 1):
    a.append(d.averageGain(m.monk1, m.attributes[i]))
for i in range(0, 6, 1):
    b.append(d.averageGain(m.monk2, m.attributes[i]))
for i in range(0, 6, 1):
    c.append(d.averageGain(m.monk3, m.attributes[i]))
import matplotlib.pyplot as plt #Assignment 0 #Monk 1: a1 and a2 related and it is hard to split on one of the attributes #Monk 2: True concept has the value of an attribute involved with the value of another attribute and therefore hard to split based on single attribute #Monk 3: Contains noice and has the smallest set of training data. Alla datasets have small training sets compared to testing sets. #Assignment 1: monk = [m.monk1, m.monk2, m.monk3] entropy_table = PrettyTable(["Dataset", "Entropy"]) for i in range(len(monk)): row = ["MONK-{0}".format(i + 1), round(d.entropy(monk[i]), 10)] entropy_table.add_row(row) print(entropy_table) #Assignment 2: #Assignment 3: #info_gain_table = PrettyTable(["Dataset", "A1", "A2", "A3", "A4", "A5", "A6"]) header = ["Dataset"] for attr in m.attributes: header.append(attr) info_gain_table = PrettyTable(header) #for i in range(3):
import monkdata as m
import dtree as dtree

# Entropy of every MONK training and test set (Python 2 print syntax).
print "Sample class: " + m.Sample.__doc__
# for sample in m.monk1:
#     print sample.positive, sample.identity, " |||| ", sample.attribute
#print "\n\n\n"
print "MONK 1 ", dtree.entropy(m.monk1)
print "MONK 2 ", dtree.entropy(m.monk2)
print "MONK 3 ", dtree.entropy(m.monk3)
print "MONK 1 TEST ", dtree.entropy(m.monk1test)
print "MONK 2 TEST ", dtree.entropy(m.monk2test)
print "MONK 3 TEST ", dtree.entropy(m.monk3test)
def main():
    # Full lab driver: entropy (Assignment 1), information gain (3),
    # tree building and error rates (5), and pruning statistics with
    # plots over validation fractions (7).
    # Assignement 1
    print("Assignement 1")
    monks = [monkdata.monk1, monkdata.monk2, monkdata.monk3]
    monk_tests = [monkdata.monk1test, monkdata.monk2test, monkdata.monk3test]
    entropies = [dtree.entropy(monk) for monk in monks]
    print("*** Monk1 entropy: ", entropies[0])
    print("*** Monk2 entropy: ", entropies[1])
    print("*** Monk3 entropy: ", entropies[2])
    # Assignement 3
    print(" ")
    print("Assignement 3")
    attributes = monkdata.attributes
    info_gain1 = info_gain(monks[0], attributes)
    info_gain2 = info_gain(monks[1], attributes)
    info_gain3 = info_gain(monks[2], attributes)
    print("*** Monk1 information gain for attribute:", ['%.5f' % x for x in info_gain1])
    print("*** Monk2 information gain for attribute:", ['%.5f' % x for x in info_gain2])
    print("*** Monk3 information gain for attribute:", ['%.5f' % x for x in info_gain3])
    # Assignement 5
    print("")
    print("Assignement 5")
    print("*** Attribute:", np.argmax(info_gain1) + 1, "maximizes info gain for MONK1 dataset")
    print("*** Attribute:", np.argmax(info_gain2) + 1, "maximizes info gain for MONK2 dataset")
    print("*** Attribute:", np.argmax(info_gain3) + 1, "maximizes info gain for MONK3 dataset")
    print("***")
    max0 = np.argmax(info_gain1)  # attribute of first split
    attributes_left = [
        attrib for attrib in attributes if attrib != attributes[max0]
    ]
    print("*** 1) Attributes the next nodes should be tested on: ", attributes_left)
    # Attributes to split on in second step
    splits = [
        np.argmax(
            info_gain(dtree.select(monks[0], attributes[max0], value),
                      attributes)) + 1 for value in attributes[max0].values
    ]
    print("*** 2) Second split is on the attriburtes: ", splits)
    # Decision after second split
    subsets = [
        dtree.select(monks[0], attributes[max0], split) for split in splits
    ]
    print("*** 3) Assignement after second split: ",
          [dtree.mostCommon(subset) for subset in subsets])
    print("***")
    print("*** Train and test set errors")
    t1 = dtree.buildTree(monkdata.monk1, monkdata.attributes)
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monkdata.monk1),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    t2 = dtree.buildTree(monkdata.monk2, monkdata.attributes)
    print("*** Monk2:", "Etrain=", 1 - dtree.check(t2, monkdata.monk2),
          " Etest=", 1 - dtree.check(t2, monkdata.monk2test))
    t3 = dtree.buildTree(monkdata.monk3, monkdata.attributes)
    print("*** Monk3:", "Etrain=", 1 - dtree.check(t3, monkdata.monk3),
          " Etest=", 1 - dtree.check(t3, monkdata.monk3test))
    # NOTE(review): function-scope import — only needed for the commented-out
    # drawTree call below.
    import drawtree_qt5
    #print(t1)  # tree in text form(weird)
    #drawtree_qt5.drawTree(t1)  # uncoment to visualize the decision tree
    # Assignement 7
    print("")
    print("Assignement 7")
    # The prunning for the exanple of monk1
    monk1train, monk1val = partition(monkdata.monk1, 0.9)
    t1 = dtree.buildTree(monk1train, monkdata.attributes)  # tree trained from monk1train
    t11 = prune(t1, monk1val)  # prunned tree
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monk1val),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t11, monk1val),
          " Etest=", 1 - dtree.check(t11, monkdata.monk1test))
    # Statistic information for different fraction for monk1 and monk3
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    # Evaluation of Monk1
    eval1 = [
        evaluate_fraction(monkdata.monk1, frac, monkdata.monk1test)
        for frac in fraction
    ]
    means1 = [np.mean(x) for x in eval1]
    vars1 = [np.var(x) for x in eval1]
    plt.figure(1)
    plt.subplot(121)
    plt.plot(fraction, means1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk1')
    # Evaluation of Monk2
    eval3 = [
        evaluate_fraction(monkdata.monk3, frac, monkdata.monk3test)
        for frac in fraction
    ]
    means3 = [np.mean(x) for x in eval3]
    vars3 = [np.var(x) for x in eval3]
    plt.figure(2)
    plt.subplot(121)
    plt.plot(fraction, means3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk2')
    plt.show()
import monkdata as m
import math
import dtree

# Assignment 1 #
################
#Setting up lists
monk_entropy = []
data_sets = [m.monk1, m.monk2, m.monk3]
# Calculating entropy of the monk sets.
# NOTE(review): the loop variable `set` shadows the builtin of the same name.
for set in data_sets:
    monk_entropy.append(dtree.entropy(set))
#print(monk_entropy)

# Assignment 2 #
################
#Setting up lists
info_gain_m1 = []
info_gain_m2 = []
info_gain_m3 = []
attribute = []
#starting counter
i = 0;
# Iterating over all the test sets.
# NOTE: this chunk is truncated — the loop body continues beyond this view.
for sets in [info_gain_m1, info_gain_m2, info_gain_m3]:
    # NOTE: this chunk starts mid-function — the enclosing partition() def is
    # outside this view. Shuffle and split at `fraction`.
    r.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]

def getClasification(dataset,fraction):
    # Partition, build a tree on one part, and keep the pruned tree that
    # scores best on the other part.
    # NOTE(review): the tree is built on `monk1val` and checked on
    # `monk1train` — looks swapped relative to the usual scheme; verify.
    # NOTE(review): `bestTree` is unbound if no pruned tree scores > 0.
    monk1train, monk1val = partition(dataset,fraction)
    testTree = tree.buildTree(monk1val,m.attributes)
    prunedTrees = tree.allPruned(testTree)
    pValue = 0
    for pruned in prunedTrees:
        if(tree.check(pruned,monk1train) > pValue):
            bestTree = pruned
            pValue = tree.check(pruned,monk1train)
    return pValue, bestTree

# Entropy and information gain reports (Python 2 print syntax).
print "Entropy Monk1: " + str(tree.entropy(m.monk1))
print "Entropy Monk2: " + str(tree.entropy(m.monk2))
print "Entropy Monk3: " + str(tree.entropy(m.monk3))
print "Gain Monk1 a1: " + str(tree.averageGain(m.monk1,m.attributes[0]))
print "Gain Monk1 a2: " + str(tree.averageGain(m.monk1,m.attributes[1]))
print "Gain Monk1 a3: " + str(tree.averageGain(m.monk1,m.attributes[2]))
print "Gain Monk1 a4: " + str(tree.averageGain(m.monk1,m.attributes[3]))
print "Gain Monk1 a5: " + str(tree.averageGain(m.monk1,m.attributes[4]))
print "Gain Monk1 a6: " + str(tree.averageGain(m.monk1,m.attributes[5]))
print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0]))
print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1]))
print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2]))
print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3]))
print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4]))
def A1(): print "Entropy for Monk1 is : ", dT.entropy( m.monk1 ) print "Entropy for Monk2 is : ", dT.entropy( m.monk2 ) print "Entropy for Monk3 is : ", dT.entropy( m.monk3 )
def main():
    # Entropy of each MONK training set, information gain, then pruning
    # error statistics over 100 random partitions, plotted per fraction.
    print ("Entropy monk1")
    entropy1 = tree.entropy(data.monk1)
    print (entropy1)
    print ("\n")
    print ("Entropy monk2")
    entropy2 = tree.entropy(data.monk2)
    print (entropy2)
    print ("\n")
    print ("Entropy monk3")
    entropy3 = tree.entropy(data.monk3)
    print (entropy3)
    print ("\n")
    informationGain(data)
    #COMPUTING ENTROPY FOR SUBSET, WhY 0?!
    monk1Tree = tree.buildTree(data.monk1, data.attributes)
    #draw.drawTree(monk1Tree)
    #print(tree.bestAttribute(data.monk3, data.attributes))
    subSet = tree.select(data.monk1, data.attributes[4], 1)
    # newEntropy = tree.entropy(subSet)
    # print ("SubSet")
    # print (newEntropy)
    #END
    n = 0
    sumList = np.array([0.0] * 6)
    l1 = []
    l2 = []
    l3 = []
    l4 = []
    l5 = []
    l6 = []
    # Accumulate pruning errors over 100 runs; l1..l6 collect the per-fraction
    # errors for the standard-deviation computation below.
    for x in range(100):
        errorList = np.array(pruneTree(data.monk1, data.monk1test))
        sumList += errorList
        l1.append(errorList[0])
        l2.append(errorList[1])
        l3.append(errorList[2])
        l4.append(errorList[3])
        l5.append(errorList[4])
        l6.append(errorList[5])
    finalList = sumList/100
    stdDevList = [np.std(l1),np.std(l2),np.std(l3),np.std(l4), np.std(l5),np.std(l6)]
    print(finalList)
    print(stdDevList)
    line1, = plt.plot(finalList, label="Monk1 means", marker='o')
    # Create a legend for the first line.
    first_legend = plt.legend(handles=[line1], loc=1)
    x = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    # create an index for each tick position
    xi = [i for i in range(0, len(x))]
    plt.xticks(xi, x)
    plt.ylabel('Mean Errors')
    plt.xlabel('Fractions')
    plt.show()
def assignment1(): print " ", "Entropy" print "M1 ", d.entropy(m.monk1) print "M2 ", d.entropy(m.monk2) print "M3 ", d.entropy(m.monk3)
import monkdata as m
import dtree as dt
import drawtree as draw
import matplotlib.pyplot as plt
import random, operator

# Entropy
# Calling the predefined function that calculates the entropy for all
# three datasets (Python 2 print syntax).
# Assignment 1
print dt.entropy(m.monk1)
print dt.entropy(m.monk2)
print dt.entropy(m.monk3)
print '\n'
##############################################################################
# Information Gain
# Loops calling averageGain for all three datasets and every attribute.
# Assignment 2
for atr in m.attributes:
    gain = dt.averageGain(m.monk1, atr)
    print gain
print '\n'
for atr in m.attributes:
    print dt.averageGain(m.monk2, atr)
print '\n'
for atr in m.attributes:
    print dt.averageGain(m.monk3, atr)
def A1():
    # Print the entropy of each MONK training set (Python 2 print syntax).
    print "Entropy for Monk1 is : ", dT.entropy(m.monk1)
    print "Entropy for Monk2 is : ", dT.entropy(m.monk2)
    print "Entropy for Monk3 is : ", dT.entropy(m.monk3)
def assignment1():
    """Print the entropy of each MONK training set plus a note about MONK-1."""
    print("Assignment 1")
    values = tuple(d.entropy(dataset) for dataset in (m.monk1, m.monk2, m.monk3))
    print("MONK-1: %f\nMONK-2: %f\nMONK-3: %f" % values)
    print("Note: The impurity of MONK-1 is 1. This can only happen when there is an equal amount of true and false samples in the data.")
""" DD2431 HT15 Lab 1 """
import monkdata as m
import dtree as t
import drawtree as draw
import random

"3 ENTROPY"
"--Assignment 1"
print("--------------------------------------")
print("Assignment 1: Entropy of training sets")
monkEntropy = [round(t.entropy(m.monk1), 5),
               round(t.entropy(m.monk2), 5),
               round(t.entropy(m.monk3), 5)]
"--Answer to Assignment 1"
print(monkEntropy, "\n")

"4 INFORMATION GAIN"
"--Assignment 2"
monkTrainingSets = [m.monk1, m.monk2, m.monk3]
informationGain = []
print("Assignment 2: Expected information gains")
for monk in monkTrainingSets:  # for each data set
    # Fixed: `att` must be a fresh list per dataset. Previously it was
    # created once before the loop, so gains accumulated across datasets
    # and informationGain ended up with three references to the same list.
    att = []  # save values for each attribute
    for attribute in m.attributes:  # for every attribute
        # calculate the gain of splitting by the attribute
        att.append(round(t.averageGain(monk, attribute), 5))
    informationGain.append(att)  # save a "row vector"
#!/usr/bin/env python import dtree import monkdata import random import matplotlib.pyplot as plot sets = [monkdata.monk1, monkdata.monk2, monkdata.monk3] entropies = [dtree.entropy(s) for s in sets] def printlines(values): for line in values: print(', '.join(map(str, line))) print("Initial entropies:") print(entropies) print("") gain = [[dtree.averageGain(s, attr) for attr in monkdata.attributes] for s in sets] print("Expected gain:") printlines(gain) print("") def tests(pair): tree=dtree.buildTree(pair[0], monkdata.attributes) return [ pair[2], dtree.check(tree,pair[0]), dtree.check(tree,pair[1])
"Calculate the entropy of a dataset" #nr of monk1 records n = len(dataset) # nr of monk1 records with postive = True nPos = len([x for x in dataset if x.positive]) #nr of monk1 records with positive = False nNeg = n - nPos #if all records are negative or all are positive than entropy is 0 since one can immediately classify or predict unlabeled records. if nPos == 0 or nNeg == 0: return 0.0 #Entropy calc return -float(nPos)/n * math.log(float(nPos)/n,2) + \ -float(nNeg)/n * math.log(float(nNeg)/n,2) print('Monk1 entropy:', dtree.entropy(m.monk1)) print('Monk2 entropy:', dtree.entropy(m.monk2)) print('Monk3 entropy:', dtree.entropy(m.monk3)) "Entropy Calculation" "Information Gain calculation" def averageGain(dataset, attribute): "Calculate the expected information gain when an attribute becomes known" weighted = 0.0 #ex monk1: A1 attribute values are {1,2,3} v= 1 or 2 or 3 for v in attribute.values: #ex monk1: for v=1 subset = (True, (1, 1, 1, 1, 3, 1), 5) , v=2 subset= (False, (2, 1, 1, 1, 3, 1), 149), v=3 subset = (True, (3, 1, 1, 1, 1, 1), 289) #select: selects all samples with attribute= v
def compute_entropy(datasets_names, datasets):
    """Print each dataset's name with its entropy rounded to 3 decimals."""
    for dataset_name, dataset in zip(datasets_names, datasets):
        value = round(d.entropy(dataset), 3)
        print(dataset_name, ':', value)
def calcentropy():
    """Print the entropy of the three MONK training sets."""
    for index, dataset in enumerate((m.monk1, m.monk2, m.monk3), start=1):
        print("Entropy Monk %d: %f" % (index, d.entropy(dataset)))
import monkdata
import dtree

# Print the entropy of each MONK training set.
for index, dataset in enumerate((monkdata.monk1, monkdata.monk2, monkdata.monk3),
                                start=1):
    print("Entropy Monk-{}: {}".format(index, dtree.entropy(dataset)))
    # NOTE: this chunk starts mid-function — the enclosing partition() def is
    # outside this view.
    return ldata[:breakPoint], ldata[breakPoint:]

monk1train, monk1val = partition(m.monk1, 0.6)

def prune_tree(tree, validation):
    # Recursively replace the tree with its best-scoring pruned variant
    # until no pruned candidate beats the current tree on `validation`.
    pruned_trees = d.allPruned(tree)
    pruned_trees_performance = [0 for x in range(len(pruned_trees))]
    for candidate in pruned_trees:
        index = pruned_trees.index(candidate)
        pruned_trees_performance[index] = d.check(candidate, validation)
    if d.check(tree, validation) <= max(pruned_trees_performance):
        tree = pruned_trees[pruned_trees_performance.index(max(pruned_trees_performance))]
        tree = prune_tree(tree, validation)
    return tree

# Entropy of each training set.
print(d.entropy(m.monk1))
print(d.entropy(m.monk2))
print(d.entropy(m.monk3))
print("\n")

# Information gain of each attribute, per dataset.
print("monk-1: %f %f %f %f %f %f" % (
    d.averageGain(m.monk1, m.attributes[0]),
    d.averageGain(m.monk1, m.attributes[1]),
    d.averageGain(m.monk1, m.attributes[2]),
    d.averageGain(m.monk1, m.attributes[3]),
    d.averageGain(m.monk1, m.attributes[4]),
    d.averageGain(m.monk1, m.attributes[5])
))
print("monk-2: %f %f %f %f %f %f" % (
    d.averageGain(m.monk2, m.attributes[0]),
    d.averageGain(m.monk2, m.attributes[1]),
    d.averageGain(m.monk2, m.attributes[2]),
    d.averageGain(m.monk2, m.attributes[3]),
    d.averageGain(m.monk2, m.attributes[4]),
    d.averageGain(m.monk2, m.attributes[5])
))
def entropy(datasets):
    """Return a list with the entropy of every dataset in `datasets`."""
    results = []
    for dataset in datasets:
        results.append(dtree.entropy(dataset))
    return results
#!/usr/bin/env python2 # -*- coding: utf-8 -*- """ Created on Thu Jan 26 12:58:18 2017 """ import numpy as np import monkdata as m import dtree as d import drawtree_qt5 as dqt import matplotlib.pyplot as plt ## Assignment 1: calculate entropy of a dataset en_m1 = d.entropy(m.monk1) en_m2 = d.entropy(m.monk2) en_m3 = d.entropy(m.monk3) # output print print '-------- Assignment 1 --------' print 'entropy:' print 'monk 1: ' + str(en_m1) print 'monk 2: ' + str(en_m2) print 'monk 3: ' + str(en_m3) print '' ## Assignment 3: calculate information gain Ga_m1 = np.empty([6,1], dtype = float) Ga_m2 = np.empty([6,1], dtype = float)
import monkdata as m
import dtree
import drawtree_qt5 as draw
import numpy as np
import matplotlib.pyplot as plt
import random

# Assignment 1: entropy of each MONK training set.
entropyMonk1 = dtree.entropy(m.monk1)
entropyMonk2 = dtree.entropy(m.monk2)
entropyMonk3 = dtree.entropy(m.monk3)
print(f'Entropy for monk1: {entropyMonk1}')
print(f'Entropy for monk2: {entropyMonk2}')
print(f'Entropy for monk3: {entropyMonk3}')

# Information gain of every attribute, per training set.
informationGainMonk1 = list(
    map(lambda x: dtree.averageGain(m.monk1, x), m.attributes))
informationGainMonk2 = list(
    map(lambda x: dtree.averageGain(m.monk2, x), m.attributes))
informationGainMonk3 = list(
    map(lambda x: dtree.averageGain(m.monk3, x), m.attributes))
# Fixed typo in the output messages: "attuributes" -> "attributes".
print(
    f'Information gain for all 6 attributes for monk1: {informationGainMonk1}'
)
print(
    f'Information gain for all 6 attributes for monk2: {informationGainMonk2}'
)
print(
    f'Information gain for all 6 attributes for monk3: {informationGainMonk3}'
)
def myBuildTree(dataset, levels):
    # Hand-rolled, printing version of tree building: at each level, split
    # the data on the attribute with the highest average gain, for at most
    # `levels` levels, and return the list of attributes split on.
    # NOTE(review): in the level > 0 branch, `newdatasubsets` is re-created
    # inside the per-subset loop and only newdatasubsets[0] is kept, so
    # splits of subsets other than the last appear to be discarded — verify.
    treeLevels = []
    splits = []
    treeLevels.append(dataset)
    datasubsets = dataset
    datasubsetsAvgGains = []
    for level in range(0, levels):
        print("\n===Level #: ", level)
        if level == 0:
            # First level: split the whole dataset once (if it is big enough).
            attribAvgGains = []
            largestGain = 0
            largestAttribIndex = 0
            if len(datasubsets) > 5:
                for attribute in range(0, len(m.attributes)):
                    avgGain = d.averageGain(datasubsets, m.attributes[attribute])
                    if avgGain > largestGain:
                        largestGain = avgGain
                        largestAttribIndex = attribute
                    attribAvgGains.append(avgGain)
                    print("Attribute: ", attribute, "\t\tAverage gain: ", avgGain)
                datasubsetsAvgGains.append(attribAvgGains)
                print("---Splitting at attribute: ", m.attributes[largestAttribIndex])
                datasubsets = split(datasubsets, m.attributes[largestAttribIndex])
                splits.append(m.attributes[largestAttribIndex])
                treeLevels.append(datasubsets)
        elif level > 0:
            # Deeper levels: try to split each subset produced previously.
            print("---No. of datasets: ", len(datasubsets))
            newdatasubsets = []
            for i in range(0, len(datasubsets)):
                print("\n---Datasubset: ", i, "\t\tEntropy: ", d.entropy(datasubsets[i]))
                attribAvgGains = []
                newdatasubsets = []
                largestGain = 0
                largestAttribIndex = 0
                if len(datasubsets[i]) > 5:
                    for attribute in range(0, len(m.attributes)):
                        avgGain = d.averageGain(datasubsets[i], m.attributes[attribute])
                        if avgGain > largestGain:
                            largestGain = avgGain
                            largestAttribIndex = attribute
                        attribAvgGains.append(avgGain)
                        print("Attribute: ", attribute, "\t\tAverage gain: ", avgGain)
                    if avgGain > 0:
                        print("---Splitting at attribute: ", m.attributes[largestAttribIndex].name)
                        newdatasubsets.append(split(datasubsets[i], m.attributes[largestAttribIndex]))
                        splits.append(m.attributes[largestAttribIndex])
                    else:
                        print(
                            "---Skipping splitting at attribute: ",
                            m.attributes[largestAttribIndex].name,
                            "Dataset #",
                            i,
                        )
                    datasubsetsAvgGains.append(attribAvgGains)
            if len(newdatasubsets[0]) > 1:
                datasubsets = newdatasubsets[0]
                print("---No. of New datasets: ", len(datasubsets))
                treeLevels.append(datasubsets)
    return splits
# learn, since "ai = 1 for exactly two i of {1, 2, ..., 6} is difficult to # express concisely with binary questions. Low information gain from each # sub question. On the other hand, it has more training data. # # MONK-3 has the least amount of training data. Apart from that, it also has # random noise added. # # MONK-2 has the lowest entropy. ################################################################################ ################################# Assignment 1 ################################# # Calculate the entropy of the training datasets. print("\n\nAssignment 1 - Calculate the entropy of each training dataset") entropy_monk1 = d.entropy(m.monk1) # Yields 1.0 because there's a 50/50 split. entropy_monk2 = d.entropy(m.monk2) # Yields 0.9571.... entropy_monk3 = d.entropy(m.monk3) # yields 0.9998.... print() print("monk1: ", entropy_monk1) print("monk2: ", entropy_monk2) print("monk3: ", entropy_monk3) ################################################################################ ################################# Assignment 2 ################################# # Explain entropy for a uniform distribution and a non-uniform distribution, # present some example distributions with high and low entropy. # # "(Shannon) entropy is a measure of uncertainty" # In the case of a uniform distribution, different outcomes have an equal # probability of being picked. An example is a (non-weighted) die, which would
import monkdata as m
import dtree as dt
import drawtree as draw

# Entropy of the monk1 training set.
entropy = dt.entropy(m.monk1)

# Find the attribute with the largest average information gain on monk1.
best_gain = 0
for attribute in m.attributes:
    gain = dt.averageGain(m.monk1, attribute)
    if gain > best_gain:
        best_gain = gain
        best_attribute = attribute

# NOTE(review): this loop's results (`subset`, `majority_class`) are never
# used — the dict comprehension below recomputes the same values.
for v in best_attribute.values:
    subset = dt.select(m.monk1, best_attribute, v)
    majority_class = dt.mostCommon(subset)

# Majority class for each value of the best attribute.
values = {v: dt.mostCommon(dt.select(m.monk1, best_attribute, v))
          for v in best_attribute.values}
print(best_attribute, values)

# Draw a depth-2 decision tree for monk1.
draw.drawTree(dt.buildTree(m.monk1, m.attributes, 2))