def pruneTree(train, validation, acc_desired):
    """Build a tree from ``train`` and greedily prune it on ``validation``.

    Each round takes the best-scoring candidate from ``d.allPruned`` (ties go
    to the later candidate) and adopts it unless the current tree scores
    strictly higher with ``d.check``.

    :param train: samples handed to ``d.buildTree``.
    :param validation: samples used to score the tree and its candidates.
    :param acc_desired: despite the name, an upper bound on the number of
        pruning rounds.
    :returns: the tree selected after pruning.
    """
    tree = d.buildTree(train, m.attributes)
    tree_score = d.check(tree, validation)
    rounds = 0
    while rounds < acc_desired:
        rounds += 1
        best_candidate = None
        best_score = 0
        for candidate in d.allPruned(tree):
            score = d.check(candidate, validation)
            if score >= best_score:
                best_candidate, best_score = candidate, score
        # Once there is no candidate, or the current tree beats them all,
        # every further round would recompute the same answer: stop early.
        if best_candidate is None or tree_score > best_score:
            break
        tree, tree_score = best_candidate, best_score
    return tree
def prune(dec_tree, val_data):
    """Repeatedly replace ``dec_tree`` by a pruned variant that scores at least as well.

    In every round all candidates from ``allPruned`` are scored with
    ``check`` on ``val_data``; a candidate is adopted when its score is
    greater than or equal to the best score seen so far in that round, so
    ties favour the later candidate.

    :param dec_tree: tree to start from.
    :param val_data: samples used for scoring.
    :returns: ``(score, tree)`` from the last round, in which nothing was adopted.
    """
    while True:
        candidates = allPruned(dec_tree)
        maxPerf = check(dec_tree, val_data)
        adopted = False
        for candidate in candidates:
            score = check(candidate, val_data)
            if score < maxPerf:
                continue
            maxPerf, dec_tree, adopted = score, candidate, True
        if not adopted:
            return maxPerf, dec_tree
def assignment4_p3(data, attributes, fraction):
    """Return the validation error of the first tree reported by ``getPrunedChildren``.

    The data is split with ``partition``, a tree is built on the first part,
    and ``getPrunedChildren`` is asked for pruned trees that do not exceed
    the unpruned validation error. When it reports nothing, the unpruned
    tree is used instead.

    :param data: samples to split.
    :param attributes: attributes handed to ``d.buildTree``.
    :param fraction: split fraction passed to ``partition``.
    :returns: ``1 - d.check(selected_tree, validation_part)``.
    """
    trainData, validData = partition(data, fraction)
    root = d.buildTree(trainData, attributes)
    baseline_error = 1 - d.check(root, validData)

    to_prune = [root]
    survivors = getPrunedChildren(to_prune, baseline_error, validData)
    chosen = survivors[0] if len(survivors) != 0 else to_prune[0]
    return 1 - d.check(chosen, validData)
def calculate_best(Td, Vd):
    """Build a tree on ``Td`` and prune it while the score on ``Vd`` strictly improves.

    The previous version computed the pruned candidates only once, so the
    loop never went deeper than one pruning step, and it could return the
    ``-sys.maxsize`` sentinel as the score when there were no candidates.
    Candidates are now recomputed after every accepted prune and the search
    starts from the unpruned tree's own score.

    :param Td: samples handed to ``tree.buildTree``.
    :param Vd: samples used by ``tree.check`` to score trees.
    :returns: ``(score, tree)`` for the selected tree; the score is the
        value of ``tree.check`` (higher is treated as better).
    """
    tr = tree.buildTree(Td, m.attributes)
    error = tree.check(tr, Vd)  # name kept from the original; holds the check score
    while True:
        improved_tree = None
        for candidate in tree.allPruned(tr):
            score = tree.check(candidate, Vd)
            if score > error:
                error = score
                improved_tree = candidate
        if improved_tree is None:
            break
        tr = improved_tree
    return error, tr
def pruneTree(dataset, testSet):
    """Return the test error of the best one-step pruned tree for several split fractions.

    For each fraction the data is split with ``partition``, a tree is built
    on the first part, and the one-step pruned candidate with the lowest
    error on the second part is drawn and scored on ``testSet``.

    :param dataset: samples to split into training and validation parts.
    :param testSet: samples used for the reported error.
    :returns: list with one test error per fraction (0.3 ... 0.8).
    """
    fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    errorList = []
    for fraction in fractions:
        train, val = partition(dataset, fraction)
        theTree = tree.buildTree(train, data.attributes)
        # Fall back to the unpruned tree when there is nothing to prune; the
        # old placeholder ``0`` would have been passed on to drawTree/check.
        bestTree = theTree
        lowest_error = float("inf")
        for candidate in tree.allPruned(theTree):
            error = 1 - tree.check(candidate, val)
            if error < lowest_error:
                lowest_error = error
                bestTree = candidate
        draw.drawTree(bestTree)
        errorList.append(1 - tree.check(bestTree, testSet))
    return errorList
def getData1(iterations):
    """Collect test scores of pruned MONK-1 trees for several split fractions.

    For every fraction and iteration the training data is split, a tree is
    built and then pruned for as long as some candidate scores at least as
    well on the validation part (ties favour the later candidate).

    :param iterations: number of random splits per fraction.
    :returns: list of six lists; entry ``[f][i]`` is the ``dtree.check``
        score on ``mdata.monk1test`` (note: the score itself, not
        ``1 - score``, despite the variable name).
    """
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    error = [[0] * iterations for _ in fraction]
    for f, split in enumerate(fraction):
        for i in range(iterations):
            monk1train, monk1val = partition(mdata.monk1, split)
            monk1tree = dtree.buildTree(monk1train, mdata.attributes)
            while True:
                temptree = monk1tree
                # Score the incumbent once instead of once per candidate.
                tempscore = dtree.check(temptree, monk1val)
                for candidate in dtree.allPruned(monk1tree):
                    score = dtree.check(candidate, monk1val)
                    if score >= tempscore:
                        temptree, tempscore = candidate, score
                if temptree == monk1tree:
                    break
                monk1tree = temptree
            error[f][i] = dtree.check(monk1tree, mdata.monk1test)
    return error
def buildtree():
    """Print the test and training error of a full tree for every training set.

    Reads the module-level ``trainingset`` and ``testset`` sequences, whose
    items expose ``dataset`` and ``name``. The message previously lacked the
    space before "on", so the two names ran together in the output.
    """
    for i, training in enumerate(trainingset):
        tree = d.buildTree(training.dataset, m.attributes)
        performanceOnTrainData = d.check(tree, training.dataset)
        performanceOnTestData = d.check(tree, testset[i].dataset)
        print("Error of " + training.name + " on " + testset[i].name
              + ": " + str(1 - performanceOnTestData))
        print("Error of " + training.name + " on " + training.name
              + ": " + str(1 - performanceOnTrainData))
def pruning(data_set, fraction=0.6):
    """Return a pruned decision tree built from ``data_set``.

    The data is split with ``partition``; the tree is built on the first
    part and pruned against the second. In each round the winner is the
    first candidate with the highest ``dtree.check`` score, provided that
    score is at least the current tree's score and greater than zero.

    :param data_set: samples to split.
    :param fraction: split fraction passed to ``partition``.
    :returns: the pruned tree.
    """
    data_train, data_val = partition(data_set, fraction)
    tree_pruned = dtree.buildTree(data_train, m.attributes)
    current_score = dtree.check(tree_pruned, data_val)

    while True:
        winner = None
        winner_score = 0
        for candidate in dtree.allPruned(tree_pruned):
            candidate_score = dtree.check(candidate, data_val)
            if candidate_score >= current_score and candidate_score > winner_score:
                winner, winner_score = candidate, candidate_score
        if winner is None:
            return tree_pruned
        tree_pruned, current_score = winner, winner_score
def tests(pair):
    """Build a tree on ``pair[0]`` and score it on ``pair[0]`` and ``pair[1]``.

    :param pair: indexable with at least three items; items 0 and 1 are
        sample collections and item 2 is returned unchanged.
    :returns: ``[pair[2], score_on_pair0, score_on_pair1]``.
    """
    fitted = dtree.buildTree(pair[0], monkdata.attributes)
    label = pair[2]
    own_score = dtree.check(fitted, pair[0])
    other_score = dtree.check(fitted, pair[1])
    return [label, own_score, other_score]
def pruning(p1, training, validation):
    """Return the lowest validation error among the trees in ``p1``.

    :param p1: non-empty indexable collection of trees.
    :param training: unused.
    :param validation: samples used by ``d.check``.
    :returns: the minimum of ``1 - d.check(tree, validation)`` over ``p1``.
    """
    lowest = 1 - d.check(p1[0], validation)
    for position in range(len(p1)):
        lowest = min(lowest, 1 - d.check(p1[position], validation))
    return lowest
def assignment4_p1(data, attributes, fraction):
    """Build a tree and prune it while the validation error strictly drops.

    The previous version looped forever when a later round found no better
    candidate (its exit test compared against the very first tree only) and
    returned ``None`` when it did leave the loop. The loop now stops as soon
    as a round brings no improvement and always returns the best tree.

    Progress is printed as before.

    :param data: samples to split with ``partition``.
    :param attributes: attributes handed to ``d.buildTree``.
    :param fraction: split fraction passed to ``partition``.
    :returns: the tree with the lowest validation error found.
    """
    trainData, validData = partition(data, fraction)
    bestPrunedTree = d.buildTree(trainData, attributes)
    bestErrorRate = 1 - d.check(bestPrunedTree, validData)
    print("ORIGINAL ERR", bestErrorRate)
    while True:
        prunedTrees = d.allPruned(bestPrunedTree)
        print(len(prunedTrees))
        improved = False
        for i, candidate in enumerate(prunedTrees):
            err = 1 - d.check(candidate, validData)
            print(i, err)
            if err < bestErrorRate:
                bestErrorRate = err
                bestPrunedTree = candidate
                improved = True
        print("Best Error Rate:", bestPrunedTree, bestErrorRate)
        if not improved:
            return bestPrunedTree
def plotPruneAccuracy(dataset, test):
    """Plot and report the test score of pruned trees per split fraction.

    For each fraction a pruned tree is obtained from ``pruning`` and scored
    once with ``d.check`` on ``test`` (it used to be scored three times).
    The points are plotted with ``plt`` and the best fraction is printed.

    :param dataset: samples handed to ``pruning``.
    :param test: samples used for the reported score.
    :returns: list with one score per fraction (0.3 ... 0.8).
    """
    print("Pruning accuracy results ", "\n")
    fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    fractionResults = []
    bestFraction = fractions[0]
    bestFractionScore = 0
    for fraction in fractions:
        pruned = pruning(dataset, fraction)
        result = d.check(pruned, test)
        fractionResults.append(result)
        plt.plot(fraction, result, 'ro')
        if result > bestFractionScore:
            bestFractionScore = result
            bestFraction = fraction
    plt.xlabel("Fraction")
    plt.ylabel("Classification Accuracy score on a test set")
    plt.title(
        "Classification accuracy of the pruned tree on test data "
        "as a function of partitioning fraction"
    )
    plt.show()
    print("Best partitioning fraction: ", bestFraction, "\n")
    print("Fraction score: ", bestFractionScore, "\n")
    return fractionResults
def best_pruned(base, valid_set):
    """Pick the best tree among ``base`` and its one-step pruned variants.

    A candidate replaces the current winner when its ``d.check`` score is
    greater than or equal to the winner's, so ties favour later candidates.

    :param base: tree to start from.
    :param valid_set: samples used for scoring.
    :returns: tuple ``(tree, score)``.
    """
    candidates = d.allPruned(base)
    winner, top_score = base, d.check(base, valid_set)
    for candidate in candidates:
        score = d.check(candidate, valid_set)
        if score >= top_score:
            winner, top_score = candidate, score
    return (winner, top_score)
def oneprune(tree, valset):
    """Return every tree reachable by strictly improving prunes that cannot be improved further.

    :param tree: tree to start from.
    :param valset: samples used by ``dtree.check``.
    :returns: list of trees; ``[tree]`` when no pruned candidate scores
        strictly higher than ``tree``.
    """
    # Score the starting tree once rather than once per candidate.
    baseline = dtree.check(tree, valset)
    improved = [
        candidate
        for candidate in dtree.allPruned(tree)
        if dtree.check(candidate, valset) > baseline
    ]
    if not improved:
        return [tree]
    return [
        result
        for candidate in improved
        for result in oneprune(candidate, valset)
    ]
def pruning(p1):
    """Return the tree in ``p1`` with the lowest error on ``monk1val``.

    ``best`` used to be assigned only when a later tree beat the first one,
    so the function raised ``UnboundLocalError`` whenever ``p1[0]`` was
    already the best. It now starts from ``p1[0]``.

    :param p1: non-empty indexable collection of trees.
    :returns: the first tree with the minimum ``1 - d.check(tree, monk1val)``.
    """
    best = p1[0]
    validation_best = 1 - d.check(best, monk1val)
    for candidate in p1[1:]:
        validation_current = 1 - d.check(candidate, monk1val)
        if validation_current < validation_best:
            validation_best = validation_current
            best = candidate
    return best
def ass3():
    """Print training and test error of a full tree for the three MONK sets."""
    test_sets = [mdata.monk1test, mdata.monk2test, mdata.monk3test]
    train_sets = [mdata.monk1, mdata.monk2, mdata.monk3]
    for number, (dset, tset) in enumerate(zip(train_sets, test_sets), 1):
        t = dtree.buildTree(dset, mdata.attributes)
        label = str(number)
        print("Training error for set " + label + ": " +
              str(1 - dtree.check(t, dset)))
        print("Test error for set " + label + ": " +
              str(1 - dtree.check(t, tset)))
def bestPruned(tree, validationDataset):
    """Follow ``bestPrunedFromList`` for as long as the score does not drop.

    :param tree: tree to start from.
    :param validationDataset: samples used by ``dtree.check``.
    :returns: the last tree whose score was at least that of its predecessor.
    """
    bestTree = tree
    bestScore = dtree.check(bestTree, validationDataset)
    # bestPrunedFromList indexes into the candidate list, so it must not be
    # called for a tree that has no pruned alternatives left.
    while dtree.allPruned(bestTree):
        tempTree = bestPrunedFromList(bestTree, validationDataset)
        tempScore = dtree.check(tempTree, validationDataset)
        if not (tempScore >= bestScore):
            break
        bestTree, bestScore = tempTree, tempScore
    return bestTree
def assignment4():
    """Print pruned-tree test errors for MONK-1 and MONK-3 per split fraction.

    One line ``fraction error_monk1 error_monk3`` is printed per fraction.
    The MONK-1 scores were collected in a list that was then thrown away;
    that list is now returned.

    :returns: list of ``d.check`` scores of the pruned MONK-1 tree, one per
        fraction (0.3 ... 0.8).
    """
    fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    monk1_train, monk1_test = m.monk1, m.monk1test
    monk3_train, monk3_test = m.monk3, m.monk3test
    monk1_scores = []
    for f in fractions:
        extraCurry = d.check(assignment4helper(monk1_train, f), monk1_test)
        stektLok = d.check(assignment4helper(monk3_train, f), monk3_test)
        print("%.2f %.2f %.2f" % (f, 1 - extraCurry, 1 - stektLok))
        monk1_scores.append(extraCurry)
    return monk1_scores
def assignment5_id3():
    """Print ``1 - d.check`` on the test set for a full tree of each MONK set."""
    pairs = (
        (m.monk1, m.monk1test),
        (m.monk2, m.monk2test),
        (m.monk3, m.monk3test),
    )
    for training, testing in pairs:
        fitted = d.buildTree(training, m.attributes)
        print(1 - d.check(fitted, testing))
def prune(tree, valSet):
    """Recursively prune ``tree`` along every strictly improving candidate.

    Each one-step pruned candidate of ``tree`` that beats the best score so
    far is itself pruned recursively, and the result becomes the new best.

    :param tree: tree to start from.
    :param valSet: samples used by ``dt.check``.
    :returns: the best tree found.
    """
    best = tree
    best_score = dt.check(best, valSet)
    for candidate in dt.allPruned(tree):
        if dt.check(candidate, valSet) <= best_score:
            continue
        best = prune(candidate, valSet)
        best_score = dt.check(best, valSet)
    return best
def pruneNow(tree, data, testData):
    """Return the lowest error among ``tree`` and its one-step pruned variants.

    NOTE(review): the unpruned tree is scored on ``data`` while the pruned
    candidates are scored on ``testData`` -- confirm this is intended.

    :param tree: tree to start from.
    :param data: samples used to score the unpruned tree.
    :param testData: samples used to score the pruned candidates.
    :returns: the smallest ``1 - d.check(...)`` value seen.
    """
    lowest = 1 - d.check(tree, data)
    for prunedTree in d.allPruned(tree):
        lowest = min(lowest, 1 - d.check(prunedTree, testData))
    return lowest
def getClasification(dataset, fraction):
    """Return the best one-step pruned tree and its score.

    NOTE(review): the tree is built on the second part of the split and
    scored on the first part, the reverse of what the variable names
    suggest -- confirm this is intended. The roles are kept as they were.

    ``bestTree`` used to stay unassigned when no candidate scored above 0
    (or there were no candidates); the unpruned tree and its score are now
    returned in that case.

    :param dataset: samples to split with ``partition``.
    :param fraction: split fraction passed to ``partition``.
    :returns: ``(score, tree)``.
    """
    monk1train, monk1val = partition(dataset, fraction)
    testTree = tree.buildTree(monk1val, m.attributes)
    bestTree = None
    pValue = 0
    for pruned in tree.allPruned(testTree):
        score = tree.check(pruned, monk1train)
        if score > pValue:
            bestTree, pValue = pruned, score
    if bestTree is None:
        bestTree = testTree
        pValue = tree.check(testTree, monk1train)
    return pValue, bestTree
def prune_tree(tree, validation):
    """Prune ``tree`` for as long as the best candidate scores at least as well.

    The previous version called ``max`` on an empty list once nothing was
    left to prune, which raises ``ValueError``; that case now ends the
    search. The first candidate with the highest score is chosen, as before.

    :param tree: tree to start from.
    :param validation: samples used by ``d.check``.
    :returns: the pruned tree.
    """
    while True:
        candidates = d.allPruned(tree)
        if not candidates:
            return tree
        scores = [d.check(candidate, validation) for candidate in candidates]
        top_score = max(scores)
        if d.check(tree, validation) > top_score:
            return tree
        tree = candidates[scores.index(top_score)]
def bestPrunedFromList(tree, validationDataset):
    """Return a one-step pruned variant of ``tree``.

    The result is the candidate with the highest ``dtree.check`` score
    among those that strictly beat ``tree``; when none does, the last
    candidate is returned. Raises ``IndexError`` when ``dtree.allPruned``
    yields no candidates.

    :param tree: tree whose pruned variants are examined.
    :param validationDataset: samples used for scoring.
    :returns: a tree taken from ``dtree.allPruned(tree)``.
    """
    candidates = dtree.allPruned(tree)
    threshold = dtree.check(tree, validationDataset)
    chosen = candidates[-1]
    for candidate in candidates:
        score = dtree.check(candidate, validationDataset)
        if score > threshold:
            threshold, chosen = score, candidate
    return chosen
def buildTreesAndComputePerformance():
    """Assignment 3: print test and training error of a full tree per data set.

    Reads the module-level ``trainingSets`` and ``testSets`` sequences,
    whose items expose ``dataset`` and ``name``.
    """
    for position, training in enumerate(trainingSets):
        tree = d.buildTree(training.dataset, m.attributes)
        testing = testSets[position]
        performanceOnTest = d.check(tree, testing.dataset)
        performanceOnTrain = d.check(tree, training.dataset)
        prefix = "Error of " + training.name + " on "
        print(prefix + testing.name + ": " + str(1 - performanceOnTest))
        print(prefix + training.name + ": " + str(1 - performanceOnTrain))
        print("")
def find_prunned(data_part, f_part):
    """Return the best tree among a freshly built tree and its one-step prunes.

    :param data_part: samples to split with ``partition``.
    :param f_part: split fraction passed to ``partition``.
    :returns: the first candidate with the highest ``tree.check`` score on
        the second split part if it strictly beats the unpruned tree,
        otherwise the unpruned tree.
    """
    monk1train, monkvalue = partition(data_part, f_part)
    chosen = tree.buildTree(monk1train, dataset.attributes)
    candidates = tree.allPruned(chosen)
    top_score = tree.check(chosen, monkvalue)
    for candidate in candidates:
        score = tree.check(candidate, monkvalue)
        if score > top_score:
            top_score, chosen = score, candidate
    return chosen
def assignment3():
    """Print a table with training and test scores of the full trees.

    Uses single-argument ``print(...)`` calls, which produce the same output
    on Python 2 and also run on Python 3 (the bare ``print`` statements
    were a syntax error there).
    """
    print("--- Assignment 3 ---")
    print("Performance of the decision trees")
    table = Texttable(max_width=100)
    table.add_row(["Dataset", "Training", "Test"])
    for i in range(3):
        tree = d.buildTree(monkdata[i], m.attributes)
        perf = [d.check(tree, monkdata[i]), d.check(tree, testdata[i])]
        table.add_row(["Monk-" + str(i + 1)] + perf)
    print(table.draw())
    print("")
def test_pruning(dataset, testset):
    """Print test scores of an unpruned and a pruned tree per split fraction.

    The tree used to be built on the whole ``dataset`` and then pruned
    against a validation part taken from that same data, leaving the
    training part unused. The tree is now built on the training part only,
    so the validation samples are unseen when pruning.

    :param dataset: samples to split with ``partition``.
    :param testset: samples used for the printed scores.
    """
    fraction_list = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    print("TESTING PRUNING")
    for fraction in fraction_list:
        print("--------------")
        print(fraction)
        training, validation = partition(dataset, fraction)
        monk_tree = d.buildTree(training, m.attributes)
        pruned_monk_tree = prune_tree(monk_tree, validation)
        print(d.check(monk_tree, testset))
        print(d.check(pruned_monk_tree, testset))
    print("--------------")
def prunedTree(training, validation):
    """Build a tree and prune it while the best candidate scores at least as well.

    The previous version called ``max`` on an empty score list as soon as a
    tree had no pruned alternatives, raising ``ValueError``; the search now
    stops there. The first candidate with the highest score is chosen.

    :param training: samples handed to ``dtree.buildTree``.
    :param validation: samples used by ``dtree.check``.
    :returns: the pruned tree.
    """
    tree = dtree.buildTree(training, m.attributes)
    while True:
        poss = dtree.allPruned(tree)
        if not poss:
            return tree
        scores = [dtree.check(candidate, validation) for candidate in poss]
        top_score = max(scores)
        if not (top_score >= dtree.check(tree, validation)):
            return tree
        tree = poss[scores.index(top_score)]
def checkperformance(tree, monk1val):
    """Prune ``tree`` while its best one-step candidate scores at least as well.

    Within a round the first candidate with the highest ``d.check`` score
    wins. Each candidate is now scored once instead of twice, and the
    recursion is replaced by a loop so deep trees cannot hit the recursion
    limit.

    :param tree: tree to start from.
    :param monk1val: samples used for scoring.
    :returns: the pruned tree.
    """
    while True:
        best_tree = None
        best_score = -1
        for candidate in d.allPruned(tree):
            score = d.check(candidate, monk1val)
            if best_score < score:
                best_score, best_tree = score, candidate
        if best_tree is None or not (best_score >= d.check(tree, monk1val)):
            return tree
        tree = best_tree
def prune_tree(monkdata_set, num_trials=50):
    """Measure the best one-step pruning result for several split fractions.

    In every trial and for every fraction the data is split with
    ``partition``, a tree is built on the first part, and the tree plus each
    candidate from ``dtree.allPruned`` is scored on the second part. Only
    single pruning steps are examined.

    :param monkdata_set: monkdata set from monkdata.py
    :param num_trials: number of trials to run
    :returns dict: ordered mapping from partition fraction to a list with
        one ``(candidate_number, accuracy)`` tuple per trial; the number is
        the 1-based position of the first candidate reaching the highest
        accuracy, or 0 when no candidate beat the unpruned tree.
    """
    partition_fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    partition_accuracy = OrderedDict()

    for _ in range(num_trials):
        for partition_fraction in partition_fractions:
            monk_training, monk_validation = partition(monkdata_set,
                                                       partition_fraction)
            tree = dtree.buildTree(monk_training, monkdata.attributes)
            max_accuracy = dtree.check(tree, monk_validation)
            max_accuracy_prune = 0

            candidates = dtree.allPruned(tree)
            for number, pruned_tree in enumerate(candidates, 1):
                pruned_accuracy = dtree.check(pruned_tree, monk_validation)
                if pruned_accuracy > max_accuracy:
                    max_accuracy, max_accuracy_prune = pruned_accuracy, number

            trials = partition_accuracy.setdefault(partition_fraction, [])
            trials.append((max_accuracy_prune, max_accuracy))

    return partition_accuracy
def optimisePartitions3():
    """Compute mean and variance of the pruned-tree test error on MONK-3 per fraction.

    Reads the module-level ``partitions`` and ``runs`` and appends to the
    module-level ``bigList3``, ``errorList3`` and ``varianceList3``.

    The per-fraction mean and variance used to be computed over
    ``bigList3``, which is never reset and therefore also held the samples
    of all earlier fractions. They are now computed over the samples of the
    current fraction only; ``bigList3`` still receives every sample.

    :returns: ``(errorList3, varianceList3)``.
    """
    tree3 = d.buildTree(m.monk3, m.attributes)
    score3 = d.check(tree3, m.monk3test)
    print("Performance of monk3 tree: " + str(score3) + "\n")
    for fraction in partitions:
        fraction_errors = []
        for _ in range(runs):
            train3, val3 = d.partition(m.monk3, fraction)
            tree3a = d.buildTree(train3, m.attributes)
            best3 = bestPrunedTree(tree3a, val3)
            fraction_errors.append(1 - d.check(best3, m.monk3test))
        bigList3.extend(fraction_errors)
        mean_error = sum(fraction_errors) / len(fraction_errors)
        errorList3.append(mean_error)
        varianceList3.append(variance(fraction_errors, mean_error))
    return errorList3, varianceList3
def optimisePartitions1():
    """Compute mean and variance of the pruned-tree test error on MONK-1 per fraction.

    Reads the module-level ``partitions`` and ``runs`` and appends to the
    module-level ``bigList1``, ``errorList1`` and ``varianceList1``.

    The per-fraction mean and variance used to be computed over
    ``bigList1``, which is never reset and therefore also held the samples
    of all earlier fractions. They are now computed over the samples of the
    current fraction only; ``bigList1`` still receives every sample.

    :returns: ``(errorList1, varianceList1)``.
    """
    tree1 = d.buildTree(m.monk1, m.attributes)
    score1 = d.check(tree1, m.monk1test)
    print("Performance of monk1 tree: " + str(score1) + "\n")
    for fraction in partitions:
        fraction_errors = []
        for _ in range(runs):
            train1, val1 = d.partition(m.monk1, fraction)
            tree1a = d.buildTree(train1, m.attributes)
            best1 = bestPrunedTree(tree1a, val1)
            fraction_errors.append(1 - d.check(best1, m.monk1test))
        bigList1.extend(fraction_errors)
        mean_error = sum(fraction_errors) / len(fraction_errors)
        errorList1.append(mean_error)
        varianceList1.append(variance(fraction_errors, mean_error))
    return errorList1, varianceList1
def assignment_7(monktrain, monkval):
    """Build a tree, prune it while the score strictly improves, and report both scores.

    :param monktrain: samples handed to ``dtree.buildTree``.
    :param monkval: samples used by ``dtree.check``.
    :returns: ``(score_before_pruning, score_after_pruning)`` on ``monkval``.
    """
    t = dtree.buildTree(monktrain, m.attributes)
    p1 = dtree.check(t, monkval)
    performance = p1
    while True:
        improved = False
        for candidate in dtree.allPruned(t):
            candidate_score = dtree.check(candidate, monkval)
            if candidate_score > performance:
                t, performance, improved = candidate, candidate_score, True
        if not improved:
            break
    return p1, dtree.check(t, monkval)
def findBestPrune(tree, validationdata):
    """Prune ``tree`` step by step while some candidate scores strictly higher.

    In each round the first candidate with the highest ``d.check`` score
    replaces the current tree if it strictly beats it.

    :param tree: tree to start from.
    :param validationdata: samples used for scoring.
    :returns: the first tree for which no candidate scores strictly higher.
    """
    current = tree
    while True:
        candidates = d.allPruned(current)
        winner = current
        winner_score = d.check(winner, validationdata)
        for candidate in candidates:
            candidate_score = d.check(candidate, validationdata)
            # Strictly greater: ties keep the tree already selected.
            if candidate_score > winner_score:
                winner, winner_score = candidate, candidate_score
        if winner == current:
            return current
        current = winner
def pruneTree(trainSet, fraction):
    """Return the best tree among a freshly built tree and its one-step prunes.

    :param trainSet: samples to split with ``partition``.
    :param fraction: split fraction passed to ``partition``.
    :returns: ``(score, tree, validation_part)``, where the score is the
        ``dtree.check`` value of the returned tree on the validation part.
    """
    monktrain, monkval = partition(trainSet, fraction)
    bestTree = dtree.buildTree(monktrain, m.attributes)
    candidates = dtree.allPruned(bestTree)
    bestVal = dtree.check(bestTree, monkval)
    for candidate in candidates:
        candidate_score = dtree.check(candidate, monkval)
        if candidate_score > bestVal:
            bestTree, bestVal = candidate, candidate_score
    return bestVal, bestTree, monkval
def find_best_pruned_tree(tree, validate):
    """Return the best tree among ``tree`` and its one-step pruned variants.

    The previous version stored ``tree`` instead of the winning candidate,
    so it returned the unpruned tree together with a score that belonged to
    a different tree.

    :param tree: tree to start from.
    :param validate: samples used by ``d.check``.
    :returns: ``(best_tree, best_score)``.
    """
    best_perf = d.check(tree, validate)
    best_tree = tree
    for candidate in d.allPruned(tree):
        candidate_perf = d.check(candidate, validate)
        if candidate_perf > best_perf:
            best_perf = candidate_perf
            best_tree = candidate
    return best_tree, best_perf
def best_pruned_tree(dataset, fraction):
    """Build a tree and prune it while the validation score strictly improves.

    :param dataset: samples to split with ``partition``.
    :param fraction: split fraction passed to ``partition``.
    :returns: the pruned tree.
    """
    train, val = partition(dataset, fraction)
    tree = dt.buildTree(train, m.attributes)
    while True:
        top_score = dt.check(tree, val)
        replacement = None
        for candidate in dt.allPruned(tree):
            candidate_score = dt.check(candidate, val)
            if candidate_score > top_score:
                top_score, replacement = candidate_score, candidate
        if replacement is None:
            return tree
        tree = replacement
def check_correct_incorrect_classification(datasets, test_datasets,
                                           datasets_names):
    """Score one tree per data set on its test set and its own training data.

    Trees come from ``perform_buildTree(datasets)``.

    :param datasets: training sample collections.
    :param test_datasets: matching test sample collections.
    :param datasets_names: matching names, used as dictionary keys.
    :returns: ``(check, check_e)``; ``check`` maps each name to the test
        score rounded to 3 decimals, ``check_e`` is an array of shape
        ``(len(datasets), 2)`` whose rows hold the rounded training error
        and test error.
    """
    datasets_trees = perform_buildTree(datasets)
    check = {}
    check_e = np.zeros((len(datasets), 2))
    rows = zip(datasets, datasets_names, datasets_trees, test_datasets)
    for i, (dataset, dataset_name, dataset_tree, test_dataset) in enumerate(rows):
        test_score = round(d.check(dataset_tree, test_dataset), 3)
        check[dataset_name] = test_score
        train_error = round(1 - d.check(dataset_tree, dataset), 3)
        test_error = round((1 - test_score), 3)
        check_e[i] = (train_error, test_error)
    return check, check_e
def prune_tree(tree, validation_set):
    """Prune ``tree`` while some candidate scores at least as well.

    In each round a candidate replaces the current choice when its
    ``dtree.check`` score is greater than or equal to the best so far, so
    ties favour later candidates.

    :param tree: tree to start from.
    :param validation_set: samples used for scoring.
    :returns: the first tree for which no candidate was chosen.
    """
    cur_tree = tree
    while True:
        candidates = dtree.allPruned(cur_tree)
        top_score = dtree.check(cur_tree, validation_set)
        choice = cur_tree
        for candidate in candidates:
            candidate_score = dtree.check(candidate, validation_set)
            if candidate_score >= top_score:
                top_score, choice = candidate_score, candidate
        if choice == cur_tree:
            return cur_tree
        cur_tree = choice
def A3():
    """Print the scores of the full trees for the three MONK sets.

    MONK-1 is scored on its test and training data; MONK-2 and MONK-3 on
    their test data only. The ``print '\\n'`` statements were a syntax error
    on Python 3; the parenthesised form gives the same output on both.
    """
    t1 = dT.buildTree(m.monk1, m.attributes)
    print(dT.check(t1, m.monk1test))
    print(dT.check(t1, m.monk1))
    print('\n')

    t2 = dT.buildTree(m.monk2, m.attributes)
    print(dT.check(t2, m.monk2test))
    print('\n')

    t3 = dT.buildTree(m.monk3, m.attributes)
    print(dT.check(t3, m.monk3test))
    print('\n')
def prune(t, val):
    """Prune ``t`` while some candidate scores at least as well.

    A candidate is adopted when its ``d.check`` score is greater than or
    equal to the best score so far; the candidate list of a round is fixed
    before the round starts.

    :param t: tree to start from.
    :param val: samples used for scoring.
    :returns: the selected tree.
    """
    bestTree = t
    bestPerf = d.check(t, val)
    while True:
        adopted = False
        for candidate in d.allPruned(bestTree):
            score = d.check(candidate, val)
            if score < bestPerf:
                continue
            bestTree, bestPerf, adopted = candidate, score, True
        if not adopted:
            return bestTree
def prune(pruned_tree, test_tree):
    """Prune ``pruned_tree`` while some candidate scores strictly higher.

    :param pruned_tree: tree to start from.
    :param test_tree: despite the name, the samples passed to ``dt.check``.
    :returns: the first tree for which no candidate scores strictly higher.
    """
    currentBase = pruned_tree
    while True:
        baseline = dt.check(currentBase, test_tree)
        top_score = baseline
        maxTree = currentBase
        for candidate in dt.allPruned(currentBase):
            candidate_score = dt.check(candidate, test_tree)
            if candidate_score > top_score:
                top_score, maxTree = candidate_score, candidate
        currentBase = maxTree
        if not (top_score > baseline):
            return maxTree
def prun(tree, val):
    """Map every one-step pruned variant of ``tree`` to its ``dt.check`` score.

    :param tree: tree whose pruned variants are scored.
    :param val: samples used for scoring.
    :returns: dict with the pruned trees as keys and their scores as values.
    """
    return {candidate: dt.check(candidate, val)
            for candidate in dt.allPruned(tree)}
def check_tree_performance(tree, testset):
    '''
    Returns the dt.check score of an already built tree on the test set
    '''
    return dt.check(tree, testset)
def getPrunedChildren(toPrune, bestErrorRate, validData):
    """Collect trees reachable by prunes that never raise the validation error.

    For every tree in ``toPrune`` each one-step pruned child whose error
    ``1 - d.check(child, validData)`` is at most ``bestErrorRate`` is
    explored recursively with its own error as the new bound. A tree is
    itself reported, after its descendants, when at least one of its
    children exceeded the bound.

    Results of the recursion used to be appended as lists, producing a
    nested structure whose elements could not be passed to ``d.check``, and
    the per-tree accumulator could be left undefined. The result is now
    always a flat list of trees.

    :param toPrune: iterable of trees to explore.
    :param bestErrorRate: error bound a child must not exceed.
    :param validData: samples used by ``d.check``.
    :returns: flat list of trees; may be empty.
    """
    selected = []
    for parent in toPrune:
        some_child_worse = False
        for child in d.allPruned(parent):
            err = 1 - d.check(child, validData)
            if err <= bestErrorRate:
                selected.extend(getPrunedChildren([child], err, validData))
            else:
                some_child_worse = True
        if some_child_worse:
            selected.append(parent)
    return selected
def pruned_tree_performance(training_set, test_set, fraction):
    """Average the test score of ``best_pruned_tree`` over ``AGGREGATE_TIMES`` runs.

    :param training_set: samples handed to ``best_pruned_tree``.
    :param test_set: samples used by ``dt.check``.
    :param fraction: split fraction handed to ``best_pruned_tree``.
    :returns: sum of the scores divided by ``AGGREGATE_TIMES``.
    """
    scores = (
        dt.check(best_pruned_tree(training_set, fraction), test_set)
        for _ in range(AGGREGATE_TIMES)
    )
    return sum(scores) / AGGREGATE_TIMES
def pruneTree(tree, validation):
    """Prune ``tree`` while some candidate scores strictly higher on ``validation``.

    Fixes over the previous version: the result variable was initialised
    under a different spelling, so it could be undefined when no candidate
    scored above 0; the function returned the best candidate of the final,
    rejected round rather than the last accepted tree; and the unpruned
    tree was never compared against its candidates.

    :param tree: tree to start from.
    :param validation: samples used by ``d.check``.
    :returns: the best tree found; ``tree`` itself when no prune helps.
    """
    bestTree = tree
    bestGain = d.check(tree, validation)
    while True:
        round_best = None
        round_gain = bestGain
        for candidate in d.allPruned(bestTree):
            gain = d.check(candidate, validation)
            if gain > round_gain:
                round_gain, round_best = gain, candidate
        if round_best is None:
            return bestTree
        bestTree, bestGain = round_best, round_gain
def assignment4():
    """Print a table of pruned-tree test scores per data set and split fraction.

    Each row holds the score for fractions 0.3 ... 0.8 followed by the
    score of the unpruned tree built on the full training data. The bare
    ``print`` statements were a syntax error on Python 3; the
    single-argument call form prints the same text on both versions.
    """
    print("--- Assignment 4 ---")
    print("Selecting the best fraction to divide training and validation sets for pruning")
    table = Texttable(max_width=100)
    table.add_row(["Dataset", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "Benchmark"])
    for i in range(3):
        row = ["Monk-" + str(i + 1)]
        for frac in [(x * 0.1) for x in range(3, 9)]:
            train_set, valid_set = m.partition(monkdata[i], frac)
            base = d.buildTree(train_set, m.attributes)
            best = best_pruned(base, valid_set)
            row.append(d.check(best[0], testdata[i]))
        row.append(d.check(d.buildTree(monkdata[i], m.attributes), testdata[i]))
        table.add_row(row)
    print(table.draw())
    print("")
def findPrunned(t, monk1val1):
    """Prune ``t`` while some candidate scores strictly higher; return the final score.

    Fixes over the previous version: the recursive branch dropped its
    result, so the function returned ``None`` whenever a prune happened; the
    comparison selected the candidate with the *lowest* ``d.check`` value,
    although the value is maximised everywhere else (assumes ``d.check``
    is a higher-is-better score -- confirm); and the ``print`` statement
    was a syntax error on Python 3.

    The final score is appended to the module-level ``efficiency`` list and
    printed.

    :param t: tree to start from.
    :param monk1val1: samples used for scoring.
    :returns: the ``d.check`` score of the selected tree.
    """
    maxi1 = d.check(t, monk1val1)
    answertree = None
    for candidate in d.allPruned(t):
        val = d.check(candidate, monk1val1)
        if val > maxi1:
            maxi1 = val
            answertree = candidate
    if answertree is None:
        efficiency.append(maxi1)
        print(maxi1)
        return maxi1
    return findPrunned(answertree, monk1val1)
def pruning(trainingSet, testSet, fraction):
    """Build a tree, prune it on a held-out part, and return its test score.

    A candidate is adopted when its score on the held-out part is greater
    than or equal to the best so far. Each candidate is now scored once
    instead of twice.

    :param trainingSet: samples to split with ``partition``.
    :param testSet: samples used for the returned score.
    :param fraction: split fraction passed to ``partition``.
    :returns: ``dT.check`` score of the pruned tree on ``testSet``.
    """
    train1, train2 = partition(trainingSet, fraction)
    bestTree = dT.buildTree(train1, m.attributes)
    bestTreePerf = dT.check(bestTree, train2)
    bestTreeFound = True
    while bestTreeFound:
        bestTreeFound = False
        for candidateTree in dT.allPruned(bestTree):
            candidatePerf = dT.check(candidateTree, train2)
            if candidatePerf >= bestTreePerf:
                bestTree = candidateTree
                bestTreePerf = candidatePerf
                bestTreeFound = True
    return dT.check(bestTree, testSet)
def findBestPrune(tree, validationSet):
    """Prune ``tree`` while the best candidate scores at least as well.

    Among candidates with the same highest score the one appearing last in
    ``dtree.allPruned`` wins. ``izip`` (Python 2 only) is no longer needed,
    and any empty candidate collection ends the search, not just ``()``.

    :param tree: tree to start from.
    :param validationSet: samples used by ``dtree.check``.
    :returns: the pruned tree.
    """
    current = tree
    while True:
        currentPerformance = dtree.check(current, validationSet)
        pruned = dtree.allPruned(current)
        if not pruned:
            break
        # (score, index) pairs: max() prefers the higher score and, on equal
        # scores, the higher index.
        best, i = max(
            (dtree.check(candidate, validationSet), index)
            for index, candidate in enumerate(pruned)
        )
        if best < currentPerformance:
            break
        current = pruned[i]
    return current
def generateErrorTable(dataset, testset, fractions, tries):
    """Average the test score of pruned trees for each split fraction.

    Note that the stored value is the ``dtree.check`` score itself, not
    ``1 - score``.

    :param dataset: samples to split with ``partition``.
    :param testset: samples used for scoring the pruned tree.
    :param fractions: iterable of split fractions.
    :param tries: number of random splits per fraction (must be non-zero).
    :returns: list of ``(fraction, mean_score)`` tuples.
    """
    def pruned_score(fraction):
        trainSet, valSet = partition(dataset, fraction)
        grown = dtree.buildTree(trainSet, m.attributes)
        return dtree.check(findBestPrune(grown, valSet), testset)

    result = []
    for x in fractions:
        total = sum(pruned_score(x) for _ in range(tries))
        result.append((x, total / tries))
    return result
def prune(tree, testdata, performance_ref):
    """Prune ``tree`` while its best candidate reaches ``performance_ref``.

    In each round the first candidate with the highest ``dt.check`` score
    is taken if that score is at least ``performance_ref``. The previous
    version recursed on ``None`` when ``performance_ref`` was zero or less
    and no candidate scored above 0; that case now ends the search.

    :param tree: tree to start from.
    :param testdata: samples used for scoring.
    :param performance_ref: fixed score the best candidate must reach.
    :returns: the pruned tree.
    """
    while True:
        best_per = 0
        best_tree = None
        for subtree in dt.allPruned(tree):
            performance = dt.check(subtree, testdata)
            if performance > best_per:
                best_per, best_tree = performance, subtree
        if best_tree is None or not (best_per >= performance_ref):
            return tree
        tree = best_tree
def findBestTree(tree, compare, lastBest=0, lastBestTree=None):
    """Follow strictly improving prunes of ``tree`` and return the best one seen.

    NOTE(review): ``tree`` itself is never a candidate, so with the default
    arguments the result is ``None`` when no pruned tree scores above 0.

    :param tree: tree whose pruned variants are searched.
    :param compare: samples used by ``d.check``.
    :param lastBest: score a candidate must strictly exceed.
    :param lastBestTree: tree returned when no candidate exceeds ``lastBest``.
    :returns: the best tree found, or ``lastBestTree``.
    """
    best_tree, best_val = lastBestTree, lastBest
    current = tree
    while True:
        round_start = best_val
        for candidate in d.allPruned(current):
            val = d.check(candidate, compare)
            if val > best_val:
                best_tree, best_val = candidate, val
        if not (best_val > round_start):
            return best_tree
        current = best_tree
def build_tree():
    """Print training and test error of the full tree for each MONK set.

    The bare ``print`` statements were a syntax error on Python 3; the
    single-argument call form prints the same text on both versions.
    """
    print("\n------------------------------\nAssignment 3 - Error\n------------------------------")
    print("Dataset\tE(train)\tE(test)")
    sets = (
        ("Monk1", data.monk1, data.monk1test),
        ("Monk2", data.monk2, data.monk2test),
        ("Monk3", data.monk3, data.monk3test),
    )
    for label, train, test in sets:
        tree = dt.buildTree(train, data.attributes)
        print("%s:\t%.6f\t%.6f" % (label, 1 - dt.check(tree, train), 1 - dt.check(tree, test)))
def prune():
    """Print the mean test error of pruned trees for MONK-1 and MONK-3.

    For each split size 100 random splits are pruned and the test errors
    averaged. The two formerly duplicated loops share one helper, and the
    bare ``print`` statements (a syntax error on Python 3) are replaced by
    the single-argument call form, which prints the same text.

    NOTE(review): as before, MONK-1 only accepts strictly better candidates
    while MONK-3 also accepts ties -- confirm the difference is intended.
    """
    print("\n------------------------------\nAssignment 4 - Pruning\n------------------------------")
    print("Dataset\t 0.3\t\t 0.4\t\t 0.5\t\t 0.6\t\t 0.7\t\t 0.8")
    partSizes = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

    monk1 = _mean_pruned_test_errors(data.monk1, data.monk1test, partSizes, False)
    print("Monk1\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t" % tuple(monk1))

    monk3 = _mean_pruned_test_errors(data.monk3, data.monk3test, partSizes, True)
    print("Monk3\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t" % tuple(monk3))


def _mean_pruned_test_errors(dataset, testset, partSizes, accept_ties, repeats=100):
    """Return, per split size, the mean test error of pruned trees over ``repeats`` splits."""
    means = []
    for size in partSizes:
        total = 0.0
        for _ in range(repeats):
            training, test = partition(dataset, size)
            bestTree = dt.buildTree(training, data.attributes)
            bestClass = dt.check(bestTree, test)
            better = True
            while better:
                better = False
                for subTree in dt.allPruned(bestTree):
                    score = dt.check(subTree, test)
                    if score > bestClass or (accept_ties and score == bestClass):
                        bestTree = subTree
                        bestClass = score
                        better = True
            total += (1 - dt.check(bestTree, testset))
        means.append(total / repeats)
    return means
def assignment4helper(dataset, fraction):
    """Build a tree and prune it while the validation score strictly improves.

    The previous version started from a score of ``-1`` with no tree
    selected, so it returned ``None`` for a tree without pruned
    alternatives and always replaced the unpruned tree in the first round,
    even by a worse candidate. The search now starts from the unpruned tree
    and its own score.

    :param dataset: samples to split with ``partition``.
    :param fraction: split fraction passed to ``partition``.
    :returns: the pruned tree; the unpruned tree when no prune helps.
    """
    monk1train, monk1val = partition(dataset, fraction)
    tree = d.buildTree(monk1train, m.attributes)
    maxVal = d.check(tree, monk1val)
    while True:
        bestTree = None
        for candidate in d.allPruned(tree):
            val = d.check(candidate, monk1val)
            if val > maxVal:
                bestTree, maxVal = candidate, val
        if bestTree is None:
            return tree
        tree = bestTree