Example #1
def pruneTree(train, validation, acc_desired):

    t = d.buildTree(train, m.attributes)
    accuracy = d.check(t, validation)
    accuracy_p = accuracy
    #print("Starting accuracy:" + str(accuracy))
    temp = t
    tt = 0
    # Note: acc_desired is used here as the number of pruning rounds, not a target accuracy.
    while (tt < acc_desired):
        tt += 1
        temp = t
        tlist = d.allPruned(t)
        accuracy_p = 0
        for i in range(0, len(tlist)):
            #print(i)
            accuracy = d.check(tlist[i], validation)
            #print("Pruned tree no " + str(i) + " accuracy: " + str(accuracy))
            #print(accuracy_p)
            if (accuracy >= accuracy_p):
                accuracy_p = accuracy
                #print("Set new accuracy_p: " + str(accuracy_p))
                t = tlist[i]

        #print(str(acc_prev_tree) + " " + str(accuracy_p))

    if (d.check(temp, validation) > d.check(t, validation)):
        t = temp
    """ 
    print(t)
    print("Final accuracy: " + str(d.check(t, validation)))
    pyqt.drawTree(t) 
    """
    return t
def prune(dec_tree, val_data):
    #Flag to keep memory of any best tree
    one_better = True

    while one_better:
        #Obtain all the pruned trees
        pruned_trees = allPruned(dec_tree)
        #print("%d pruned trees" % (len(pruned_trees)))
        dec_tree_perf = check(dec_tree, val_data)

        #Set local variables
        one_better = False
        maxPerf = dec_tree_perf

        #Compute performance evaluation and keep the best one
        for tree in pruned_trees:
            tree_perf = check(tree, val_data)
            #print("\t NEW(%f), OLD(%f)" % (tree_perf, maxPerf))
            if tree_perf >= maxPerf:
                maxPerf = tree_perf
                dec_tree = tree
                one_better = True
                #print("\tFound a better one: %f" % (tree_perf))

    return maxPerf, dec_tree
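Note: most snippets in this collection come from the same MONK decision-tree lab and assume the lab modules (monkdata, dtree) imported under short aliases such as m, d, dtree or dt, plus a partition helper that randomly splits a dataset into training and validation parts. A minimal sketch of that shared setup is shown below; the exact aliases vary from project to project, so treat the names as assumptions rather than part of any one example.

import random

import monkdata as m   # MONK datasets and attribute definitions
import dtree as d      # buildTree, check, allPruned from the lab code


def partition(data, fraction):
    # Shuffle the samples, then split: the first `fraction` of them
    # become training data, the rest become validation data.
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]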
Example #3
def assignment4_p3(data, attributes, fraction):
    trainData, validData = partition(data, fraction)
    dataTree = d.buildTree(trainData, attributes)
    orgErr = 1 - d.check(dataTree, validData)
    # print("ORIGINAL ERR", orgErr)
    orgTree = dataTree
    #########################
    bestPrunedTreesList = []
    toPrune = []
    toPrune.append(orgTree)
    # bestPrunedTreesList.append(orgTree)
    err = orgErr
    bestErrorRate = err
    bestPrunedTreesList = getPrunedChildren(toPrune, bestErrorRate, validData)

    if len(bestPrunedTreesList) == 0:
        toReturn = toPrune[0]
    else:
        toReturn = bestPrunedTreesList[0]

    #   print(toReturn)
    # print("No. of best pruned trees:", len(bestPrunedTreesList))
    # for i in range(0, len(bestPrunedTreesList)):
    # print("Pruned Tree No. ", i, "test error rate: ", 1-d.check(bestPrunedTreesList[i], validData))
    #   print("Pruned Tree ", "test error rate: ", 1-d.check(toReturn, validData))

    # return bestPrunedTreesList
    return 1 - d.check(toReturn, validData)
Example #4
def calculate_best(Td, Vd):

    error = -sys.maxsize
    counter = 0
    current_tree = tree.buildTree(Td, m.attributes)
    tr = tree.buildTree(Td, m.attributes)

    while True:
        counter = 0
        # Recompute the prunings of the current best tree on every pass,
        # so pruning can continue level by level.
        tr_pruned = tree.allPruned(tr)
        count = len(tr_pruned)

        for x in tr_pruned:
            if tree.check(x, Vd) > error:
                error = tree.check(x, Vd)
                current_tree = x
                #print("current tree")
                #print(current_tree)
                #print("error")
                #print(error)
            else:
                counter = counter + 1

        if count == counter:
            break

        tr = current_tree

    #print("Selected tree:")
    #print(tr)
    #print("error:")
    #print(error)
    return error, tr
def pruneTree(dataset, testSet):
	
	fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
	errorList = []

	for x in fractions:
		train, val = partition(dataset, x)
		theTree = tree.buildTree(train, data.attributes)

		list_of_trees = tree.allPruned(theTree)


		theBest = 1000
		bestTree = 0

		for t in list_of_trees:
			error = 1 - tree.check(t, val)

			if error < theBest:
				theBest = error
				bestTree = t
		draw.drawTree(bestTree)
		smallest_error_at_fraction = 1 - tree.check(bestTree, testSet)
		errorList.append(smallest_error_at_fraction)

		# print ("smalest error")
		# print (smallest_error_at_fraction)
		# print ("occured at fraction")
		# print (x)

	return errorList
Example #6
def getData1(iterations):
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    error = [0] * 6
    for i in range(6):
        error[i] = [0] * iterations
    #print("\nMonk1")
    for f in range(len(fraction)):
        #print("\nFactor: %.1f" % f)
        for i in range(0, iterations):
            monk1train, monk1val = partition(mdata.monk1, fraction[f])
            monk1tree = dtree.buildTree(monk1train, mdata.attributes)
            while True:
                prunelist = dtree.allPruned(monk1tree)
                temptree = monk1tree
                for x in prunelist:
                    if dtree.check(x, monk1val) >= dtree.check(
                            temptree, monk1val):
                        temptree = x

                if temptree == monk1tree:
                    break
                monk1tree = temptree

            error[f][i] = dtree.check(monk1tree, mdata.monk1test)
    return error
def buildtree():
    for i in range(len(trainingset)):
        tree=d.buildTree(trainingset[i].dataset,m.attributes)        
        performanceOnTrainData = d.check(tree,trainingset[i].dataset)
        performanceOnTestData=d.check(tree,testset[i].dataset)
        print("Error of " + trainingset[i].name+ "on " + testset[i].name + ":" + str(1-performanceOnTestData))
        print("Error of " + trainingset[i].name+ "on " + trainingset[i].name + ":" + str(1-performanceOnTrainData))
Example #8
def pruning(data_set, fraction = 0.6):
    # A function that returns a pruned decision tree from a data set
    data_train, data_val = partition(data_set, fraction)

    # The tree to become pruned
    tree_pruned = dtree.buildTree(data_train, m.attributes)
    err_tree_pru = dtree.check(tree_pruned, data_val)
#    print("Tree before prune:")
#    print(tree_pruned)

    better = True
    while better:
        better = False
        trees_alt = dtree.allPruned(tree_pruned)
        best_prune = None
        err_best = 0

        for alternative in trees_alt:
            err_alternative = dtree.check(alternative, data_val)

            if err_alternative >= err_tree_pru and err_alternative > err_best:
                best_prune = alternative
                err_best = err_alternative
                better = True

        if better:
            tree_pruned = best_prune
            err_tree_pru = err_best

    return tree_pruned
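For context, a hypothetical usage of the pruning function above (not part of the original example): the pruned tree is typically compared against the unpruned one on a held-out test set, using the m/dtree aliases and partition helper sketched earlier.

# Hypothetical usage; m.monk1 / m.monk1test are the standard MONK-1 sets.
pruned = pruning(m.monk1, fraction=0.6)
full = dtree.buildTree(m.monk1, m.attributes)
print("unpruned test error: %.3f" % (1 - dtree.check(full, m.monk1test)))
print("pruned test error:   %.3f" % (1 - dtree.check(pruned, m.monk1test)))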
Example #9
def tests(pair):
    tree=dtree.buildTree(pair[0], monkdata.attributes)
    return [
            pair[2],
            dtree.check(tree,pair[0]),
            dtree.check(tree,pair[1])
    ]
def pruning(p1, training, validation):
    validation_best = 1 - d.check(p1[0], validation)
    for i in range(len(p1)):
        validation_current = 1 - d.check(p1[i], validation)
        if (validation_current < validation_best):
            validation_best = validation_current
    return validation_best
Example #11
def assignment4_p1(data, attributes, fraction):
    trainData, validData = partition(data, fraction)
    dataTree = d.buildTree(trainData, attributes)
    orgErr = 1 - d.check(dataTree, validData)
    print("ORIGINAL ERR", orgErr)
    orgTree = dataTree
    bestPrunedTree = orgTree
    cont = True
    while cont:
        # Stop as soon as a full pass over the prunings yields no improvement.
        cont = False
        bestErrorRate = orgErr
        prunedTrees = d.allPruned(bestPrunedTree)
        print(len(prunedTrees))
        for i in range(0, len(prunedTrees)):
            err = 1 - d.check(prunedTrees[i], validData)
            print(i, err)
            if err < bestErrorRate:
                bestErrorRate = err
                bestPrunedTree = prunedTrees[i]
                cont = True
                print("Best Error Rate:", bestPrunedTree, bestErrorRate)

        orgTree = bestPrunedTree
        orgErr = bestErrorRate

    return orgTree
Example #12
def plotPruneAccuracy(dataset, test):

    print("Pruning accuracy results ", "\n")
    fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    fractionResults = list()

    bestFraction = fractions[0]
    bestFractionScore = 0
    for fraction in fractions:
        prune = pruning(dataset, fraction)
        result = d.check(prune, test)
        fractionResults.append(result)
        plt.plot(fraction, result, 'ro')
        if result > bestFractionScore:
            bestFractionScore = result
            bestFraction = fraction

    plt.xlabel("Fraction")
    plt.ylabel("Classification Accuracy score on a test set")
    plt.title(
        "Classification accuracy of the pruned tree on test data as a function of partitioning fraction"
    )
    plt.show()

    print("Best partitioning fraction: ", bestFraction, "\n")
    print("Fraction score: ", bestFractionScore, "\n")
    return fractionResults
Example #13
def best_pruned(base,valid_set):
	pruned = d.allPruned(base)
	best = (base,d.check(base,valid_set))
	for tree in pruned:
		perf = d.check(tree,valid_set)
		if perf >= best[1]:
			best = (tree, perf)
	return best
Example #14
def oneprune(tree, valset):
    tree_list = [
        tr for tr in dtree.allPruned(tree)
        if dtree.check(tr, valset) > dtree.check(tree, valset)
    ]
    if len(tree_list) == 0:
        return [tree]
    return [pruned for tr in tree_list for pruned in oneprune(tr, valset)]
def pruning (p1):
    best = p1[0]  # ensure a tree is returned even if no candidate improves on p1[0]
    validation_best = 1-d.check(p1[0], monk1val)
    for i in range(len(p1)):
        validation_current = 1-d.check(p1[i], monk1val)
        if (validation_current < validation_best) :
            validation_best = validation_current
            best = p1[i]
    return best
Example #16
def ass3():
    test = [mdata.monk1test, mdata.monk2test, mdata.monk3test]
    count = 0
    for dset in [mdata.monk1, mdata.monk2, mdata.monk3]:
        t = dtree.buildTree(dset, mdata.attributes)
        print("Training error for set " + str(count + 1) + ": " + str(1 - dtree.check(t, dset)))
        print("Test error for set " + str(count + 1) + ": " + str(1 - dtree.check(t, test[count])))
        count = count + 1
def bestPruned(tree, validationDataset):
    bestTree = tree
    while True:
        tempTree = bestPrunedFromList(bestTree, validationDataset)
        if (dtree.check(tempTree, validationDataset) >= dtree.check(
                bestTree, validationDataset)):
            bestTree = tempTree
        else:
            return bestTree
Example #18
def assignment4(): 
    fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    dataset = [("monk1", m.monk1, m.monk1test), ("monk3", m.monk3, m.monk3test)]
    l = []
    for f in fractions:
        extraCurry = d.check(assignment4helper(dataset[0][1], f), dataset[0][2])
        stektLok = d.check(assignment4helper(dataset[1][1], f), dataset[1][2])
        print("%.2f %.2f %.2f" % (f, 1 - extraCurry, 1 - stektLok))
        l.append(extraCurry)
Example #19
def assignment5_id3():
    t1 = d.buildTree(m.monk1, m.attributes)
    #qt.drawTree(t1)
    print(1 - d.check(t1, m.monk1test))
    t2 = d.buildTree(m.monk2, m.attributes)
    print(1 - d.check(t2, m.monk2test))
    #qt.drawTree(t2)
    t3 = d.buildTree(m.monk3, m.attributes)
    print(1 - d.check(t3, m.monk3test))
Example #20
File: lab1.py Project: mkufel/ML
def prune(tree, valSet):
    currentTree = tree
    currentPerf = dt.check(currentTree, valSet)
    pTrees = dt.allPruned(currentTree)
    for pTree in pTrees:
        if (dt.check(pTree, valSet) > currentPerf):
            currentTree = prune(pTree, valSet)
            currentPerf = dt.check(currentTree, valSet)
    return currentTree
Example #21
def pruneNow(tree, data, testData):
    newVal = 1 - d.check(tree, data)

    for prunedTree in d.allPruned(tree):
        val = 1 - d.check(prunedTree, testData)
        if val < newVal:
            newVal = val

    return newVal
def getClasification(dataset,fraction):
    monk1train, monk1val = partition(dataset,fraction)
    testTree = tree.buildTree(monk1train,m.attributes)
    prunedTrees = tree.allPruned(testTree)
    bestTree = testTree
    pValue = 0
    for pruned in prunedTrees:
        if(tree.check(pruned,monk1val) > pValue):
            bestTree = pruned
            pValue = tree.check(pruned,monk1val)
    return pValue, bestTree
Example #23
def prune_tree(tree, validation):
    pruned_trees = d.allPruned(tree)
    if not pruned_trees:
        return tree
    pruned_trees_performance = [d.check(candidate, validation) for candidate in pruned_trees]
    best_performance = max(pruned_trees_performance)
    if d.check(tree, validation) <= best_performance:
        tree = pruned_trees[pruned_trees_performance.index(best_performance)]
        tree = prune_tree(tree, validation)
    return tree
def bestPrunedFromList(tree, validationDataset):
    listOfTrees = dtree.allPruned(tree)
    bestValue = dtree.check(tree, validationDataset)
    bestTree = listOfTrees[len(listOfTrees) - 1]
    for tree in listOfTrees:
        temp = dtree.check(tree, validationDataset)
        if temp > bestValue:
            bestValue = temp
            bestTree = tree
    return bestTree
Example #26
def buildTreesAndComputePerformance():
    """ Assignment 3: """
    for i in range(len(trainingSets)):
        tree = d.buildTree(trainingSets[i].dataset, m.attributes)
        performanceOnTest = d.check(tree, testSets[i].dataset)
        performanceOnTrain = d.check(tree, trainingSets[i].dataset)
        print("Error of " + trainingSets[i].name + " on " + testSets[i].name +
              ": " + str(1 - performanceOnTest))
        print("Error of " + trainingSets[i].name + " on " +
              trainingSets[i].name + ": " + str(1 - performanceOnTrain))
        print("")
Example #27
def find_prunned(data_part, f_part):
    monk1train, monkvalue = partition(data_part, f_part)
    dtree = tree.buildTree(monk1train, dataset.attributes)
    prun_list = tree.allPruned(dtree)
    current_correctness = tree.check(dtree, monkvalue)
    for current_tree in prun_list:
        check_correctness = tree.check(current_tree, monkvalue)
        if check_correctness > current_correctness:
            current_correctness = check_correctness
            dtree = current_tree
    return dtree
Example #28
def assignment3():
	print "--- Assignment 3 ---"
	print "Performance of the decision trees"
	table = Texttable(max_width=100)
	table.add_row(["Dataset", "Training", "Test"])
	for i in range(3):
		tree = d.buildTree(monkdata[i],m.attributes)
		perf = [d.check(tree, monkdata[i]), d.check(tree, testdata[i])]
		table.add_row(["Monk-" + str(i+1)] + perf)
	print table.draw()
	print
Example #29
def test_pruning(dataset, testset):
    fraction_list = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    print ("TESTING PRUNING")
    for fraction in fraction_list:
        print("--------------")
        print(fraction)
        monk_tree = d.buildTree(dataset,m.attributes)
        training, validation = partition(dataset, fraction)
        pruned_monk_tree = prune_tree(monk_tree,validation)
        print(d.check(monk_tree, testset))
        print(d.check(pruned_monk_tree, testset))
        print("--------------")
Example #30
def prunedTree(training, validation):
    tree = dtree.buildTree(training, m.attributes)
    poss = dtree.allPruned(tree)
    scores = []
    for i in range(len(poss)):
        scores.append(dtree.check(poss[i], validation))
    while scores and max(scores) >= dtree.check(tree, validation):
        tree = poss[scores.index(max(scores))]
        poss = dtree.allPruned(tree)
        scores = []
        for i in range(len(poss)):
            scores.append(dtree.check(poss[i], validation))
    return tree
Example #31
def checkperformance(tree, monk1val):
    pruned_trees = d.allPruned(tree)
    t1_better_performance = -1
    best_tree = None
    for t in pruned_trees:
        if t1_better_performance < d.check(t, monk1val):
            t1_better_performance = d.check(t, monk1val)
            best_tree = t

    if t1_better_performance >= d.check(tree, monk1val):
        return checkperformance(best_tree, monk1val)

    return tree
Example #32
def prune_tree(monkdata_set, num_trials=50):
    """ 
        Randomizes data and then splits into partitions based on partition_fractions
        Creates a tree based on the first partition (training data)
        Prunes that tree multiple times to see effect of pruning and partition on accuracy 
        Returns a dict with partition_fraction mapped to best accuracy list

        :param monkdata_set: monkdata set from monkdata.py
        :param num_trials: number of trials to run

        :returns dict: partition_fraction mapped to a list of tuples
    """

    partition_fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

    #Key: partition_fraction. Value: list of max accuracy in the pruning
    partition_accuracy = OrderedDict()

    for i in range(0, num_trials):

        for partition_fraction in partition_fractions:

            monk_training, monk_validation = partition(monkdata_set,
                                                       partition_fraction)
            tree = dtree.buildTree(monk_training, monkdata.attributes)
            accuracy = dtree.check(tree, monk_validation)

            prune_counter = 0
            max_accuracy = accuracy
            max_accuracy_prune = 0

            pruned_trees = dtree.allPruned(tree)

            for pruned_tree in pruned_trees:
                prune_counter += 1
                pruned_accuracy = dtree.check(pruned_tree, monk_validation)

                #Keep track of the largest prune_accuracy and number
                if pruned_accuracy > max_accuracy:
                    max_accuracy = pruned_accuracy
                    max_accuracy_prune = prune_counter

            #If we haven't stored the fraction yet, create a new array
            if not partition_fraction in partition_accuracy:
                partition_accuracy[partition_fraction] = list()

            #Add our most recent trial result there
            prune_result = (max_accuracy_prune, max_accuracy)
            partition_accuracy[partition_fraction].append(prune_result)

    return partition_accuracy
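As a follow-up, one hypothetical way to summarize the dict returned above (mean of the best pruned accuracy per fraction); the call and names below are illustrative assumptions, not part of the original example.

# Hypothetical summary of prune_tree()'s result: average best pruned accuracy per fraction.
results = prune_tree(monkdata.monk1, num_trials=50)
for fraction, trials in results.items():
    accuracies = [acc for (_, acc) in trials]
    print("fraction %.1f: mean best accuracy %.3f" % (fraction, sum(accuracies) / len(accuracies)))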
Example #33
def optimisePartitions3():  #runs
    tree3 = d.buildTree(m.monk3, m.attributes)
    score3 = d.check(tree3, m.monk3test)
    print("Performance of monk3 tree: " + str(score3) + "\n")
    for index, partition in enumerate(partitions):
        for j in range(runs):
            train3, val3 = d.partition(m.monk3, partition)
            tree3a = d.buildTree(train3, m.attributes)
            best3 = bestPrunedTree(tree3a, val3)
            bigList3.append(1 - d.check(best3, m.monk3test))
        errorList3.append(sum(bigList3) / len(bigList3))
        varianceList3.append(variance(bigList3, errorList3[index]))

    return errorList3, varianceList3
Example #34
def optimisePartitions1():  #runs
    tree1 = d.buildTree(m.monk1, m.attributes)
    score1 = d.check(tree1, m.monk1test)
    print("Performance of monk1 tree: " + str(score1) + "\n")
    for index, partition in enumerate(partitions):
        for j in range(runs):
            train1, val1 = d.partition(m.monk1, partition)
            tree1a = d.buildTree(train1, m.attributes)
            best1 = bestPrunedTree(tree1a, val1)
            bigList1.append(1 - d.check(best1, m.monk1test))
        errorList1.append(sum(bigList1) / len(bigList1))
        varianceList1.append(variance(bigList1, errorList1[index]))

    return errorList1, varianceList1
Example #35
def assignment_7(monktrain, monkval):
    t = dtree.buildTree(monktrain, m.attributes)
    p1 = performance = dtree.check(t, monkval)
    better_found = True
    while better_found:
        prunes = dtree.allPruned(t)
        better_found = False
        for prune in prunes:
            tmp_performance = dtree.check(prune, monkval)
            if tmp_performance > performance:
                t = prune
                performance = tmp_performance
                better_found = True
    return p1, dtree.check(t, monkval)
def findBestPrune(tree, validationdata):
    prunedtree = d.allPruned(tree)
    besttree = tree
    bestperformance = d.check(besttree, validationdata)
    for candidatetree in prunedtree:
        candidateperformance = d.check(candidatetree, validationdata)
        # just take greater because all prunes returns the original tree as well?
        if (candidateperformance > bestperformance):
            besttree = candidatetree
            bestperformance = candidateperformance
    if besttree == tree:
        return tree
    else:
        return findBestPrune(besttree, validationdata)
Example #37
def pruneTree(trainSet, fraction):
    monktrain, monkval = partition(trainSet, fraction)
    bestTree = dtree.buildTree(monktrain, m.attributes)
    treePermutations = dtree.allPruned(bestTree)


    bestVal = dtree.check(bestTree, monkval)

    for treeP in treePermutations:
        treePerformance = dtree.check(treeP, monkval)
        if (treePerformance > bestVal):
            bestTree = treeP
            bestVal = treePerformance
    return bestVal, bestTree, monkval
Example #38
def find_best_pruned_tree(tree, validate):
    best_perf = d.check(tree, validate)
    forest = d.allPruned(tree)

    best_tree = tree

    for t in forest:
        temp_perf = d.check(t, validate)
        if temp_perf > best_perf:
            best_perf = temp_perf
            best_tree = t

    return best_tree, best_perf
Example #39
def best_pruned_tree(dataset, fraction):
    train, val = partition(dataset, fraction)
    tree = dt.buildTree(train, m.attributes)
    improved = True
    while improved:
        improved = False
        best_performance = dt.check(tree, val)
        for pruned_tree in dt.allPruned(tree):
            performance = dt.check(pruned_tree, val)
            if performance > best_performance:
                best_performance = performance
                tree = pruned_tree
                improved = True
    return tree
Example #41
def check_correct_incorrect_classification(datasets, test_datasets,
                                           datasets_names):
    datasets_trees = perform_buildTree(datasets)
    check = {}
    check_e = np.zeros((len(datasets), 2))
    for i, dataset, dataset_name, dataset_tree, test_dataset in zip(
            range(len(datasets)), datasets, datasets_names, datasets_trees,
            test_datasets):
        correct_classification = round(d.check(dataset_tree, test_dataset), 3)
        check[dataset_name] = correct_classification
        err = round(1 - d.check(dataset_tree, dataset), 3), round(
            (1 - correct_classification), 3)
        check_e[i] = err
    return check, check_e
def prune_tree(tree, validation_set):
    cur_tree = tree
    while 1:
        alternatives = dtree.allPruned(cur_tree)
        best_acc = dtree.check(cur_tree, validation_set)
        best_alt = cur_tree
        for alt in alternatives:
            alt_acc = dtree.check(alt, validation_set)
            if alt_acc >= best_acc:
                best_acc = alt_acc
                best_alt = alt
        if best_alt == cur_tree:
            return cur_tree
        cur_tree = best_alt
Example #43
def A3():
  t1 = dT.buildTree( m.monk1, m.attributes )
  print( dT.check( t1, m.monk1test ) )
  print( dT.check( t1, m.monk1 ) )
  print '\n'
  #draw.drawTree( t1 )

  t2 = dT.buildTree( m.monk2, m.attributes )
  print( dT.check( t2, m.monk2test ) )
  print '\n'
  #draw.drawTree( t2 )

  t3 = dT.buildTree( m.monk3, m.attributes )
  print( dT.check( t3, m.monk3test ) )
  print '\n'
Example #44
def prune(t, val):
	bestTree = t
	bestPerf = d.check(t, val)
	found = True

	while(found):
		found = False
		trees = d.allPruned(bestTree)
		for tree in trees:
			perf = d.check(tree, val)
			if(perf >= bestPerf):
				bestTree = tree
				bestPerf = perf
				found = True
	return bestTree
Example #45
def prune(pruned_tree, test_tree):
    currentBase = pruned_tree
    oldVal = 0
    maxVal = 1
    while maxVal > oldVal:
        maxVal = dt.check(currentBase, test_tree)
        oldVal = maxVal
        maxTree = currentBase
        for pTree in dt.allPruned(currentBase):
            temp = dt.check(pTree, test_tree)
            if temp > maxVal:
                maxVal = temp
                maxTree = pTree
        currentBase = maxTree
    return maxTree
Example #46
def prun(tree, val):
    candidates = {}
    pruns = dt.allPruned(tree)
    for p in pruns:
        performance = dt.check(p, val)
        candidates[p] = performance
    return candidates
Example #47
def check_tree_performance(tree, testset):
    '''
    Checks the performance of the given decision tree
    on the test set.
    '''
    performance_test = dt.check(tree, testset)
    return performance_test
Example #48
def getPrunedChildren(toPrune, bestErrorRate, validData):
    bestPrunedTreesGrandChildren = []
    for bestPrunedTreeIndex in range(0, len(toPrune)):
        # print(toPrune[bestPrunedTreeIndex])
        prunedTreesChildren = d.allPruned(toPrune[bestPrunedTreeIndex])
        # print(len(prunedTreesChildren))
        tempPrunedTreesGrandChildren = []
        notFound = False
        for i in range(0, len(prunedTreesChildren)):
            err = 1 - d.check(prunedTreesChildren[i], validData)
            # print(i, err)
            if err <= bestErrorRate:
                # Recurse on the improved pruning and collect the flat list of trees it returns.
                tempPrunedTreesGrandChildren.extend(getPrunedChildren([prunedTreesChildren[i]], err, validData))
            else:
                notFound = True

        if notFound:
            tempPrunedTreesGrandChildren.append(toPrune[bestPrunedTreeIndex])
        bestPrunedTreesGrandChildren += tempPrunedTreesGrandChildren
        # print(len(bestPrunedTreesGrandChildren))
    return bestPrunedTreesGrandChildren
Example #49
def pruned_tree_performance(training_set, test_set, fraction):
    total_performance = 0
    for i in range(AGGREGATE_TIMES):
        tree = best_pruned_tree(training_set, fraction)
        total_performance += dt.check(tree, test_set)
    average_performance = total_performance / AGGREGATE_TIMES
    return average_performance
Example #50
def pruneTree(tree, validation):
    run = True
    bestGain = d.check(tree, validation)
    bestTree = tree
    prunedTrees = d.allPruned(tree)

    while run:
        maxgain = 0
        maxTree = None
        #print("Number of possible prunings: %d" % len(prunedTrees))

        for x in range(0, len(prunedTrees)):
            currentgain = d.check(prunedTrees[x], validation)
            #print("Rate for tree %d: %f " % (x + 1, currentgain))
            if currentgain > maxgain:
                maxgain = currentgain
                maxTree = prunedTrees[x]

        # Keep pruning only while the best pruned tree improves on the best so far.
        if maxTree is not None and maxgain > bestGain:
            bestGain = maxgain
            bestTree = maxTree
            prunedTrees = d.allPruned(bestTree)
        else:
            run = False
            #print("Max accuracy reached. Pruning stopped.")
            #print("Best accuracy: %f" % bestGain)

    return bestTree
Example #51
def assignment4():
	print "--- Assignment 4 ---"
	print "Selecting the best fraction to divide training and validation sets for pruning"
	
	table = Texttable(max_width=100)
	table.add_row(["Dataset", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "Benchmark"])
	for i in range(3):
		row = ["Monk-" + str(i+1)]
		for frac in [(x * 0.1) for x in range(3,9)]:
			train_set, valid_set = m.partition(monkdata[i], frac)
			base = d.buildTree(train_set,m.attributes)
			best = best_pruned(base,valid_set)
			true_perf = d.check(best[0],testdata[i])
			row += [true_perf]
		row += [d.check(d.buildTree(monkdata[i],m.attributes),testdata[i])]
		table.add_row(row)
	print table.draw()
	print					
Example #52
def findPrunned(t, monk1val1):
    t2 = d.allPruned(t)

    maxi1 = d.check(t, monk1val1)
    maxi2 = maxi1
    answertree = t

    for s in t2:
        val = d.check(s, monk1val1)
        # Keep the pruned tree with the best validation performance.
        if val > maxi1:
            maxi1 = val
            answertree = s

    if maxi1 == maxi2:
        # No pruning improved on the current tree: stop here.
        efficiency.append(maxi1)
        print maxi1
        return maxi1
    else:
        return findPrunned(answertree, monk1val1)
Example #53
def pruning( trainingSet, testSet, fraction ):
  train1, train2 = partition( trainingSet, fraction )

  bestTree = dT.buildTree( train1, m.attributes )
  bestTreePerf = dT.check( bestTree, train2 )
  bestTreeFound = True

  while bestTreeFound == True:
    bestTreeFound = False

    prunedTrees = dT.allPruned( bestTree )

    for candidateTree in prunedTrees:

      if dT.check( candidateTree, train2 ) >= bestTreePerf:
        bestTree = candidateTree
        bestTreePerf = dT.check( candidateTree, train2 )
        bestTreeFound = True

  return dT.check( bestTree, testSet )
Example #54
def findBestPrune(tree, validationSet):
#    print("tree")
#    print(tree)
    current=tree
    while True:
        currentPerformance=dtree.check(current, validationSet)	
        pruned=dtree.allPruned(current)	
        if pruned == ():
            break
#        print("current")
#        print(current)
#        print("pruned trees")
#        print(len(pruned))
        performances=map(lambda t : dtree.check(t, validationSet), pruned)
        best, i=max(izip(performances,count())) 
        # ask which trees we should pick when performance is equal? min depth, min average depth, min no of nodes, order in allPruned
        if best < currentPerformance:
            break
        current = pruned[i]
    return current		 
Example #55
def generateErrorTable(dataset, testset, fractions, tries):
    result=[]	
    for x in fractions:
        acc = 0
        for i in range(tries):
            trainSet, valSet =partition(dataset, x)

            tree = dtree.buildTree(trainSet, m.attributes)
            prunedTree = findBestPrune(tree, valSet)
            acc += dtree.check(prunedTree, testset)
        result.append( (x,acc / tries) )
    return result
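A hypothetical way to print the result of generateErrorTable (each entry is a (fraction, mean accuracy) pair); the monk1 names below are assumptions for illustration.

# Hypothetical call; m, dtree, partition and findBestPrune as defined in this project.
fractions = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
for fraction, mean_acc in generateErrorTable(m.monk1, m.monk1test, fractions, 100):
    print("fraction %.1f: mean test error %.3f" % (fraction, 1 - mean_acc))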
Example #56
def prune(tree, testdata, performance_ref):
        #Prunes tree from given test data
        alternatives = dt.allPruned(tree)
        best_per = 0
        best_tree = None
        for subtree in alternatives:
                performance = dt.check(subtree, testdata)
                if performance > best_per:
                        best_per = performance
                        best_tree = subtree
        if best_per >= performance_ref:
                return prune(best_tree, testdata, performance_ref)
        else:
                return tree
Example #57
def findBestTree(tree, compare, lastBest=0, lastBestTree=None):
	bestTree = lastBestTree
	bestVal = lastBest

	for p in d.allPruned(tree):
		val = d.check(p, compare)
		if val > bestVal:
			bestTree = p
			bestVal = val

	if(bestVal > lastBest):
		return findBestTree(bestTree, compare, bestVal, bestTree)
	else:
		return bestTree
Example #58
def build_tree():
  print "\n------------------------------\nAssignment 3 - Error\n------------------------------"
  tree = dt.buildTree(data.monk1, data.attributes)
#drawtree.drawTree(tree)
  print "Dataset\tE(train)\tE(test)"
  print "Monk1:\t%.6f\t%.6f" % (1-dt.check(tree, data.monk1), 1-dt.check(tree, data.monk1test))
  tree = dt.buildTree(data.monk2, data.attributes)
  print "Monk2:\t%.6f\t%.6f" % (1-dt.check(tree, data.monk2), 1-dt.check(tree, data.monk2test))
  tree = dt.buildTree(data.monk3, data.attributes)
  print "Monk3:\t%.6f\t%.6f" % (1-dt.check(tree, data.monk3), 1-dt.check(tree, data.monk3test))
Example #59
def prune():
  print "\n------------------------------\nAssignment 4 - Pruning\n------------------------------"
  print "Dataset\t  0.3\t\t  0.4\t\t  0.5\t\t  0.6\t\t  0.7\t\t  0.8"
  partSizes = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
  r = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
  i = 0
  for size in partSizes:  
    for j in range(100):
      training, test = partition(data.monk1, size)
      bestTree = dt.buildTree(training, data.attributes)
      bestClass = dt.check(bestTree, test)
      better = True
      while better:
        better = False
        for subTree in dt.allPruned(bestTree):
          if dt.check(subTree, test) > bestClass:
            bestTree = subTree
            bestClass = dt.check(subTree, test)
            better = True
      r[i] += (1-dt.check(bestTree, data.monk1test))
    i += 1
  print "Monk1\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t" % (r[0]/100, r[1]/100, r[2]/100, r[3]/100, r[4]/100, r[5]/100)
  r = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
  i = 0
  for size in partSizes:  
    for j in range(100):
      training, test = partition(data.monk3, size)
      bestTree = dt.buildTree(training, data.attributes)
      bestClass = dt.check(bestTree, test)
      better = True
      while better:
        better = False
        for subTree in dt.allPruned(bestTree):
          if dt.check(subTree, test) >= bestClass:
            bestTree = subTree
            bestClass = dt.check(subTree, test)
            better = True
      r[i] += (1-dt.check(bestTree, data.monk3test))
    i += 1
  print "Monk3\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t%0.6f\t" % (r[0]/100, r[1]/100, r[2]/100, r[3]/100, r[4]/100, r[5]/100)
Example #60
def assignment4helper(dataset, fraction):
    monk1train, monk1val = partition(dataset, fraction)
    tree = d.buildTree(monk1train, m.attributes)

    bestTree = None
    maxVal = -1
    cont = True
    i = 0
    while (cont):
        cont = False
        i += 1
        for t in d.allPruned(tree):
            val = d.check(t, monk1val)
            if (val > maxVal):
                cont = True
                bestTree = t
                maxVal = val
        tree = bestTree
    # print("#iterations: %d" % i)
    return tree