Beispiel #1
0
def get_best_tree(currtree, validation=None):
    """Greedily prune ``currtree`` until no pruned variant scores higher.

    In each round every tree returned by ``dtree.allPruned`` is scored with
    ``dtree.check`` and the highest-scoring one replaces the current tree if
    it is strictly better. Rounds repeat until a round brings no improvement.

    Args:
        currtree: The tree to start pruning from.
        validation: Data set passed to ``dtree.check``. When omitted, the
            module-level ``monkval`` is used, which is what earlier callers
            relied on implicitly.

    Returns:
        The best tree found; ``currtree`` itself if no pruned variant
        scores strictly higher.
    """
    if validation is None:
        # Backward-compatible fallback to the global the function used to
        # read unconditionally.
        validation = monkval

    while True:
        # Score the current tree once per round instead of once per
        # comparison.
        best_score = dtree.check(currtree, validation)
        improved = False

        for candidate in dtree.allPruned(currtree):
            score = dtree.check(candidate, validation)
            if score > best_score:
                currtree, best_score = candidate, score
                improved = True

        if not improved:
            return currtree
def pick_best_tree(old_tree, validation_set):
    """Repeatedly prune ``old_tree`` while validation accuracy does not drop.

    Each round scores every tree from ``dec.allPruned`` with ``dec.check``
    and moves to the best one, provided it is at least as accurate as the
    current tree (ties favour the pruned tree). The previous version
    recursed but threw the recursive result away, so it never returned a
    tree pruned more than one level; it also failed on ``None`` when there
    were no pruned candidates and the current accuracy was 0.

    Args:
        old_tree: The tree to start pruning from.
        validation_set: Data set passed to ``dec.check`` for scoring.

    Returns:
        The pruned tree at which the search stopped, or ``old_tree`` if
        every pruned variant is less accurate.
    """
    current = old_tree
    current_accuracy = dec.check(current, validation_set)

    # NOTE(review): because ties are accepted, termination relies on
    # dec.allPruned eventually returning an empty sequence (i.e. pruned
    # trees getting strictly smaller) -- confirm against dec.allPruned.
    while True:
        candidates = dec.allPruned(current)
        if not candidates:
            return current

        scored = [(dec.check(tree, validation_set), tree)
                  for tree in candidates]
        # max() keeps the first of several equally good candidates, the
        # same tie-break as the original strict ``>`` scan.
        best_accuracy, best_tree = max(scored, key=lambda pair: pair[0])

        if best_accuracy < current_accuracy:
            return current

        current, current_accuracy = best_tree, best_accuracy
def optimum_prune(tree, val_data):
    """Prune ``tree`` step by step while validation performance improves.

    Starting from the unpruned tree, each round scores every tree from
    ``allPruned`` with ``check`` and moves to the best one only if it is
    strictly better than the current optimum. The previous version never
    updated the tree being pruned, so it stopped after a single level, and
    it returned ``(None, 0)`` when there were no pruned candidates.

    Args:
        tree: The tree to start pruning from.
        val_data: Data set passed to ``check`` for scoring.

    Returns:
        A ``(tree, performance)`` tuple for the best tree found. The tree is
        the input ``tree`` itself when no pruned variant scores higher.
    """
    optimum = (tree, check(tree, val_data))

    while True:
        # Always prune the current optimum, not the original input.
        performance = [(t, check(t, val_data))
                       for t in allPruned(optimum[0])]
        if not performance:
            break

        # Scanning in reverse makes max() return the last of several equally
        # good candidates, matching the original ``>=`` tie-break.
        local_opt = max(reversed(performance), key=lambda pair: pair[1])

        # Strict improvement guarantees termination: the score rises every
        # round.
        if local_opt[1] > optimum[1]:
            optimum = local_opt
        else:
            break

    return optimum
def assignment_5():
    """Print training and test error of the unpruned trees for MONK1-3.

    A tree is built from each training set with ``buildTree``; the error is
    reported as ``1.0 - check(tree, data)`` for the training set and the
    matching test set.
    """
    print("*** ASSIGNMENT 5 ***")

    cases = (
        ('MONK1', monk1, monk1test),
        ('MONK2', monk2, monk2test),
        ('MONK3', monk3, monk3test),
    )

    # All trees are built before anything is evaluated or printed.
    fitted_trees = [buildTree(train, attributes) for _, train, _ in cases]

    template = "{} -- E_train: {}; E_test: {}"

    for (label, train, test), fitted in zip(cases, fitted_trees):
        train_error = 1.0 - check(fitted, train)
        test_error = 1.0 - check(fitted, test)
        print(template.format(label, train_error, test_error))

    print("\n")
Beispiel #5
0
    # print(s)

    # Onwards
    # tree = dtree.buildTree(monkdata.monk1, monkdata.attributes, maxdepth=1)  # Two levels
    # tree = dtree.buildTree(monkdata.monk1, monkdata.attributes)  # All levels
    # drawtree.drawTree(tree)  # Show tree

    # PERFORMANCE CHECK PART
    # Parallel tuples: entry i of each one describes the same dataset.
    name_sets = ('MONK-1', 'MONK-2', 'MONK-3')
    training_sets = (monkdata.monk1, monkdata.monk2, monkdata.monk3)
    test_sets = (monkdata.monk1test, monkdata.monk2test, monkdata.monk3test)

    # One tree per training set, in the same order as the tuples above.
    trees = list(
        dtree.buildTree(training_set, monkdata.attributes)
        for training_set in training_sets)

    print('# Performance Check')

    # Table layout handed to tabulate(): one row per dataset.
    header = ['Dataset', 'Train', 'Test']
    data = []

    for tree, name_set, training_set, test_set in zip(trees, name_sets,
                                                      training_sets,
                                                      test_sets):
        # Each cell is 1 - check(...), i.e. the complement of the score
        # returned by dtree.check on the training / test data.
        # NOTE(review): rounding happens before the subtraction, so the
        # result can carry float noise; round(1 - x, 5) would avoid that.
        data.append([
            name_set, 1 - round(dtree.check(tree, training_set), 5),
            1 - round(dtree.check(tree, test_set), 5)
        ])

    print(tabulate(data, header))
Beispiel #6
0
                                                   datasets_test):

        # Per-dataset accumulators: table rows, and the mean / standard
        # deviation of the error for each fraction (used for the plot).
        data = []
        mean_errors = []
        stdev = []

        for fraction in fractions:

            errors = []

            # n independent random splits for this fraction.
            for i in range(n):
                monktrain, monkval = partition(dataset, fraction)
                built_tree = dtree.buildTree(monktrain, m.attributes)
                # NOTE(review): monkval is not passed to get_best_tree;
                # confirm that get_best_tree really evaluates against the
                # split created on the line above.
                best_tree = get_best_tree(built_tree)

                # Error = 1 - score of the pruned tree on the test data.
                errors.append(1 - dtree.check(best_tree, dataset_test))

            # The same rounded mean is computed twice: once for the table
            # row and once for the plot series.
            mean_error = round(statistics.mean(errors), decimals)
            mean_errors.append(round(statistics.mean(errors), decimals))

            stdev.append(round(statistics.stdev(errors), decimals))

            # NOTE(review): the third column is the mean of all standard
            # deviations collected so far, not this fraction's own standard
            # deviation -- confirm this is intended.
            data.append([fraction, mean_error, statistics.mean(stdev)])

        print(tabulate(data, header), '\n')

        # Mean error per fraction with the per-fraction stdev as error bars.
        plt.errorbar(fractions, mean_errors, yerr=stdev, marker='o')

        plt.title('{} (n = {})'.format(dataset_name, n))
        plt.xlabel('fraction')
        plt.ylabel('mean error')
def main():
    """Run the MONK decision-tree report and the pruning experiments.

    Prints, in order: the entropy of each training set, the best attribute
    for MONK1, the information gain of attributes a1..a6 per dataset, and
    the training/test error of an unpruned tree per dataset. Then, for
    MONK3 and MONK1, sweeps the train/validation split fraction, prunes
    with ``pick_best_tree``, plots the averaged ``dec.check`` score on the
    test set, saves the figure and shows it. Finally prints the best
    fraction found for MONK1.
    """
    monk1 = m.monk1
    monk2 = m.monk2
    monk3 = m.monk3

    for label, dataset in (("MONK 1: ", m.monk1),
                           ("MONK 2: ", m.monk2),
                           ("MONK 3: ", m.monk3)):
        print(label, dec.entropy(dataset))

    print(dec.bestAttribute(monk1, m.attributes))

    def report_gains(heading, dataset):
        # Information gain of the first six attributes, labelled a1..a6.
        print(heading)
        for index in range(6):
            gain = dec.averageGain(dataset, m.attributes[index])
            print("Information gain of a" + str(index + 1) + " is " +
                  str(gain))

    report_gains("MONK1:", monk1)
    report_gains("MONK2:", monk2)
    report_gains("MONK3:", monk3)

    def report_errors(heading, train_data, test_data):
        # Error is reported as 1 - dec.check(...) for an unpruned tree.
        print(heading)
        fitted = dec.buildTree(train_data, m.attributes)
        print("Training Error: ", 1 - dec.check(fitted, train_data))
        print("Test Error: ", 1 - dec.check(fitted, test_data))

    report_errors("MONK1:", monk1, m.monk1test)
    report_errors("MONK2:", monk2, m.monk2test)
    report_errors("MONK3:", monk3, m.monk3test)

    def partition(data, fraction):
        # Shuffle a copy and cut it at int(len * fraction).
        shuffled = list(data)
        random.shuffle(shuffled)
        cut = int(len(shuffled) * fraction)
        return shuffled[:cut], shuffled[cut:]

    values = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    repetitions = 1000

    def sweep_fractions(train_data, test_data, figure_path):
        # The accumulated quantity is the raw dec.check score (plotted as
        # "Test accuracy"), so the best fraction is the one with the
        # highest average.
        top_score = 0
        top_fraction = 0
        scores = []
        for fraction in values:
            accumulated = 0
            for _ in range(repetitions):
                train_part, validation_part = partition(train_data, fraction)
                grown = dec.buildTree(train_part, m.attributes)
                pruned = pick_best_tree(grown, validation_part)
                accumulated += dec.check(pruned, test_data)
            mean_score = accumulated / repetitions
            scores.append(mean_score)
            if mean_score > top_score:
                top_score = mean_score
                top_fraction = fraction

        plt.scatter(values, scores)
        plt.xlabel("Split fraction")
        plt.ylabel("Test accuracy")
        plt.savefig(figure_path)
        plt.show()
        return top_fraction

    # MONK3 first, then MONK1; only the MONK1 result is printed.
    sweep_fractions(m.monk3, m.monk3test, "Monk3.png")
    best_fraction = sweep_fractions(m.monk1, m.monk1test, "Monk1.png")

    print(best_fraction)
Beispiel #8
0
# Label the axes object `raw` (created before this point in the script).
raw.set_title('Mean and raw values')
raw.set_xlabel('fraction')
raw.set_ylabel('error')
# Second panel of a 2-row, 1-column subplot grid.
stat = plt.subplot(2, 1, 2)
stat.set_title('Standard deviation')
stat.set_xlabel('fraction')
stat.set_ylabel('standard deviation')

# Sweep the split fraction over 0.3 .. 0.8 (j / 10.0) and, for each of
# `size` random splits, prune the tree until validation accuracy stops
# improving.
for j in range(3, 9):
    # print() as a function call works on both Python 2 and Python 3; the
    # former `print j` statement is a SyntaxError on Python 3.
    print(j)
    total = 0
    for k in range(size):
        # NOTE(review): the variables are named monk1* but the data being
        # partitioned is m.monk3 -- confirm which dataset is intended.
        monk1train, monk1val = partition(m.monk3, j / 10.0)

        t = d.buildTree(monk1train, m.attributes)
        checkT = d.check(t, monk1val)

        while True:
            # Remember the score at the start of the round so we can tell
            # whether any pruned tree improved on it.
            hold = checkT
            tprune = d.allPruned(t)

            for i in tprune:
                temp = d.check(i, monk1val)
                if temp > checkT:
                    checkT = temp
                    t = i

            # No strictly better pruned tree this round: stop pruning.
            if checkT == hold:
                break
Beispiel #9
0
from python import dtree as d
from python import monkdata as m


# For each MONK dataset: build a tree on the training data and print its
# error rate (1 - d.check) on the matching test data.
for heading, train_set, test_set in (
        ("Error rate MONK 1", m.monk1, m.monk1test),
        ("Error rate MONK 2", m.monk2, m.monk2test),
        ("Error rate MONK 3", m.monk3, m.monk3test),
):
    print(heading)
    t = d.buildTree(train_set, m.attributes)
    print(1 - d.check(t, test_set))
def assignment_7():
    """Measure test error after pruning for a range of split fractions.

    For MONK1 and MONK3 and for every fraction, the training data is
    partitioned ``samples`` times; each time a tree is built on the first
    part, pruned with ``optimum_prune`` against the second part, and its
    error ``1.0 - check(tree, test)`` is recorded. Summary statistics per
    dataset and fraction are pretty-printed, then mean error with standard
    deviation error bars is plotted for both datasets.
    """
    print("*** ASSIGNMENT 7 ***")

    samples = 100
    fractions = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.925, 0.95,
                 0.99)
    datasets = {
        'monk1': {'training': monk1, 'test': monk1test},
        'monk3': {'training': monk3, 'test': monk3test},
    }

    def sample_error(pool, held_out, fraction):
        # One random split: build, prune against the validation part, and
        # score on the held-out test set.
        train, validation = partition(pool, fraction)
        grown = buildTree(train, attributes)
        pruned, _score = optimum_prune(grown, validation)
        return 1.0 - check(pruned, held_out)

    def summarize(errors):
        return {
            'mean': np.mean(errors),
            'median': np.median(errors),
            'std': np.std(errors),
            'max': max(errors),
            'min': min(errors)
        }

    results = {}

    for dataset_name, dataset_data in datasets.items():
        pool = dataset_data['training']
        held_out = dataset_data['test']
        results[dataset_name] = {}

        for fraction in fractions:
            errors = [sample_error(pool, held_out, fraction)
                      for _ in range(samples)]
            results[dataset_name][fraction] = summarize(errors)

    PrettyPrinter(indent=4).pprint(results)

    def mean_and_std(name):
        # Values come out in insertion order, i.e. the order of `fractions`.
        per_fraction = list(results[name].values())
        means = [stats['mean'] for stats in per_fraction]
        stds = [stats['std'] for stats in per_fraction]
        return means, stds

    monk1_means, monk1_stds = mean_and_std('monk1')
    monk3_means, monk3_stds = mean_and_std('monk3')

    plt.figure()
    plt.errorbar(fractions, monk1_means,
                 yerr=monk1_stds,
                 fmt='or',
                 capsize=5,
                 label='MONK-1')
    plt.errorbar(fractions, monk3_means,
                 yerr=monk3_stds,
                 fmt='ob',
                 capsize=5,
                 label='MONK-3')
    plt.xlabel('Pruning fraction size (relative size of training set)')
    plt.ylabel('Classification error')
    plt.legend()
    plt.title(
        'Error vs. fraction size (mean of {} samples, errorbars represent one standard deviation)'
        .format(samples))
    plt.show()