Example #1
    print(max_attr)
    # Recurse on each branch: branches_dict maps every value of the chosen
    # attribute to the subset of samples that take that value.
    for value, branch_data in branches_dict.items():
        branches_nodes[value] = buildtree(branch_data, _remaining_attr,
                                          level + 1)

    # The default leaf predicts the majority class of the current dataset.
    return dtree.TreeNode(max_attr, branches_nodes,
                          dtree.TreeLeaf(dtree.mostCommon(dataset)))


# t = buildtree(m.monk1, m.attributes, 0)

# drawtree_qt5.drawTree(t)

t = dtree.buildTree(m.monk2, m.attributes)

drawtree_qt5.drawTree(t)

# Part 5:
# for name, dset in datasets.items():
#     t = dtree.buildTree(dset, m.attributes)

#     print(name, "train", round(1 - dtree.check(t, dset), 6), end=" ")
#     print(name, "test", round(1 - dtree.check(t, testsets[name]), 6))
# exit()

## PART 6
import random


def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]
Example #2
    inf_gain.append(asdf)
del asdf
print("Information Gain")
pprint.table(inf_gain)
print("\n")
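
# The loop that builds inf_gain is truncated above; here is a hypothetical
# reconstruction (an assumption, not the original author's code) using
# dtree's averageGain, with one row per MONK training set and one column per
# attribute a1..a6. The dt and m aliases follow their use later in this
# example.
import dtree as dt
import monkdata as m

inf_gain = [[dt.averageGain(dataset, attribute) for attribute in m.attributes]
            for dataset in (m.monk1, m.monk2, m.monk3)]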


# 4. Find the dataset and attribute that give the max information gain.
def find_max_gain(gain_matrix):
    best = 0.0
    out = None
    # Scan the parameter, not the global inf_gain, and avoid shadowing
    # the built-in max().
    for monk, row in enumerate(gain_matrix):
        for attr, gain in enumerate(row):
            if gain > best:
                best = gain
                out = [monk + 1, attr + 1]
    if out is None:
        return [None, None, None]
    out.append(best)
    return out


max_gain = find_max_gain(inf_gain)
print("Max gain is at [monk, attr, gain] = " + str(max_gain))

t1 = dt.buildTree(m.monk1, m.attributes)
print(1 - dt.check(t1, m.monk1test))  # 0.8287037037037037
print(1 - dt.check(t1, m.monk1))  # 1.0
drawT.drawTree(t1)
Example #3
print('train set error: ' + str(round(
    (1 - d.check(t2, m.monk2)) * 100, 2)) + '%')
print('test set error: ' + str(round(
    (1 - d.check(t2, m.monk2test)) * 100, 2)) + '%')
print('')
print('decision tree of monk3:')
print(t3)
print('train set error: ' + str(round(
    (1 - d.check(t3, m.monk3)) * 100, 2)) + '%')
print('test set error: ' + str(round(
    (1 - d.check(t3, m.monk3test)) * 100, 2)) + '%')
print('')

#dqt.drawTree(t1)
#dqt.drawTree(t2)
dqt.drawTree(t3)

## Assignment 7: reduced error pruning
fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
err_test1, var_test1 = d.errReducedPruned(m.monk1, m.monk1test, m.attributes,
                                          fraction, 500)
err_test3, var_test3 = d.errReducedPruned(m.monk3, m.monk3test, m.attributes,
                                          fraction, 500)
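
# errReducedPruned is not part of the standard dtree module, so the helper
# below is only a sketch of what it might do (the name, logic and return
# layout are assumptions): for each fraction, repeatedly split the training
# data, grow a tree on one part, prune greedily against the held-out part
# with allPruned/check, and collect mean and variance of the test error.
import random
import statistics


def err_reduced_pruned_sketch(train, test, attributes, fractions, runs):
    mean_errs, var_errs = [], []
    for frac in fractions:
        errs = []
        for _ in range(runs):
            ldata = list(train)
            random.shuffle(ldata)
            split = int(len(ldata) * frac)
            grow, val = ldata[:split], ldata[split:]
            tree = d.buildTree(grow, attributes)
            best_score = d.check(tree, val)
            # Keep pruning while some alternative is at least as good
            # on the validation part.
            improved = True
            while improved:
                improved = False
                for cand in d.allPruned(tree):
                    score = d.check(cand, val)
                    if score >= best_score:
                        best_score, tree, improved = score, cand, True
            errs.append(1 - d.check(tree, test))
        mean_errs.append(statistics.mean(errs))
        var_errs.append(statistics.variance(errs))
    return mean_errs, var_errs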

# plot results
plt.figure(4)
plt.subplot(2, 1, 1)
plt.title('Mean Error vs. Fraction - 500 runs')
plt.xlabel('Fraction')
plt.ylabel('Mean Error')
line1, = plt.plot(fraction, err_test1, 'bo-', label="monk 1")
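
# The snippet is cut off here; a plausible completion (an assumption based
# on the 2x1 subplot grid set up above) plots the monk 3 curve and puts the
# error variances in the second subplot:
line3, = plt.plot(fraction, err_test3, 'ro-', label="monk 3")
plt.legend(handles=[line1, line3])

plt.subplot(2, 1, 2)
plt.xlabel('Fraction')
plt.ylabel('Error Variance')
plt.plot(fraction, var_test1, 'bo-', label="monk 1")
plt.plot(fraction, var_test3, 'ro-', label="monk 3")
plt.legend()
plt.show()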
Example #4
BestAttribute = dtree.bestAttribute(monk3, m.attributes)
print(BestAttribute)

#monk1_A5_1 = dtree.select(monk1,m.attributes[5],1)
#print(monk1_A5_1)

monk1_tree = dtree.buildTree(monk1, m.attributes)
#graf1 = dt.drawTree(monk1_tree)

monk2_tree = dtree.buildTree(monk2, m.attributes)
#graf2 = dt.drawTree(monk2_tree)

monk3_tree = dtree.buildTree(monk3, m.attributes)
#graf3 = dt.drawTree(monk3_tree)

# dtree.check returns the fraction of correctly classified samples,
# so the error rates reported elsewhere are 1 - check(...).
print(dtree.check(monk1_tree, m.monk1))
print(dtree.check(monk1_tree, m.monk1test))
print(dtree.check(monk2_tree, m.monk2test))
print(dtree.check(monk3_tree, m.monk3test))


def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]
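
# Example usage of partition (hypothetical; 0.6 is just an illustrative
# fraction): split MONK-1 into a training part and a validation part.
monk1train, monk1val = partition(m.monk1, 0.6)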


print(dtree.allPruned(monk1_tree))
dt.drawTree(dtree.allPruned(monk1_tree)[1])
Example #5
# FOR MONK-1
t1 = d.buildTree(m.monk1, m.attributes)
print("The error in the training set is {} and the error in the testing set "
      "is {} for MONK-1".format(1 - d.check(t1, m.monk1),
                                1 - d.check(t1, m.monk1test)))

# FOR MONK-2
t2 = d.buildTree(m.monk2, m.attributes)
print("The error in the training set is {} and the error in the testing set "
      "is {} for MONK-2".format(1 - d.check(t2, m.monk2),
                                1 - d.check(t2, m.monk2test)))

# FOR MONK-3
t3 = d.buildTree(m.monk3, m.attributes)
print("The error in the training set is {} and the error in the testing set "
      "is {} for MONK-3".format(1 - d.check(t3, m.monk3),
                                1 - d.check(t3, m.monk3test)))

# TREE DRAWING
d5.drawTree(t1)
d5.drawTree(t2)
d5.drawTree(t3)

# PRUNING

import random
import monkdata as m
import dtree as d

from dtree import allPruned

def partition(data, fraction):
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]
Example #6
for dataset in datasets:
    print("Dataset:             ", dataset['name'])
    decisionTree = None
    for maximumDepth in [2, 5, 10000]:
        decisionTree = dtree.buildTree(dataset['ref'], m.attributes,
                                       maximumDepth)
        print("Depth", maximumDepth)
        print("   Check Training Set:  ", dtree.check(decisionTree,
                                                      dataset['ref']))
        print("   Check Test Set:      ", dtree.check(decisionTree,
                                                      dataset['test']))
        print("   Decision Tree:\n    ", decisionTree)

    # Comment this call out if you don't want the drawing, or move it into
    # the loop above to see the trees at every depth.
    drawLib.drawTree(decisionTree)

# Below is an attempt at a naive implementation of dtree.buildTree
# using mostCommon, select and bestAttribute:
#
# 1. Get the best attribute in terms of information gain.
# 2. Split the dataset on the values of the selected attribute;
#    we will have as many branches as the attribute has values.
# A runnable sketch of this recipe follows the commented stub below.
# steps = []
# step = 0
# maxDepth = 2

# def split(subset, attribute, value):
#     print("Step:        ", step)
#     print("Attribute:   ", attribute)
#     print("Value:       ", value)
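
# A minimal runnable sketch of the naive builder described above, assuming
# the lab helpers (bestAttribute, select, mostCommon, TreeNode, TreeLeaf)
# behave as they are used elsewhere in this file. naive_build_tree is a
# hypothetical name, not part of dtree's API.
def naive_build_tree(dataset, attributes, max_depth):
    # Out of depth or out of attributes: predict the majority class.
    if max_depth == 0 or not attributes:
        return dtree.TreeLeaf(dtree.mostCommon(dataset))
    # Step 1: best attribute in terms of information gain.
    best = dtree.bestAttribute(dataset, attributes)
    remaining = [a for a in attributes if a != best]
    # Step 2: one branch per value of the chosen attribute.
    branches = {}
    for value in best.values:
        subset = dtree.select(dataset, best, value)
        if not subset:
            # Empty split: fall back to the parent's majority class.
            branches[value] = dtree.TreeLeaf(dtree.mostCommon(dataset))
        else:
            branches[value] = naive_build_tree(subset, remaining,
                                               max_depth - 1)
    # The default leaf mirrors the majority-class fallback seen in
    # Example #1's buildtree.
    return dtree.TreeNode(best, branches,
                          dtree.TreeLeaf(dtree.mostCommon(dataset)))

# e.g. naive_build_tree(datasets[0]['ref'], m.attributes, 2)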
Example #7
    perf, dec_tree = prune(dec_tree, val_data)

    # Evaluate performance

    # On the test dataset
    test_perf = misClasRate(dec_tree, train_data)

    # On the training dataset
    train_perf = misClasRate(dec_tree, dataset[k])

    print("Dataset %d\n\tTest misc rate: %f\n\tTrain misc rate: %f" %
          (k + 1, test_perf, train_perf))

    # Plot the tree
    if k == 2:
        drawTree(dec_tree)
'''
Accuracy

Dataset 1
        Test perf: 0.828704
        Train perf: 1.000000
Dataset 2
        Test perf: 0.692130
        Train perf: 1.000000
Dataset 3
        Test perf: 0.944444
        Train perf: 1.000000
'''
'''
Misclassification rate (i.e. 1 - the accuracies above)

Dataset 1
        Test misc rate: 0.171296
        Train misc rate: 0.000000
Dataset 2
        Test misc rate: 0.307870
        Train misc rate: 0.000000
Dataset 3
        Test misc rate: 0.055556
        Train misc rate: 0.000000
'''

Example #8
monk1train, monk1val = partition(m.monk3, fraction)
print(d.buildTree(m.monk3, m.attributes))


def prunecont(tree, maxp):
    alt = d.allPruned(tree)
    # Use a real list: range() is not assignable in Python 3.
    prunecheck = [0.0] * len(alt)
    maxprune = 0
    indx = 999
    for x in range(len(alt)):
        prunecheck[x] = d.check(alt[x], monk1val)
        if prunecheck[x] >= maxprune:
            maxprune = prunecheck[x]
            indx = x + 1
    if maxprune >= maxp:
        print(maxprune)
        plt.figure()
        plt.plot(range(1, len(alt) + 1), prunecheck)
        plt.title('MONK-1')
        plt.xlabel('Pruning Alternatives')
        plt.ylabel('Pruning Accuracy')
        plt.show()
        return prunecont(alt[indx - 1], maxprune)
    else:
        return tree


finaltree = prunecont(d.buildTree(monk1train, m.attributes), 0)
print(finaltree)
dt.drawTree(finaltree)
Example #9
import monkdata as m
import dtree as d
import drawtree_qt5 as dt

dt.drawTree(d.buildTree(m.monk3, m.attributes))
Example #10
            # print("Next level information gains:")
            # for i in range(6):
            #     gain = dt.averageGain(monk, m.attributes[i])
            #     print("A" + str(i+1) + ": " + str(gain))
    print("")


    best_attribute = dt.bestAttribute(m.monk1, m.attributes)
    for value in best_attribute.values:
        subset = dt.select(m.monk1, best_attribute, value)
        entropy = dt.entropy(subset)
        print("Attribute value: " + str(value))
        for i in range(6):
            gain = dt.averageGain(subset, m.attributes[i])
            print("A" + str(i + 1) + ": " + str(gain))
    print("")

    # Assignment 5
    best_attribute = dt.bestAttribute(m.monk1, m.attributes)
    for value in best_attribute.values:
        subset = dt.select(m.monk1, best_attribute, value)
        best_attribute2 = dt.bestAttribute(subset, m.attributes)
        print(str(best_attribute) + " = " + str(value))
        for value2 in best_attribute2.values:
            subset2 = dt.select(subset, best_attribute2, value2)
            common = dt.mostCommon(subset2)
            print("  " + str(best_attribute2) + "=" + str(value2) + ": " +
                  str(common))
    
    tree = dt.buildTree(m.monk1, m.attributes, 2)
    draw.drawTree(tree)
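
    # As a cross-check for Assignment 5, here is a hypothetical sketch that
    # assembles the depth-2 tree by hand from the same helpers and compares
    # it with dt.buildTree(..., 2) on the test set. build_two_levels is an
    # invented name; unlike the loop above, it excludes the already-used
    # attribute at the second level, matching buildTree's behaviour.
    def build_two_levels(dataset, attributes):
        best = dt.bestAttribute(dataset, attributes)
        remaining = [a for a in attributes if a != best]
        branches = {}
        for value in best.values:
            subset = dt.select(dataset, best, value)
            if not subset:
                branches[value] = dt.TreeLeaf(dt.mostCommon(dataset))
                continue
            best2 = dt.bestAttribute(subset, remaining)
            sub_branches = {}
            for value2 in best2.values:
                subset2 = dt.select(subset, best2, value2)
                # Fall back to the parent's majority class on empty splits.
                sub_branches[value2] = dt.TreeLeaf(
                    dt.mostCommon(subset2 if subset2 else subset))
            branches[value] = dt.TreeNode(best2, sub_branches,
                                          dt.TreeLeaf(dt.mostCommon(subset)))
        return dt.TreeNode(best, branches,
                           dt.TreeLeaf(dt.mostCommon(dataset)))

    manual_tree = build_two_levels(m.monk1, m.attributes)
    print(dt.check(manual_tree, m.monk1test))
    print(dt.check(dt.buildTree(m.monk1, m.attributes, 2), m.monk1test))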