Example #1
import monkdata as m
import dtree
import drawtree as draw


def calcNextTreeLevel():
    selectedAttribute = m.attributes[4]
    s1 = dtree.select(m.monk1, selectedAttribute, 1)
    s2 = dtree.select(m.monk1, selectedAttribute, 2)
    s3 = dtree.select(m.monk1, selectedAttribute, 3)
    s4 = dtree.select(m.monk1, selectedAttribute, 4)

    # Calculate information gain of subsets
    #ASSIGNMENT3(s1)
    #ASSIGNMENT3(s2)
    #ASSIGNMENT3(s3)
    #ASSIGNMENT3(s4)

    mc1 = dtree.mostCommon(s1)
    mc2 = dtree.mostCommon(s2)
    mc3 = dtree.mostCommon(s3)
    mc4 = dtree.mostCommon(s4)
    #print(mc1)
    #print(mc2)
    #print(mc3)
    #print(mc4)

    # Build and draw the full tree for the monk1 training set
    tree = dtree.buildTree(m.monk1, m.attributes)
    print(tree)
    draw.drawTree(tree)
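The commented-out ASSIGNMENT3 calls presumably printed the information gain of every attribute on each subset; a minimal stand-in sketch, assuming dtree.averageGain as used elsewhere on this page:

def printSubsetGains(subset):
    # average information gain of every attribute on the given subset
    for a in m.attributes:
        print(a, dtree.averageGain(subset, a))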
Example #2
import monkdata as m
import dtree as d
import drawtree


def caspersky(dataset):
    print("Assignment 3")
    a = d.bestAttribute(dataset, m.attributes)
    branches = []
    for v in a.values:
        s = d.select(dataset, a, v)
        tf = d.mostCommon(s)
        if tf:
            # majority is True: stop here with a True leaf
            branches.append((v, d.TreeLeaf(True)))
        else:
            # expand one more level on the next best attribute
            a2 = d.bestAttribute(s, m.attributes)
            branches2 = []
            for v2 in a2.values:
                s2 = d.select(s, a2, v2)
                branches2.append((v2, d.TreeLeaf(d.mostCommon(s2))))
            branches.append((v, d.TreeNode(a2, dict(branches2), d.mostCommon(s))))

    drawtree.drawTree(d.TreeNode(a, dict(branches), d.mostCommon(dataset)))
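A possible call, assuming the monk1 data as in the other examples; the hand-built two-level tree can then be compared against the library's depth-limited one:

caspersky(m.monk1)
drawtree.drawTree(d.buildTree(m.monk1, m.attributes, 2))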
Example #3
def buildtree(dataset, remaining_attr, level):

    if level == 2:
        return dtree.TreeLeaf(dtree.mostCommon(dataset))

    max_attr, _ = getMaxGain(dataset, remaining_attr)
    branches_dict = dict([(value, dtree.select(dataset, max_attr, value))
                          for value in max_attr.values])
    _remaining_attr = [a for a in remaining_attr if a != max_attr]

    branches_nodes = {}
    print(max_attr)
    for value, branch_data in branches_dict.items():
        branches_nodes[value] = buildtree(branch_data, _remaining_attr,
                                          level + 1)

    return dtree.TreeNode(max_attr, branches_nodes,
                          dtree.TreeLeaf(dtree.mostCommon(dataset)))
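getMaxGain is not shown in this example; a minimal sketch consistent with how it is called above (returns the attribute with the highest average gain, plus that gain):

def getMaxGain(dataset, attributes):
    # pick the attribute with the highest information gain
    best = max(attributes, key=lambda a: dtree.averageGain(dataset, a))
    return best, dtree.averageGain(dataset, best)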
Example #4
def find_splits(datasets, attributes, depth):
    print(len(datasets))
    if (depth == 0):
        for i, dset in enumerate(datasets):
            print("Class for attribute value " + str(i + 1) + ": " + str(dtree.mostCommon(dset)))
    else:
        for dset in datasets:
            optimal = optimal_attr_split(dset, attributes)
            find_splits(get_data_subsets(dset, attributes, optimal), attributes, depth - 1)
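The helpers optimal_attr_split and get_data_subsets are not part of this excerpt; minimal sketches that match how they are called above:

def optimal_attr_split(dataset, attributes):
    # attribute with the highest information gain
    return dtree.bestAttribute(dataset, attributes)

def get_data_subsets(dataset, attributes, attribute):
    # one subset per value of the chosen attribute
    return [dtree.select(dataset, attribute, v) for v in attribute.values]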
Example #5
File: lab1.py Project: mkufel/ML
def buildTreeCustom(dataset, depth):
    if (depth > 0):
        bestAttr = dt.bestAttribute(dataset, m.attributes)
        print(str(bestAttr), end='')

        # Select datasets splits for each value of the bestAttr
        splits = []
        for value in bestAttr.values:
            splits.append(dt.select(dataset, bestAttr, value))

        for split in splits:
            # If entropy of the split > 0, the split is impure and can be
            # split further; recurse with reduced depth
            if dt.entropy(split) > 0:
                buildTreeCustom(split, depth - 1)
            else:
                print('+' if dt.mostCommon(split) else '-', end='')
    else:
        print('+' if dt.mostCommon(dataset) else '-', end='')
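A possible call, assuming dt and m are the dtree and monkdata aliases used in the neighbouring examples; this prints a compact one-line rendering of a depth-2 monk1 tree:

buildTreeCustom(m.monk1, 2)
print()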
Example #6
def makeTree(dataset, level, attributes):
    if level >= depth:
        return dtree.TreeLeaf(dtree.mostCommon(dataset))
    attr = dtree.bestAttribute(dataset, attributes)
    branches = []
    for val in attr.values:
        subset = dtree.select(dataset, attr, val)
        attributes_left = [a for a in attributes if a != attr]
        if dtree.allPositive(subset):
            node = dtree.TreeLeaf(True)
        elif dtree.allNegative(subset):
            node = dtree.TreeLeaf(False)
        else:
            node = makeTree(subset, level + 1, attributes_left)
        branches.append((val, node))
    return dtree.TreeNode(attr, dict(branches), dtree.mostCommon(dataset))
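makeTree reads the depth limit from a module-level variable named depth; a possible usage, assuming that setup and m = monkdata:

depth = 2  # module-level depth limit read by makeTree
tree = makeTree(m.monk1, 0, m.attributes)
print(tree)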
Example #7
def getLeaves(dataSet, a1, a2):
    a1_domain = m.attributes[a1].values
    a2_domain = m.attributes[a2].values

    for k in a1_domain:
        x = dtree.select(dataSet, m.attributes[a1], k)
        for l in a2_domain:
            y = dtree.select(x, m.attributes[a2], l)
            z = dtree.mostCommon(y)
            print("For " + str(k) + ":" + str(l) + ", " + "most common = " + str(z))
Example #9
def PRINT_TREE_AT_LEVEL_2():
    # A5
    print(" ")
    print("LEVEL 1:")
    print(m.attributes[4])
    Att = [None] * 4
    for value in range(1, 5):
        Att[value - 1] = select(m.monk1, m.attributes[4], value)

    print("LEVEL 2:")
    for A in Att:
        tmp = bestAttribute(A, m.attributes)
        print(tmp)
        # print the majority class in each branch of the chosen attribute
        for value in tmp.values:
            print(mostCommon(select(A, tmp, value)))
    print(" ")
    t = buildTree(m.monk1, m.attributes)
    drawTree(t)
Example #10
def split(node):
    #splitting
    sub_set_A5_value_1_m1 = d.select(m.monk1, node, 1)
    sub_set_A5_value_not_1_m1 = d.select(m.monk1, node, 2) + d.select(
        m.monk1, node, 3) + d.select(m.monk1, node, 4) + d.select(
            m.monk1, node, 5)

    #calculating gain to figure out which attribute to use in each of the next nodes
    information_gain_left = find_information_gain(sub_set_A5_value_1_m1,
                                                  m.attributes)
    information_gain_right = find_information_gain(sub_set_A5_value_not_1_m1,
                                                   m.attributes)
    information_gain = max(max(information_gain_left),
                           max(information_gain_right))

    #classifying the most common result in each sub tree
    majority_class_left = d.mostCommon(sub_set_A5_value_1_m1)
    majority_class_right = d.mostCommon(sub_set_A5_value_not_1_m1)

    print('left: ', majority_class_left)
    print('right: ', majority_class_right)
    print('information gain: ', information_gain)
Example #11
def Tree(dataset, attributes, maxdepth=3):
    def Branch(dataset, default, attributes):
        if not dataset:
            return dtree.TreeLeaf(default)
        if dtree.allPositive(dataset):
            return dtree.TreeLeaf(True)
        if dtree.allNegative(dataset):
            return dtree.TreeLeaf(False)
        return Tree(dataset, attributes, maxdepth - 1)

    default = dtree.mostCommon(dataset)
    if maxdepth < 1:
        return dtree.TreeLeaf(default)
    a = dtree.bestAttribute(dataset, attributes)
    attributesLeft = [x for x in attributes if x != a]
    branches = [(v, Branch(dtree.select(dataset, a, v), default,
                           attributesLeft)) for v in a.values]
    return dtree.TreeNode(a, dict(branches), default)
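A possible call, assuming m = monkdata; this mirrors dtree.buildTree with a depth limit:

print(Tree(m.monk1, m.attributes, maxdepth=2))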
Example #12
def buildTreeRec(dataset, attributes, depthtodo):
    defaultvalue = d.mostCommon(dataset)
    if d.allPositive(dataset):
        return d.TreeLeaf(True)
    elif d.allNegative(dataset):
        return d.TreeLeaf(False)
    elif (depthtodo <= 0):
        return d.TreeLeaf(defaultvalue)
    else:
        gainziplist = calculateGainTuplesForAllAttributes(dataset, attributes)
        maxgain, maxgainattribute = getTupleWithMaxGainValue(gainziplist)

        subnodes = []
        for attributevalue in attributes[maxgainattribute].values:
            newdataset = d.select(dataset, attributes[maxgainattribute], attributevalue)
            subnode = buildTreeRec(newdataset, attributes, depthtodo - 1)
            subnodes.append((attributevalue, subnode))

        return d.TreeNode(attributes[maxgainattribute], dict(subnodes), defaultvalue)
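The two helpers are not shown; minimal sketches consistent with how their results are used above (the second tuple element is an index into attributes):

def calculateGainTuplesForAllAttributes(dataset, attributes):
    # (gain, attribute index) for every attribute
    return [(d.averageGain(dataset, a), i) for i, a in enumerate(attributes)]

def getTupleWithMaxGainValue(gaintuples):
    # tuple with the highest gain; the index breaks ties
    return max(gaintuples)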
Example #13
def calc_next_level():
  #print("\nAverage gain when a5 is chosen")
  print("\nA5\t  a1\t\t  a2\t\t  a3\t\t  a4\t\t  a5\t\t  a6")
  s = "A5("
  for val in data.attributes[4].values:
    subset = dt.select(data.monk1, data.attributes[4], val)
    t = "\t"
    for attr in data.attributes:
      t = t + "%.6f\t" % (dt.averageGain(subset, attr))
    print(val, t)
    best = dt.bestAttribute(subset, data.attributes)
    s = s + best.name + "("
    #print("best attribute:", best.name)
    for value in best.values:
      #print("choose:", value, "mostCommon:", dt.mostCommon(dt.select(subset, best, value)))
      if dt.mostCommon(dt.select(subset, best, value)):
        s = s + "+"
      else:
        s = s + "-"
    s = s + ")"
  s = s + ")"
  print("\nOur tree:\t", s)
  print("Build tree:\t", dt.buildTree(data.monk1, data.attributes, 2))
Example #14
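This excerpt starts mid-script; the a5_* subsets are presumably the monk1 partitions on attribute a5, created earlier along these lines (hypothetical reconstruction, with d = dtree and m = monkdata):

a5_1 = d.select(m.monk1, m.attributes[4], 1)
a5_2 = d.select(m.monk1, m.attributes[4], 2)
a5_3 = d.select(m.monk1, m.attributes[4], 3)
a5_4 = d.select(m.monk1, m.attributes[4], 4)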
print("subset a5_4 IG")
for i in range(6):
    print(d.averageGain(a5_4, m.attributes[i]))

a5_2_a4_1 = d.select(a5_2, m.attributes[3], 1)
a5_2_a4_2 = d.select(a5_2, m.attributes[3], 2)
a5_2_a4_3 = d.select(a5_2, m.attributes[3], 3)

a5_3_a6_1 = d.select(a5_3, m.attributes[5], 1)
a5_3_a6_2 = d.select(a5_3, m.attributes[5], 2)

a5_4_a1_1 = d.select(a5_4, m.attributes[0], 1)
a5_4_a1_2 = d.select(a5_4, m.attributes[0], 2)
a5_4_a1_3 = d.select(a5_4, m.attributes[0], 3)

print(d.mostCommon(a5_1))
print(d.mostCommon(a5_2))
print(d.mostCommon(a5_3))
print(d.mostCommon(a5_4))
print(" ")
print(d.mostCommon(a5_2_a4_1))
print(d.mostCommon(a5_2_a4_2))
print(d.mostCommon(a5_2_a4_3))
print(" ")
print(d.mostCommon(a5_3_a6_1))
print(d.mostCommon(a5_3_a6_2))
print(" ")
print(d.mostCommon(a5_4_a1_1))
print(d.mostCommon(a5_4_a1_2))
print(d.mostCommon(a5_4_a1_3))
Example #15
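The list* variables are not defined in this excerpt; they are presumably the monk1 subsets for a5 = 2, 3, 4 (hypothetical reconstruction, consistent with attribute a5 being skipped in the gain printouts):

list2 = dt.select(m.monk1, m.attributes[4], 2)
list3 = dt.select(m.monk1, m.attributes[4], 3)
list4 = dt.select(m.monk1, m.attributes[4], 4)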
print("3", dt.averageGain(list2, m.attributes[3]))
print("5", dt.averageGain(list2, m.attributes[5]))
print("list3")
print("0", dt.averageGain(list3, m.attributes[0]))
print("1", dt.averageGain(list3, m.attributes[1]))
print("2", dt.averageGain(list3, m.attributes[2]))
print("3", dt.averageGain(list3, m.attributes[3]))
print("5", dt.averageGain(list3, m.attributes[5]))
print("list4")
print("0", dt.averageGain(list4, m.attributes[0]))
print("1", dt.averageGain(list4, m.attributes[1]))
print("2", dt.averageGain(list4, m.attributes[2]))
print("3", dt.averageGain(list4, m.attributes[3]))
print("5", dt.averageGain(list4, m.attributes[5]))

common = dt.mostCommon(dt.select(list4, m.attributes[0], 1))
common2 = dt.mostCommon(dt.select(list4, m.attributes[0], 2))
common3 = dt.mostCommon(dt.select(list4, m.attributes[0], 3))

print("attribute val 1 is mostly ", common)
print("attribute val 2 is mostly ", common2)
print("attribute val 3 is mostly ", common3)

t1 = dt.buildTree(dataset=m.monk1, attributes=m.attributes)
#t1 = dt.buildTree(dataset=m.monk1, attributes=m.attributes, maxdepth=5)
#draw.drawTree(t1)

t2 = dt.buildTree(dataset=m.monk2, attributes=m.attributes)
t3 = dt.buildTree(dataset=m.monk3, attributes=m.attributes)
Example #16
def main():
    # Assignment 1
    print("Assignment 1")
    monks = [monkdata.monk1, monkdata.monk2, monkdata.monk3]
    monk_tests = [monkdata.monk1test, monkdata.monk2test, monkdata.monk3test]
    entropies = [dtree.entropy(monk) for monk in monks]
    print("*** Monk1 entropy: ", entropies[0])
    print("*** Monk2 entropy: ", entropies[1])
    print("*** Monk3 entropy: ", entropies[2])

    # Assignment 3
    print(" ")
    print("Assignment 3")
    attributes = monkdata.attributes
    info_gain1 = info_gain(monks[0], attributes)
    info_gain2 = info_gain(monks[1], attributes)
    info_gain3 = info_gain(monks[2], attributes)
    print("*** Monk1 information gain for attribute:",
          ['%.5f' % x for x in info_gain1])
    print("*** Monk2 information gain for attribute:",
          ['%.5f' % x for x in info_gain2])
    print("*** Monk3 information gain for attribute:",
          ['%.5f' % x for x in info_gain3])

    # Assignment 5
    print("")
    print("Assignment 5")
    print("*** Attribute:",
          np.argmax(info_gain1) + 1, "maximizes info gain for MONK1 dataset")
    print("*** Attribute:",
          np.argmax(info_gain2) + 1, "maximizes info gain for MONK2 dataset")
    print("*** Attribute:",
          np.argmax(info_gain3) + 1, "maximizes info gain for MONK3 dataset")
    print("***")
    max0 = np.argmax(info_gain1)  # attribute of first split
    attributes_left = [
        attrib for attrib in attributes if attrib != attributes[max0]
    ]
    print("*** 1) Attributes the next nodes should be tested on: ",
          attributes_left)

    # Attributes to split on in second step
    splits = [
        np.argmax(
            info_gain(dtree.select(monks[0], attributes[max0], value),
                      attributes)) + 1 for value in attributes[max0].values
    ]
    print("*** 2) Second split is on the attriburtes: ", splits)

    # Decision after second split: majority class in each branch of the
    # second-level attributes (splits holds 1-based attribute numbers)
    subsets = [
        dtree.select(monks[0], attributes[max0], value)
        for value in attributes[max0].values
    ]
    print("*** 3) Assignment after second split: ",
          [[dtree.mostCommon(dtree.select(subset, attributes[s - 1], v))
            for v in attributes[s - 1].values]
           for subset, s in zip(subsets, splits)])
    print("***")

    print("*** Train and test set errors")
    t1 = dtree.buildTree(monkdata.monk1, monkdata.attributes)
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monkdata.monk1),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    t2 = dtree.buildTree(monkdata.monk2, monkdata.attributes)
    print("*** Monk2:", "Etrain=", 1 - dtree.check(t2, monkdata.monk2),
          " Etest=", 1 - dtree.check(t2, monkdata.monk2test))
    t3 = dtree.buildTree(monkdata.monk3, monkdata.attributes)
    print("*** Monk3:", "Etrain=", 1 - dtree.check(t3, monkdata.monk3),
          " Etest=", 1 - dtree.check(t3, monkdata.monk3test))

    import drawtree_qt5
    #print(t1)  # tree in text form
    #drawtree_qt5.drawTree(t1)  # uncomment to visualize the decision tree

    # Assignment 7
    print("")
    print("Assignment 7")

    # Pruning for the example of monk1
    monk1train, monk1val = partition(monkdata.monk1, 0.9)
    t1 = dtree.buildTree(monk1train,
                         monkdata.attributes)  # tree trained on monk1train
    t11 = prune(t1, monk1val)  # pruned tree
    print("*** Monk1 unpruned:", "Eval=", 1 - dtree.check(t1, monk1val),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    print("*** Monk1 pruned:", "Eval=", 1 - dtree.check(t11, monk1val),
          " Etest=", 1 - dtree.check(t11, monkdata.monk1test))

    # Statistic information for different fraction for monk1 and monk3
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

    # Evaluation of Monk1
    eval1 = [
        evaluate_fraction(monkdata.monk1, frac, monkdata.monk1test)
        for frac in fraction
    ]
    means1 = [np.mean(x) for x in eval1]
    vars1 = [np.var(x) for x in eval1]

    plt.figure(1)
    plt.subplot(121)
    plt.plot(fraction, means1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk1')

    # Evaluation of Monk3
    eval3 = [
        evaluate_fraction(monkdata.monk3, frac, monkdata.monk3test)
        for frac in fraction
    ]
    means3 = [np.mean(x) for x in eval3]
    vars3 = [np.var(x) for x in eval3]

    plt.figure(2)
    plt.subplot(121)
    plt.plot(fraction, means3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk3')
    plt.show()
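The helper functions used by main() are not part of this excerpt; minimal sketches consistent with the lab's dtree API (allPruned, check, buildTree) — a hypothetical reconstruction, not the author's code:

import random

def info_gain(dataset, attributes):
    # average information gain of every attribute
    return [dtree.averageGain(dataset, a) for a in attributes]

def partition(data, fraction):
    # random split into a training part and a validation part
    ldata = list(data)
    random.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]

def prune(tree, validation):
    # greedily replace subtrees with leaves while validation accuracy
    # does not drop
    best = tree
    improved = True
    while improved:
        improved = False
        for candidate in dtree.allPruned(best):
            if dtree.check(candidate, validation) >= dtree.check(best, validation):
                best = candidate
                improved = True
                break
    return best

def evaluate_fraction(data, fraction, testdata):
    # test errors of pruned trees over repeated random partitions
    errors = []
    for _ in range(100):
        train, val = partition(data, fraction)
        pruned = prune(dtree.buildTree(train, monkdata.attributes), val)
        errors.append(1 - dtree.check(pruned, testdata))
    return errors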
Example #17
uncertainty the most. Thus, it should be used for splitting at the root node.
"""


"5 BUILDING DECISION TREES"
sel = []
for i in range(4):  # split the data into subsets according to attribute a5
    sel.append(t.select(m.monk1, m.attributes[4], m.attributes[4].values[i]))

# print(sel)
sub = []
mC = []
for subset in sel:
    for i in [0, 1, 2, 3, 5]:
        sub.append(t.averageGain(subset, m.attributes[i]))
    mC.append(t.mostCommon(subset))

    # print(sub)
    sub = []

"Highest information gain on second level of the tree # 2 - A4 , 3 - A6 , 4 - A1 #"

"""Assignment 3"""
tree1 = t.buildTree(m.monk1, m.attributes)
tree2 = t.buildTree(m.monk2, m.attributes)
tree3 = t.buildTree(m.monk3, m.attributes)

draw.drawTree(tree1)
# draw.drawTree(tree2)
# draw.drawTree(tree3)
Example #18
# ATTRIBUTE A2 IN MONK-3
# A2 HAS VALUES {1,2,3}

monk3_1 = select(m.monk3, m.attributes[1], 1)  # MONK-3 dataset where a2=1
monk3_2 = select(m.monk3, m.attributes[1], 2)  # MONK-3 dataset where a2=2
monk3_3 = select(m.monk3, m.attributes[1], 3)  # MONK-3 dataset where a2=3


# INFORMATION GAIN CALCULATION AFTER SPLITTING
# FOR MONK-1 SPLITTINGS
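The monk1_* subsets are not defined in this excerpt; presumably the MONK-1 partitions on a5 (hypothetical reconstruction):

monk1_1 = select(m.monk1, m.attributes[4], 1)
monk1_2 = select(m.monk1, m.attributes[4], 2)
monk1_3 = select(m.monk1, m.attributes[4], 3)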

info_gain1_1=[0]*6
for i in range(6):
    info_gain1_1[i]=averageGain(monk1_1,m.attributes[i])
    print("Information gain in MONK-1 tree for a5=1 for a{} is {}".format(i+1,info_gain1_1[i]))
m1_1=mostCommon(monk1_1)
print("The most common output in MONK-1 for a5=1 is {}".format(m1_1))

info_gain1_2=[0]*6
for i in range(6):
    info_gain1_2[i]=averageGain(monk1_2,m.attributes[i])
    print("Information gain in MONK-1 tree for a5=2 for a{} is {}".format(i+1,info_gain1_2[i]))
m1_2=mostCommon(monk1_2)
print("The most common output in MONK-1 for a5=2 is {}".format(m1_2))

info_gain1_3=[0]*6
for i in range(6):
    info_gain1_3[i]=averageGain(monk1_3,m.attributes[i])
    print("Information gain in MONK-1 tree for a5=3 for a{} is {}".format(i+1,info_gain1_3[i]))
m1_3=mostCommon(monk1_3)
print("The most common output in MONK-1 for a5=3 is {}".format(m1_3))
Example #19
            print("Entropy " + str(value) + ": " + str(entropy))
            # print("Next level information gains:")
            # for i in range(6):
            #     gain = dt.averageGain(monk, m.attributes[i])
            #     print("A" + str(i+1) + ": " + str(gain))
    print("")


    best_attribute = dt.bestAttribute(m.monk1, m.attributes)
    for value in best_attribute.values:
        subset = dt.select(m.monk1, best_attribute, value)
        entropy = dt.entropy(subset)
        print("Attribute value:" + str(value))
        for i in range(6):
            gain = dt.averageGain(subset, m.attributes[i])
            print("A" + str(i+1) + ": " + str(gain))
    print("")

    # Assignment 5
    best_attribute = dt.bestAttribute(m.monk1, m.attributes)
    for value in best_attribute.values:
        subset = dt.select(m.monk1, best_attribute, value)
        best_attribute2 = dt.bestAttribute(subset, m.attributes)
        print(str(best_attribute) + " = " + str(value))
        for value2 in best_attribute2.values:
            subset2 = dt.select(subset, best_attribute2, value2)
            common = dt.mostCommon(subset2)
            print("  " + str(best_attribute2) + "=" + str(value2) + ": " + str(common))

    tree = dt.buildTree(m.monk1, m.attributes, 2)
    draw.drawTree(tree)
Example #20
import monkdata as m
import dtree

foo = dtree.select(m.monk1, m.attributes[4], 3)
print('-- information gain in the monk-1 subset where a5 = 3: --')
print('a_1: ' + str(dtree.averageGain(foo, m.attributes[0])))
print('a_2: ' + str(dtree.averageGain(foo, m.attributes[1])))
print('a_3: ' + str(dtree.averageGain(foo, m.attributes[2])))
print('a_4: ' + str(dtree.averageGain(foo, m.attributes[3])))
print('a_6: ' + str(dtree.averageGain(foo, m.attributes[5])))

foo = dtree.select(m.monk1, m.attributes[4], 1)
print('-- majority class of the a_5 = 1 subset: --')
print(dtree.mostCommon(foo))
Example #21
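This excerpt begins inside a loop over the attributes; the missing head is presumably along these lines (hypothetical reconstruction):

for x in mdata.attributes:
    ag1 = dtree.averageGain(mdata.monk1, x)
    ag2 = dtree.averageGain(mdata.monk2, x)
    ag3 = dtree.averageGain(mdata.monk3, x)
    print("Average gain in dataset monk1 and attribute " + str(x.name) +
          " is %.6f" % ag1)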
    print("Average gain in dataset monk2 and attribute " + str(x.name) +
          " is %.6f" % ag2)
    print("Average gain in dataset monk3 and attribute " + str(x.name) +
          " is %.6f" % ag3)

print("\n")

for x in range(1, 5):
    highest_avg = 0
    highest_attribute = 0
    s = dtree.select(mdata.monk1, mdata.attributes[4], x)
    for y in mdata.attributes:
        avg_g = dtree.averageGain(s, y)
        print("Average gain in dataset monk1 and subset s" + str(x) +
              " and attribute " + str(y.name) +
              " is %.6f. Majority: " % avg_g + str(dtree.mostCommon(s)))
        if (avg_g > highest_avg):
            highest_avg = avg_g
            highest_attribute = int(y.name[1])

    print("Highest avg: %.6f in attr: " % highest_avg + str(highest_attribute))
    for z in range(
            1,
            len(mdata.attributes[int(highest_attribute - 1)].values) + 1):
        s2 = dtree.select(s, mdata.attributes[int(highest_attribute - 1)], z)
        print(dtree.mostCommon(s2))

    print("\n")

t1 = dtree.buildTree(mdata.monk1, mdata.attributes)
print("Test data check: %.6f\n" % dtree.check(t1, mdata.monk1test))
Example #22
import monkdata as m
import dtree as dt
import drawtree as draw

entropy = dt.entropy(m.monk1)
best_gain = 0
for attribute in m.attributes:
    gain = dt.averageGain(m.monk1, attribute)
    if gain > best_gain:
        best_gain = gain
        best_attribute = attribute

for v in best_attribute.values:
    subset = dt.select(m.monk1, best_attribute, v)
    majority_class = dt.mostCommon(subset)

values = {
    v: dt.mostCommon(dt.select(m.monk1, best_attribute, v))
    for v in best_attribute.values
}
print(best_attribute, values)
draw.drawTree(dt.buildTree(m.monk1, m.attributes, 2))
Example #23
def printNumTrueFalse(datasets):
    # For a list of datasets, print the number of true and false
    for i in range(0, len(datasets)):
        print("Monk"+str(i+1)+" "+
              "[#tot="+str(len(datasets[i]))+"] "+
              "[#true="+str(getNumTrue(datasets[i]))+"] "+
              "[#false="+str(getNumFalse(datasets[i]))+"]")
#Main
dataset = m.monk2
available = [True]*len(m.attributes)
firstSplit = getBestAttribute(dataset, m.attributes, available)
print("Firstsplit = "+str(firstSplit))
print("-----")
available[firstSplit] = False
sets = []
for i in range(0, len(m.attributes[firstSplit].values)):
    sets.append(d.select(dataset, m.attributes[firstSplit], m.attributes[firstSplit].values[i]))

for i in range(0, len(sets)):
    subSets = []
    splitOn = getBestAttribute(sets[i], m.attributes, available)
    print("Second split = "+str(splitOn))
    for j in range(0, len(m.attributes[splitOn].values)):
        subSets.append(d.select(sets[i], m.attributes[splitOn], m.attributes[splitOn].values[j]))
    for s in subSets:
        print(d.mostCommon(s))

    print("----")

Example #24
monkData = {
    'monk1_2': {
        'data': monk1_2,
        'branch': 3
    },
    'monk1_3': {
        'data': monk1_3,
        'branch': 5
    },
    'monk1_4': {
        'data': monk1_4,
        'branch': 0
    }
}
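The monk1_* subsets are presumably created earlier in the file as the monk1 partitions on a5 (hypothetical reconstruction, matching the branch attributes 3, 5, 0 above):

monk1_1 = d.select(m.monk1, m.attributes[4], 1)
monk1_2 = d.select(m.monk1, m.attributes[4], 2)
monk1_3 = d.select(m.monk1, m.attributes[4], 3)
monk1_4 = d.select(m.monk1, m.attributes[4], 4)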

monk1_maj = d.mostCommon(m.monk1)
monk1_1_maj = d.mostCommon(monk1_1)
print('majority class for monk1: ' + str(monk1_maj))
print('majority class for monk1_1: ' + str(monk1_1_maj))

for mo in monkData:
    moName = mo
    attrNo = monkData[mo]['branch']
    monk_maj = d.mostCommon(monkData[mo]['data'])
    print('majority class for ' + mo + ': ' + str(monk_maj))
    for attrVal in m.attributes[attrNo].values:
        monkTmp = d.select(monkData[mo]['data'], m.attributes[attrNo], attrVal)
        monk_maj = d.mostCommon(monkTmp)
        print('majority class for ' + mo + ', partition ' + str(attrVal) +
              ': ' + str(monk_maj))
Example #25
import monkdata as m
import dtree as d
#import drawtree

print("3. Building decision tree: \n")

print("Subset division of MONK-1 at attribute 5: \n")
subsets = []
for x in range(0, len(m.attributes[4].values)):
    subsets.append(d.select(m.monk1, m.attributes[4], x+1))

for subset in subsets:
    maxgain = 0
    bestatr = 0
    print("Value: %d" % (subsets.index(subset) + 1))
    print("Most common: " + str(d.mostCommon(subset)))
    for x in range(0, len(m.attributes)):
        gain = d.averageGain(subset, m.attributes[x])
        print("Attribute A%d: %f" % (x+1, gain))
        if gain > maxgain:
            maxgain = gain
            bestatr = x
    print("Attribute with best information gain: A%d \n" % (bestatr + 1))


print("MONK-1:")
t = d.buildTree(m.monk1, m.attributes)
print("Testing set error %f: " % (1 - d.check(t, m.monk1test)))
print("Training set error %f: \n" % (1 - d.check(t, m.monk1)))
Example #26
for i, dataset in enumerate(train):
    print("")
    print("Average gain for monk{} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        print("a{} = {}".format(j + 1, dt.averageGain(dataset, attribute)))

monk1a5 = [dt.select(monk.monk1, monk.attributes[4], value)
           for value in monk.attributes[4].values]

for i, monk1 in enumerate(monk1a5):
    print("")
    print("Average gain for monk1 where a5 = {} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        if j != 4:
            print("a{} = {}".format(j + 1, dt.averageGain(monk1, attribute)))
    print("Majority class = {}".format(dt.mostCommon(monk1)))


# Building the decision tree.
tree1 = dt.buildTree(monk.monk1, monk.attributes)
tree2 = dt.buildTree(monk.monk2, monk.attributes)
tree3 = dt.buildTree(monk.monk3, monk.attributes)
trees = [tree1, tree2, tree3]

# Drawing the decision tree.
#drawtree.drawTree(tree)

print("")
for i, (dataset1, dataset2, tree) in enumerate(zip(train, test, trees)):
    print("Error for Monk{} on train = {} and on test = {}.".format(
        i + 1, 1 - dt.check(tree, dataset1), 1 - dt.check(tree, dataset2)))
Example #27
# Ass. 5
infoGains2 = numpy.zeros((4, 6))
subsets = [0 for x in range(0, 4)]
for i in range(1, 5):
    subsets[i - 1] = dtree.select(m.monk1, m.attributes[4], i)
    for j in range(0, 6):
        infoGains2[i - 1, j] = dtree.averageGain(subsets[i - 1],
                                                 m.attributes[j])

print("Information gains for each subset:\n", infoGains2)

# subsets[1] (a5 == 2): split further on a4
subsets1 = [0 for x in range(0, 3)]
for i in range(1, 4):
    subsets1[i - 1] = dtree.select(subsets[1], m.attributes[3], i)
    print(dtree.mostCommon(subsets1[i - 1]))

# subsets[2] (a5 == 3): split further on a6
subsets2 = [0 for x in range(0, 2)]
for i in range(1, 3):
    subsets2[i - 1] = dtree.select(subsets[2], m.attributes[5], i)
    print(dtree.mostCommon(subsets2[i - 1]))

# subsets[3] (a5 == 4): split further on a1
subsets3 = [0 for x in range(0, 3)]
for i in range(1, 4):
    subsets3[i - 1] = dtree.select(subsets[3], m.attributes[0], i)
    print(dtree.mostCommon(subsets3[i - 1]))

print(dtree.buildTree(m.monk1, m.attributes, 2))  # problem! inconsistent
#drawtree_qt5.drawTree(dtree.buildTree(m.monk1, m.attributes, 2))

t = dtree.buildTree(m.monk1, m.attributes)
print "Gain Monk1 a5(3) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[0]))
print "Gain Monk1 a5(3) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[1]))
print "Gain Monk1 a5(3) - a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[2]))
print "Gain Monk1 a5(3) - a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[3]))
print "Gain Monk1 a5(3) - a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[4]))
print "Gain Monk1 a5(3) - a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[5]))

print "Gain Monk1 a5(4) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[0]))
print "Gain Monk1 a5(4) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[1]))
print "Gain Monk1 a5(4) - a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[2]))
print "Gain Monk1 a5(4) - a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[3]))
print "Gain Monk1 a5(4) - a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[4]))
print "Gain Monk1 a5(4) - a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 4),m.attributes[5]))

selec1 = tree.select(m.monk1, m.attributes[4], 4)
print "Most Common Level2 Monk1(1): " + str(tree.mostCommon(tree.select(selec1,m.attributes[1],1)))
print "Most Common Level2 Monk1(2): " + str(tree.mostCommon(tree.select(selec1,m.attributes[1],2)))
print "Most Common Level2 Monk1(3): " + str(tree.mostCommon(tree.select(selec1,m.attributes[1],3)))

print "Monk 1 Etrain : " + str(tree.check(tree.buildTree(m.monk1, m.attributes), m.monk1))
print "Monk 1 Etest  : " + str(tree.check(tree.buildTree(m.monk1, m.attributes), m.monk1test))
print "Monk 2 Etrain : " + str(tree.check(tree.buildTree(m.monk2, m.attributes), m.monk2))
print "Monk 2 Etest  : " + str(tree.check(tree.buildTree(m.monk2, m.attributes), m.monk2test))
print "Monk 3 Etrain : " + str(tree.check(tree.buildTree(m.monk3, m.attributes), m.monk3))
print "Monk 3 Etest  : " + str(tree.check(tree.buildTree(m.monk3, m.attributes), m.monk3test))

print "ID3 built tree : \n"
tree1 = tree.buildTree(m.monk1,m.attributes,2)
#d.drawTree(tree1)

#x = [0.3,0.4,0.5,0.6,0.7,0.8]
Example #29
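This excerpt begins mid-script; the partitions and gain lists are presumably set up earlier along these lines (hypothetical reconstruction, consistent with the A5(...A4...A6...A1...) tree printed below):

gain_partition1 = []
gain_partition2 = []
gain_partition3 = []
gain_partition4 = []
partition1 = dt.select(m.monk1, m.attributes[4], 1)
partition2 = dt.select(m.monk1, m.attributes[4], 2)
partition3 = dt.select(m.monk1, m.attributes[4], 3)
partition4 = dt.select(m.monk1, m.attributes[4], 4)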
for x in range(0, 6):
  gain_partition1.append(dt.averageGain(partition1,m.attributes[x]))
  gain_partition2.append(dt.averageGain(partition2,m.attributes[x]))
  gain_partition3.append(dt.averageGain(partition3,m.attributes[x]))
  gain_partition4.append(dt.averageGain(partition4,m.attributes[x]))

print "Dataset\tA1\t\tA2\t\tA3\t\tA4\t\tA5\t\tA6"
print "Part 1: ","\t".join(["%.7f"%y for y in gain_partition1])
print "Part 2: ","\t".join(["%.7f"%y for y in gain_partition2])
print "Part 3: ","\t".join(["%.7f"%y for y in gain_partition3])
print "Part 4: ","\t".join(["%.7f"%y for y in gain_partition4])

print
print "Own tree"
print "A5(",dt.mostCommon(partition1),"A4(",dt.mostCommon(partition2),")","A6",dt.mostCommon(partition3),")","A1(",dt.mostCommon(partition4), "))" 

print
print "BuildTree function"
print dt.buildTree(m.monk1,m.attributes,2)
#draw.drawTree(dt.buildTree(m.monk1,m.attributes,2))


print
print "Building Trees"
t1 = dt.buildTree(m.monk1,m.attributes)
t2 = dt.buildTree(m.monk2,m.attributes)
t3 = dt.buildTree(m.monk3,m.attributes)
print "Checking Full Tree"
print "Dataset\tE train\t\tE test"
print "Monk1\t","%.7f"%dt.check(t1,m.monk1), "\t%.7f"%dt.check(t1,m.monk1test)
Example #30
print("monk-2: %f %f %f %f %f %f" % (
    d.averageGain(m.monk2, m.attributes[0]), d.averageGain(m.monk2, m.attributes[1]),
    d.averageGain(m.monk2, m.attributes[2]), d.averageGain(m.monk2, m.attributes[3]),
    d.averageGain(m.monk2, m.attributes[4]), d.averageGain(m.monk2, m.attributes[5])
))

print("monk-3: %f %f %f %f %f %f" % (
    d.averageGain(m.monk3, m.attributes[0]), d.averageGain(m.monk3, m.attributes[1]),
    d.averageGain(m.monk3, m.attributes[2]), d.averageGain(m.monk3, m.attributes[3]),
    d.averageGain(m.monk3, m.attributes[4]), d.averageGain(m.monk3, m.attributes[5])
))

monk1_subset = d.select(m.monk1, m.attributes[4], 3)

print(len(monk1_subset))
print(d.mostCommon(monk1_subset))
monk1_subset_tree = d.buildTree(monk1_subset, m.attributes, 5)
print(monk1_subset_tree)

t1 = d.buildTree(m.monk1, m.attributes)
print(d.check(t1, m.monk1test))
print(d.check(t1, m.monk1))

t2 = d.buildTree(m.monk2, m.attributes)
print(d.check(t2, m.monk2test))
print(d.check(t2, m.monk2))

t3 = d.buildTree(m.monk3, m.attributes)
print(d.check(t3, m.monk3test))
print(d.check(t3, m.monk3))
Example #31
def mostCommonAfterSplit(dataset, attributeNumber):
    return [dtree.mostCommon(x) for x in splitDataset(dataset, attributeNumber)]
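splitDataset is not shown here; a minimal sketch matching the call above (one subset per value of the chosen attribute, assuming m = monkdata):

def splitDataset(dataset, attributeNumber):
    a = m.attributes[attributeNumber]
    return [dtree.select(dataset, a, v) for v in a.values]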
Example #32
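This excerpt omits the first split; selected is presumably the best attribute found for MONK-1, computed with the same gains-dict pattern used below (hypothetical reconstruction):

gains = dict(
    zip(md.attributes, [dt.averageGain(md.monk1, a) for a in md.attributes]))
selected = max(gains, key=gains.get)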
print("Best attribute for split:", max(gains, key=gains.get))
print()

gains = dict(
    zip(md.attributes, [dt.averageGain(md.monk3, a) for a in md.attributes]))
print("Information gains MONK-3:", gains)
print("Best attribute for split:", max(gains, key=gains.get))
print()

# building tree
print("\n----------DECISION TREE MONK-1 DEPTH 2----------\n")
for v in selected.values:
    print(selected, "=", v)
    subset = dt.select(md.monk1, selected, v)
    if dt.allPositive(subset) or dt.allNegative(subset):
        print(selected, "=", v, "->", dt.mostCommon(subset))
    else:
        attributes_left = [a for a in md.attributes if a != selected]
        gains = dict(
            zip(attributes_left,
                [dt.averageGain(subset, a) for a in attributes_left]))
        print("Information gains:", gains)
        best = max(gains, key=gains.get)
        print("Best attribute for split:", best)
        for v2 in best.values:
            print(best, "=", v2, "->",
                  dt.mostCommon(dt.select(subset, best, v2)))
    print()
# dr.drawTree(dt.buildTree(md.monk1, md.attributes, 2))
print(dt.buildTree(md.monk1, md.attributes, 2))