Example #1
def calcNextTreeLevel():
    selectedAttribute = m.attributes[4]
    s1 = dtree.select(m.monk1, selectedAttribute, 1)
    s2 = dtree.select(m.monk1, selectedAttribute, 2)
    s3 = dtree.select(m.monk1, selectedAttribute, 3)
    s4 = dtree.select(m.monk1, selectedAttribute, 4)

    # Calculate information gain of subsets
    #ASSIGNMENT3(s1)
    #ASSIGNMENT3(s2)
    #ASSIGNMENT3(s3)
    #ASSIGNMENT3(s4)

    mc1 = dtree.mostCommon(s1)
    mc2 = dtree.mostCommon(s2)
    mc3 = dtree.mostCommon(s3)
    mc4 = dtree.mostCommon(s4)
    #print(mc1)
    #print(mc2)
    #print(mc3)
    #print(mc4)

    tree = dtree.buildTree(m.monk1, m.attributes)
    print(tree)
    draw.drawTree(tree)
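A minimal sketch of the commented-out "information gain of subsets" step above, assuming the same monkdata/dtree imports and the dtree.averageGain function used throughout these examples:

# illustrative only: average gain of every attribute within each a5 subset of monk1
for value in m.attributes[4].values:
    subset = dtree.select(m.monk1, m.attributes[4], value)
    gains = [dtree.averageGain(subset, a) for a in m.attributes]
    print("a5 =", value, ["%.5f" % g for g in gains])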
Example #2
def getLeaves(dataSet, a1, a2):
    a1_domain = m.attributes[a1].values
    a2_domain = m.attributes[a2].values

    for k in a1_domain:
        x = dtree.select(dataSet, m.attributes[a1], k)
        for l in a2_domain:
            y = dtree.select(x, m.attributes[a2], l)
            z = dtree.mostCommon(y)
            print("For " + str(k) + ":" + str(l) + ", " + "most common = " + str(z))
Example #3
def getSubsets(dataset, n):
    values = m.attributes[n].values
    subsets = []
    for val in values:
        subsets.append(dtree.select(dataset, m.attributes[n], val))
        # print(dtree.select(dataset, m.attributes[n], val))
    return subsets
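A hedged usage sketch of the corrected helper:

subsets = getSubsets(m.monk1, 4)
print([len(s) for s in subsets])  # sizes of the four a5 subsets of monk1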
Example #4
def entropy_matrix(datasets, attribute_index, max_att_list):
    entropy_matrix = np.zeros(
        (len(datasets), len(m.attributes[attribute_index].values)))
    for idx, dataset in enumerate(datasets):
        att = m.attributes[max_att_list[idx]]
        for j, v in enumerate(att.values):
            entropy_matrix[idx, j] = d.entropy(d.select(dataset, att, v))
    print(entropy_matrix)
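A hypothetical call: the datasets are the a5 subsets of monk1, and the indices in max_att_list are illustrative placeholders, not computed values:

a5_subsets = [d.select(m.monk1, m.attributes[4], v) for v in m.attributes[4].values]
entropy_matrix(a5_subsets, 4, [0, 3, 5, 0])  # placeholder best-attribute indices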
Example #5
def split_tree_by_attribute_and_value(dataset, attribute_idx):
    attribute_values = m.attributes[attribute_idx].values
    attribute_values_list = [[i] for i in list(attribute_values)]
    dataset_by_attribute_and_value = []
    for value in attribute_values:
        dataset_by_attribute_and_value.append(
            d.select(dataset, m.attributes[attribute_idx], value))
    return dataset_by_attribute_and_value, attribute_values_list
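A short usage sketch, assuming the d/m aliases used in the function:

subsets, value_labels = split_tree_by_attribute_and_value(m.monk1, 4)
print([len(s) for s in subsets], value_labels)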
Example #6
def get_data_subsets(data, attributes, split_attribute):
    subsets = []
    attr = attributes[split_attribute]  # the attribute object for the given index
    for val in attr.values:
        subsets.append(dtree.select(data, attr, val))

    print("Subset sizes: " + str([len(s) for s in subsets]))
    return subsets
Example #7
def PRINT_TREE_AT_LEVEL_2():
    # A5
    print(" ")
    print("LEVEL 1:")
    print(m.attributes[4])
    Att = [None] * 4
    for value in range(1, 5):
        Att[value - 1] = select(m.monk1, m.attributes[4], value)

    print("LEVEL 2:")
    for A in Att:
        tmp = bestAttribute(A, m.attributes)
        print(tmp)
        # each attribute carries its own value range, so one loop covers all six cases
        for value in tmp.values:
            print(mostCommon(select(A, tmp, value)))
    print(" ")
    t = buildTree(m.monk1, m.attributes)
    drawTree(t)
Example #8
def splitOnA5AndComputeInformationGainsOfSubsets():
    """ Assignment 3: Split on attribute 5 (A5) and compute the gains of the subsets. """
    a5 = m.attributes[4]

    for set in trainingSets:
        for attributeValue in a5.values:
            subset = d.select(set.dataset, a5, attributeValue)
            printInformationGainOfDataset(
                subset, set.name + " split on A5 = " + str(attributeValue))
Example #9
def caspersky(dataset):
    print("Assignment 3")
    a = d.bestAttribute(dataset, m.attributes)
    branches = []
    for v in a.values:
        s = d.select(dataset, a, v)
        tf = d.mostCommon(s)
        if tf:
            # the majority class is True: close this branch with a True leaf
            branches.append((v, d.TreeLeaf(tf)))
        else:
            a2 = d.bestAttribute(s, m.attributes)
            branches2 = []
            for v2 in a2.values:
                s2 = d.select(s, a2, v2)
                branches2.append((v2, d.TreeLeaf(d.mostCommon(s2))))
            branches.append((v, d.TreeNode(a2, dict(branches2), d.mostCommon(s))))
    
    drawtree.drawTree(d.TreeNode(a, dict(branches), d.mostCommon(dataset)))
Example #11
def split(node):
    #splitting
    sub_set_A5_value_1_m1 = d.select(m.monk1, node, 1)
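    # note: a5 only has values 1-4, so the final select below (value 5) returns an empty list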
    sub_set_A5_value_not_1_m1 = d.select(m.monk1, node, 2) + d.select(
        m.monk1, node, 3) + d.select(m.monk1, node, 4) + d.select(
            m.monk1, node, 5)

    #calculating gain to figure out which attribute to use in each of the next nodes
    information_gain_left = find_information_gain(sub_set_A5_value_1_m1,
                                                  m.attributes)
    information_gain_right = find_information_gain(sub_set_A5_value_not_1_m1,
                                                   m.attributes)
    information_gain = max(max(information_gain_left),
                           max(information_gain_right))

    #classifying the most common result in each sub tree
    majority_class_left = d.mostCommon(sub_set_A5_value_1_m1)
    majority_class_right = d.mostCommon(sub_set_A5_value_not_1_m1)

    print('left: ', majority_class_left)
    print('right: ', majority_class_right)
    print('information gain: ', information_gain)
Example #12
def buildTree(subset,attrs):
	global tree
	if isLeaf(subset):
		tree = (tree + '+') if d.allPositive(subset) else (tree + '-')
		return
	else:
		root = d.bestAttribute(subset,attrs)
		tree = tree + str(root) + "("
		for value in root.values:
			nextSubset = d.select(subset,root,value)
			nextAttrs = attrs - set([root])
			buildTree(nextSubset,nextAttrs)
		tree = tree + ")"
Example #13
def calc_next_level():
  #print "\nAverage gain when a5 is choosen"
  print "\nA5\t  a1\t\t  a2\t\t  a3\t\t  a4\t\t  a5\t\t  a6"
  s = "A5(" 
  for val in data.attributes[4].values:
    subset = dt.select(data.monk1, data.attributes[4], val)
    t = "\t"
    for attr in data.attributes: 
      t = t + "%.6f\t" % (dt.averageGain(subset, attr))
    print val , t
    best = dt.bestAttribute(subset, data.attributes)
    s = s + best.name + "("
    #print "best attribute: ", best.name
    for value in best.values:
      #print "choose: ", value, "mostCommon: ", dt.mostCommon(dt.select(subset, best, value))
      if(dt.mostCommon(dt.select(subset, best, value))): 
        s = s + "+"
      else:
        s = s + "-"
    s = s + ")"
  s = s + ")"
  print "\nOur tree:\t", s
  print "Build tree:\t", dt.buildTree(data.monk1, data.attributes, 2)
Example #14
File: lab1.py Project: mkufel/ML
def buildTreeCustom(dataset, depth):
    if (depth > 0):
        bestAttr = dt.bestAttribute(dataset, m.attributes)
        print(str(bestAttr), end='')

        # Select datasets splits for each value of the bestAttr
        splits = []
        for value in bestAttr.values:
            splits.append(dt.select(dataset, bestAttr, value))

        for split in splits:
            # If entropy of the split > 0, the split is impure and we can further split it. Recursive call with reduced depth
            if (dt.entropy(split) > 0):
                buildTreeCustom(split, depth - 1)
            else:
                print('+' if dt.mostCommon(split) else '-', end='')
    else:
        print('+' if dt.mostCommon(dataset) else '-', end='')
Example #15
def Tree(dataset, attributes, maxdepth=3):
    def Branch(dataset, default, attributes):
        if not dataset:
            return dtree.TreeLeaf(default)
        if dtree.allPositive(dataset):
            return dtree.TreeLeaf(True)
        if dtree.allNegative(dataset):
            return dtree.TreeLeaf(False)
        return Tree(dataset, attributes, maxdepth - 1)

    default = dtree.mostCommon(dataset)
    if maxdepth < 1:
        return dtree.TreeLeaf(default)
    a = dtree.bestAttribute(dataset, attributes)
    attributesLeft = [x for x in attributes if x != a]
    branches = [(v, Branch(dtree.select(dataset, a, v), default,
                           attributesLeft)) for v in a.values]
    return dtree.TreeNode(a, dict(branches), default)
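A possible usage of this depth-limited builder, assuming monkdata is imported as m; dtree.check is used the same way in other examples here:

t = Tree(m.monk1, m.attributes, maxdepth=2)
print(dtree.check(t, m.monk1test))  # accuracy of the depth-limited tree
print(dtree.check(dtree.buildTree(m.monk1, m.attributes), m.monk1test))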
Example #16
def makeTree(set, level, attributes):
    # `depth` is read as a module-level maximum depth; `set` shadows the builtin
    if level >= depth:
        return dtree.TreeLeaf(dtree.mostCommon(set))
    attr = dtree.bestAttribute(set, attributes)
    node = []
    branches = []
    for val in attr.values:
        subset = dtree.select(set, attr, val)
        attributes_left = [a for a in attributes if a != attr]
        if dtree.allPositive(subset):
            node = dtree.TreeLeaf(True)
        elif dtree.allNegative(subset):
            node = dtree.TreeLeaf(False)
        else:
            node = makeTree(subset, level + 1, attributes_left)
        branches.append((val, node))
    node = dtree.TreeNode(attr, dict(branches), dtree.mostCommon(set))
    return node
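Since makeTree reads `depth` as a global cutoff, a hedged usage sketch needs it defined first:

depth = 2
t = makeTree(m.monk1, 0, m.attributes)
print(t)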
Example #17
def buildtree(dataset, remaining_attr, level):

    if level == 2:
        return dtree.TreeLeaf(dtree.mostCommon(dataset))

    max_attr, _ = getMaxGain(dataset, remaining_attr)
    branches_dict = dict([(value, dtree.select(dataset, max_attr, value))
                          for value in max_attr.values])
    _remaining_attr = [a for a in remaining_attr if a != max_attr]

    branches_nodes = {}
    print(max_attr)
    for value, branch_data in branches_dict.items():
        branches_nodes[value] = buildtree(branch_data, _remaining_attr,
                                          level + 1)

    return dtree.TreeNode(max_attr, branches_nodes,
                          dtree.mostCommon(dataset))
Example #18
def buildTreeRec(dataset, attributes, depthtodo):
    defaultvalue = d.mostCommon(dataset)
    if d.allPositive(dataset):
        return d.TreeLeaf(True)
    elif d.allNegative(dataset):
        return d.TreeLeaf(False)
    elif (depthtodo <= 0):
        return d.TreeLeaf(defaultvalue)
    else:
        gainziplist = calculateGainTuplesForAllAttributes(dataset, attributes)
        maxgain, maxgainattribute = getTupleWithMaxGainValue(gainziplist)

        subnodes = []
        for attrbutevalue in attributes[maxgainattribute].values:
            newdataset = d.select(dataset, attributes[maxgainattribute], attrbutevalue)
            subnode = buildTreeRec(newdataset, attributes, depthtodo - 1)
            subnodes.append((attrbutevalue, subnode))

        return d.TreeNode(attributes[maxgainattribute], dict(subnodes), defaultvalue)
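A hedged usage sketch, assuming the gain helpers referenced above are defined in the same file:

t = buildTreeRec(m.monk1, m.attributes, 2)
print(d.check(t, m.monk1test))  # test-set accuracy of the depth-2 tree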
Example #19
def assignment4():
    datasets = [
        (m.monk1, 'monk1', m.attributes[0]),
        (m.monk1, 'monk1', m.attributes[1]),
        (m.monk1, 'monk1', m.attributes[2]),
        (m.monk1, 'monk1', m.attributes[3]),
        (m.monk1, 'monk1 max', m.attributes[4]),
    ]

    for data, name, attribute in datasets:
        summ = 0
        for value in attribute.values:
            subset = dtree.select(data, attribute, value)

            print(f'Entropy of S{value} for {name}:\t{dtree.entropy(subset)}')

            summ += len(subset) / len(data) * dtree.entropy(subset)

        print(dtree.entropy(data) - summ)
        print()
print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0]))
print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1]))
print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2]))
print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3]))
print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4]))
print "Gain Monk2 a6: " + str(tree.averageGain(m.monk2,m.attributes[5]))

print "Gain Monk3 a1: " + str(tree.averageGain(m.monk3,m.attributes[0]))
print "Gain Monk3 a2: " + str(tree.averageGain(m.monk3,m.attributes[1]))
print "Gain Monk3 a3: " + str(tree.averageGain(m.monk3,m.attributes[2]))
print "Gain Monk3 a4: " + str(tree.averageGain(m.monk3,m.attributes[3]))
print "Gain Monk3 a5: " + str(tree.averageGain(m.monk3,m.attributes[4]))
print "Gain Monk3 a6: " + str(tree.averageGain(m.monk3,m.attributes[5]))

print "Gain Monk1 a5(1) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[0]))
print "Gain Monk1 a5(1) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[1]))
print "Gain Monk1 a5(1) - a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[2]))
print "Gain Monk1 a5(1) - a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[3]))
print "Gain Monk1 a5(1) - a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[4]))
print "Gain Monk1 a5(1) - a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[5]))

print "Gain Monk1 a5(2) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[0]))
print "Gain Monk1 a5(2) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[1]))
print "Gain Monk1 a5(2) - a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[2]))
print "Gain Monk1 a5(2) - a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[3]))
print "Gain Monk1 a5(2) - a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[4]))
print "Gain Monk1 a5(2) - a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[5]))

print "Gain Monk1 a5(3) - a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[0]))
print "Gain Monk1 a5(3) - a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 3),m.attributes[1]))
Example #21
print(d.check(t, m.monk1))
print(d.check(t, m.monk1test))

t = d.buildTree(m.monk2, m.attributes)
print(d.check(t, m.monk2))
print(d.check(t, m.monk2test))

t = d.buildTree(m.monk3, m.attributes)
print(d.check(t, m.monk3))
print(d.check(t, m.monk3test))

print("First Node IG")
for i in range(0, 6):
    print(d.averageGain(m.monk1, m.attributes[i]))

a5_1 = d.select(m.monk1, m.attributes[4], 1)
a5_2 = d.select(m.monk1, m.attributes[4], 2)
a5_3 = d.select(m.monk1, m.attributes[4], 3)
a5_4 = d.select(m.monk1, m.attributes[4], 4)

print("subset a5_1 IG")
for i in range(6):
    print(d.averageGain(a5_1, m.attributes[i]))

print("subset a5_2 IG")
for i in range(6):
    print(d.averageGain(a5_2, m.attributes[i]))

print("subset a5_3 IG")
for i in range(6):
    print(d.averageGain(a5_3, m.attributes[i]))
Example #22
print("0", dt.averageGain(m.monk2, m.attributes[0]))
print("1", dt.averageGain(m.monk2, m.attributes[1]))
print("2", dt.averageGain(m.monk2, m.attributes[2]))
print("3", dt.averageGain(m.monk2, m.attributes[3]))
print("4", dt.averageGain(m.monk2, m.attributes[4]))
print("5", dt.averageGain(m.monk2, m.attributes[5]))

print("monk3")
print("0", dt.averageGain(m.monk3, m.attributes[0]))
print("1", dt.averageGain(m.monk3, m.attributes[1]))
print("2", dt.averageGain(m.monk3, m.attributes[2]))
print("3", dt.averageGain(m.monk3, m.attributes[3]))
print("4", dt.averageGain(m.monk3, m.attributes[4]))
print("5", dt.averageGain(m.monk3, m.attributes[5]))

list1 = dt.select(m.monk1, m.attributes[4], 1)
list2 = dt.select(m.monk1, m.attributes[4], 2)
list3 = dt.select(m.monk1, m.attributes[4], 3)
list4 = dt.select(m.monk1, m.attributes[4], 4)

print("gains")
print("list1")
print("0", dt.averageGain(list1, m.attributes[0]))
print("1", dt.averageGain(list1, m.attributes[1]))
print("2", dt.averageGain(list1, m.attributes[2]))
print("3", dt.averageGain(list1, m.attributes[3]))
print("5", dt.averageGain(list1, m.attributes[5]))
print("list2")
print("0", dt.averageGain(list2, m.attributes[0]))
print("1", dt.averageGain(list2, m.attributes[1]))
print("2", dt.averageGain(list2, m.attributes[2]))
Example #23
def main():
    # Assignment 1
    print("Assignment 1")
    monks = [monkdata.monk1, monkdata.monk2, monkdata.monk3]
    monk_tests = [monkdata.monk1test, monkdata.monk2test, monkdata.monk3test]
    entropies = [dtree.entropy(monk) for monk in monks]
    print("*** Monk1 entropy: ", entropies[0])
    print("*** Monk2 entropy: ", entropies[1])
    print("*** Monk3 entropy: ", entropies[2])

    # Assignment 3
    print(" ")
    print("Assignment 3")
    attributes = monkdata.attributes
    info_gain1 = info_gain(monks[0], attributes)
    info_gain2 = info_gain(monks[1], attributes)
    info_gain3 = info_gain(monks[2], attributes)
    print("*** Monk1 information gain for attribute:",
          ['%.5f' % x for x in info_gain1])
    print("*** Monk2 information gain for attribute:",
          ['%.5f' % x for x in info_gain2])
    print("*** Monk3 information gain for attribute:",
          ['%.5f' % x for x in info_gain3])

    # Assignment 5
    print("")
    print("Assignment 5")
    print("*** Attribute:",
          np.argmax(info_gain1) + 1, "maximizes info gain for MONK1 dataset")
    print("*** Attribute:",
          np.argmax(info_gain2) + 1, "maximizes info gain for MONK2 dataset")
    print("*** Attribute:",
          np.argmax(info_gain3) + 1, "maximizes info gain for MONK3 dataset")
    print("***")
    max0 = np.argmax(info_gain1)  # attribute of first split
    attributes_left = [
        attrib for attrib in attributes if attrib != attributes[max0]
    ]
    print("*** 1) Attributes the next nodes should be tested on: ",
          attributes_left)

    # Attributes to split on in second step
    splits = [
        np.argmax(
            info_gain(dtree.select(monks[0], attributes[max0], value),
                      attributes)) + 1 for value in attributes[max0].values
    ]
    print("*** 2) Second split is on the attriburtes: ", splits)

    # Decision after second split: majority class in each subset of the first split
    subsets = [
        dtree.select(monks[0], attributes[max0], value)
        for value in attributes[max0].values
    ]
    print("*** 3) Assignment after second split: ",
          [dtree.mostCommon(subset) for subset in subsets])
    print("***")

    print("*** Train and test set errors")
    t1 = dtree.buildTree(monkdata.monk1, monkdata.attributes)
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monkdata.monk1),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    t2 = dtree.buildTree(monkdata.monk2, monkdata.attributes)
    print("*** Monk2:", "Etrain=", 1 - dtree.check(t2, monkdata.monk2),
          " Etest=", 1 - dtree.check(t2, monkdata.monk2test))
    t3 = dtree.buildTree(monkdata.monk3, monkdata.attributes)
    print("*** Monk3:", "Etrain=", 1 - dtree.check(t3, monkdata.monk3),
          " Etest=", 1 - dtree.check(t3, monkdata.monk3test))

    import drawtree_qt5
    #print(t1) # tree in text form (hard to read)
    #drawtree_qt5.drawTree(t1) # uncomment to visualize the decision tree

    # Assignment 7
    print("")
    print("Assignment 7")

    # Pruning for the example of monk1
    monk1train, monk1val = partition(monkdata.monk1, 0.9)
    t1 = dtree.buildTree(monk1train,
                         monkdata.attributes)  # tree trained on monk1train
    t11 = prune(t1, monk1val)  # pruned tree
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monk1val), " Etest=",
          1 - dtree.check(t1, monkdata.monk1test))
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t11, monk1val), " Etest=",
          1 - dtree.check(t11, monkdata.monk1test))

    # Statistic information for different fraction for monk1 and monk3
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

    # Evaluation of Monk1
    eval1 = [
        evaluate_fraction(monkdata.monk1, frac, monkdata.monk1test)
        for frac in fraction
    ]
    means1 = [np.mean(x) for x in eval1]
    vars1 = [np.var(x) for x in eval1]

    plt.figure(1)
    plt.subplot(121)
    plt.plot(fraction, means1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk1')

    # Evaluation of Monk3
    eval3 = [
        evaluate_fraction(monkdata.monk3, frac, monkdata.monk3test)
        for frac in fraction
    ]
    means3 = [np.mean(x) for x in eval3]
    vars3 = [np.var(x) for x in eval3]

    plt.figure(2)
    plt.subplot(121)
    plt.plot(fraction, means3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk3')
    plt.show()
Example #24
    ag1 = dtree.averageGain(mdata.monk1, x)
    ag2 = dtree.averageGain(mdata.monk2, x)
    ag3 = dtree.averageGain(mdata.monk3, x)
    print('Average gain in dataset monk1 and attribute ' + str(x.name) +
          ' is %.6f' % ag1)
    print("Average gain in dataset monk2 and attribute " + str(x.name) +
          " is %.6f" % ag2)
    print("Average gain in dataset monk3 and attribute " + str(x.name) +
          " is %.6f" % ag3)

print("\n")

for x in range(1, 5):
    highest_avg = 0
    highest_attribute = 0
    s = dtree.select(mdata.monk1, mdata.attributes[4], x)
    for y in mdata.attributes:
        avg_g = dtree.averageGain(s, y)
        print("Average gain in dataset monk1 and subset s" + str(x) +
              " and attribute " + str(y.name) +
              " is %.6f. Majority: " % avg_g + str(dtree.mostCommon(s)))
        if (avg_g > highest_avg):
            highest_avg = avg_g
            highest_attribute = int(y.name[1])

    print("Highest avg: %.6f in attr: " % highest_avg + str(highest_attribute))
    for z in range(
            1,
            len(mdata.attributes[int(highest_attribute - 1)].values) + 1):
        s2 = dtree.select(s, mdata.attributes[int(highest_attribute - 1)], z)
        print(dtree.mostCommon(s2))
Example #25
# Datasets
train = [monk.monk1, monk.monk2, monk.monk3]
test = [monk.monk1test, monk.monk2test, monk.monk3test]

print("Entropy for monk1 dataset is {}".format(dt.entropy(monk.monk1)))
print("Entropy for monk2 dataset is {}".format(dt.entropy(monk.monk2)))
print("Entropy for monk3 dataset is {}".format(dt.entropy(monk.monk3)))

for i, dataset in enumerate(train):
    print("")
    print("Average gain for monk{} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        print("a{} = {}".format(j + 1, dt.averageGain(dataset, attribute)))

monk1a5 = [dt.select(monk.monk1, monk.attributes[4], 1), dt.select(monk.monk1, monk.attributes[4], 2), dt.select(monk.monk1, monk.attributes[4], 3), dt.select(monk.monk1, monk.attributes[4], 4)]

for i, monk1 in enumerate(monk1a5):
    print("")
    print("Average gain for monk1 where a5 = {} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        if j != 4:
            print("a{} = {}".format(j + 1, dt.averageGain(monk1, attribute)))
    print("Majority class = {}".format(dt.mostCommon(monk1)))


# Building the decision tree.
tree1 = dt.buildTree(monk.monk1, monk.attributes)
tree2 = dt.buildTree(monk.monk2, monk.attributes)
tree3 = dt.buildTree(monk.monk3, monk.attributes)
trees = [tree1, tree2, tree3]
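The trees are built but never evaluated in this fragment; a natural follow-up, using dt.check as the other examples here do, might be:

for i, t in enumerate(trees):
    print("monk{}: train = {}, test = {}".format(
        i + 1, dt.check(t, train[i]), dt.check(t, test[i])))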
Example #26
def bestAttribute(dataset, attributes):
    result = 0
    best = attributes[0]
    for a in attributes:
        value = dt.averageGain(dataset, a)
        if value > result:
            result = value
            best = a
    return best


#splitting the data
a = bestAttribute(m.monk1, m.attributes)
data = []
for v in a.values:
    data.append(dt.select(m.monk1, a, v))

#calculating the average information gain for the next level
for d in data:
    for a in m.attributes:
        print dt.averageGain(d, a)
    print '\n'
print '\n' 

#comparison with the tree from the predefined function
tree = dt.buildTree(m.monk1, m.attributes, 2)
#draw.drawTree(tree)


#building the trees for all the monks datasets
#assignment 3
Example #27
info_gain_m2 = []
info_gain_m3 = []
attribute = []

#starting counter
i = 0
#iterating over all three datasets
for sets in [info_gain_m1, info_gain_m2, info_gain_m3]:

    #for each attribute in the current dataset, append the average information gain
    for k in range(6):
        attribute.append(dtree.averageGain(data_sets[i], m.attributes[k]))
    sets.append(attribute)

    attribute = []
    i += 1
    
#print(info_gain_m1)
#print(info_gain_m2)
#print(info_gain_m3)

# Assignment 3 #
################

selected = dtree.select(m.monk1, m.attributes[4], 1)

t = dtree.buildTree(m.monk1, m.attributes)
print(dtree.check(t, m.monk1test))

print(t)
Example #28
    d.averageGain(m.monk1, m.attributes[4]), d.averageGain(m.monk1, m.attributes[5])
))

print("monk-2: %f %f %f %f %f %f" % (
    d.averageGain(m.monk2, m.attributes[0]), d.averageGain(m.monk2, m.attributes[1]),
    d.averageGain(m.monk2, m.attributes[2]), d.averageGain(m.monk2, m.attributes[3]),
    d.averageGain(m.monk2, m.attributes[4]), d.averageGain(m.monk2, m.attributes[5])
))

print("monk-3: %f %f %f %f %f %f" % (
    d.averageGain(m.monk3, m.attributes[0]), d.averageGain(m.monk3, m.attributes[1]),
    d.averageGain(m.monk3, m.attributes[2]), d.averageGain(m.monk3, m.attributes[3]),
    d.averageGain(m.monk3, m.attributes[4]), d.averageGain(m.monk3, m.attributes[5])
))

monk1_subset = d.select(m.monk1, m.attributes[4], 3)

print(len(monk1_subset))
print(d.mostCommon(monk1_subset))
monk1_subset_tree = d.buildTree(monk1_subset, m.attributes, 5)
print(monk1_subset_tree)

t1 = d.buildTree(m.monk1, m.attributes)
print(d.check(t1, m.monk1test))
print(d.check(t1, m.monk1))

t2 = d.buildTree(m.monk2, m.attributes)
print(d.check(t2, m.monk2test))
print(d.check(t2, m.monk2))

t3 = d.buildTree(m.monk3, m.attributes)
Example #29
def splitDataset(dataset, attributeNumber):
	"Function to split an entire dataset on attributes"
	return [dtree.select (dataset, m.attributes[attributeNumber], x) for x in m.attributes[attributeNumber].values]
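A hedged usage sketch: split monk1 on a5 (attribute index 4) and check the subset sizes.

print([len(s) for s in splitDataset(m.monk1, 4)])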
Example #30
def splitOnAttribute(dataset, attribute, doneSplits):
    sets = []
    for i in range(0, len(attribute.values)):
        sets.append(d.select(dataset, attribute, attribute.values[i]))
    return sets
Example #31
print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0]))
print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1]))
print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2]))
print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3]))
print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4]))
print "Gain Monk2 a6: " + str(tree.averageGain(m.monk2,m.attributes[5]))

print "Gain Monk3 a1: " + str(tree.averageGain(m.monk3,m.attributes[0]))
print "Gain Monk3 a2: " + str(tree.averageGain(m.monk3,m.attributes[1]))
print "Gain Monk3 a3: " + str(tree.averageGain(m.monk3,m.attributes[2]))
print "Gain Monk3 a4: " + str(tree.averageGain(m.monk3,m.attributes[3]))
print "Gain Monk3 a5: " + str(tree.averageGain(m.monk3,m.attributes[4]))
print "Gain Monk3 a6: " + str(tree.averageGain(m.monk3,m.attributes[5]))

print "Gain Level1 Monk1 a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[0], 1),m.attributes[0]))
print "Gain Level1 Monk1 a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[1], 1),m.attributes[1]))
print "Gain Level1 Monk1 a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[2], 1),m.attributes[2]))
print "Gain Level1 Monk1 a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[3], 1),m.attributes[3]))
print "Gain Level1 Monk1 a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 1),m.attributes[4]))
print "Gain Level1 Monk1 a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[5], 1),m.attributes[5]))

print "Gain Level2 Monk1 a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[0], 2),m.attributes[0]))
print "Gain Level2 Monk1 a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[1], 2),m.attributes[1]))
print "Gain Level2 Monk1 a3: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[2], 2),m.attributes[2]))
print "Gain Level2 Monk1 a4: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[3], 2),m.attributes[3]))
print "Gain Level2 Monk1 a5: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[4], 2),m.attributes[4]))
print "Gain Level2 Monk1 a6: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[5], 2),m.attributes[5]))

print "Gain Level3 Monk1 a1: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[0], 3),m.attributes[0]))
print "Gain Level3 Monk1 a2: " + str(tree.averageGain(tree.select(m.monk1, m.attributes[1], 3),m.attributes[1]))
Example #32
print("Monk3, attribute a3 has information gain: ",
      dt.averageGain(m.monk3, m.attributes[2]))
print("Monk3, attribute a4 has information gain: ",
      dt.averageGain(m.monk3, m.attributes[3]))
print("Monk3, attribute a5 has information gain: ",
      dt.averageGain(m.monk3, m.attributes[4]))
print("Monk3, attribute a6 has information gain: ",
      dt.averageGain(m.monk3, m.attributes[5]))
print("Monk3's best attribute is: ", dt.bestAttribute(m.monk3, m.attributes))
print("\n")

#Calculate information gain for 2nd level in tree
#Monk1 - a5 - 1
a5 = m.attributes[4]
print("Monk1 - a5 - 1, a1 has info gain: ",
      dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[0]))
print("Monk1 - a5 - 1, a2 has info gain: ",
      dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[1]))
print("Monk1 - a5 - 1, a3 has info gain: ",
      dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[2]))
print("Monk1 - a5 - 1, a4 has info gain: ",
      dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[3]))
print("Monk1 - a5 - 1, a6 has info gain: ",
      dt.averageGain(dt.select(m.monk1, a5, a5.values[0]), m.attributes[5]))
print("\n")

#Monk1 - a5 - 2
print("Monk1 - a5 - 2, a1 has info gain: ",
      dt.averageGain(dt.select(m.monk1, a5, a5.values[1]), m.attributes[0]))
print("Monk1 - a5 - 2, a2 has info gain: ",
      dt.averageGain(dt.select(m.monk1, a5, a5.values[1]), m.attributes[1]))
Example #33
  gain_monk1.append(dt.averageGain(m.monk1,m.attributes[x]))
  gain_monk2.append(dt.averageGain(m.monk2,m.attributes[x]))
  gain_monk3.append(dt.averageGain(m.monk3,m.attributes[x]))

print "Dataset\tA1\t\tA2\t\tA3\t\tA4\t\tA5\t\tA6"
print "Monk1: ","\t".join(["%.7f"%y for y in gain_monk1])
print "Monk2: ","\t".join(["%.7f"%y for y in gain_monk2])
print "Monk3: ","\t".join(["%.7f"%y for y in gain_monk3])

print
print "------------------------------"

print "-------- Assignment 3 --------"
print 

partition1 = dt.select(m.monk1,m.attributes[4],1)
partition2 = dt.select(m.monk1,m.attributes[4],2)
partition3 = dt.select(m.monk1,m.attributes[4],3)
partition4 = dt.select(m.monk1,m.attributes[4],4)

gain_partition1  = []
gain_partition2  = []
gain_partition3  = []
gain_partition4  = []

for x in range(0, 6):
  gain_partition1.append(dt.averageGain(partition1,m.attributes[x]))
  gain_partition2.append(dt.averageGain(partition2,m.attributes[x]))
  gain_partition3.append(dt.averageGain(partition3,m.attributes[x]))
  gain_partition4.append(dt.averageGain(partition4,m.attributes[x]))
Example #34
def printNumTrueFalse(datasets):
    # For a list of datasets, print the number of true and false
    for i in range(0, len(datasets)):
        print("Monk"+str(i+1)+" "+
              "[#tot="+str(len(datasets[i]))+"] "+
              "[#true="+str(getNumTrue(datasets[i]))+"] "+
              "[#false="+str(getNumFalse(datasets[i]))+"]")
#Main
dataset = m.monk2
available = [True]*len(m.attributes)
firstSplit = getBestAttribute(dataset, m.attributes, available)
print("Firstsplit = "+str(firstSplit))
print("-----")
available[firstSplit] = False
sets = []
for i in range(0, len(m.attributes[firstSplit].values)):
    sets.append(d.select(dataset, m.attributes[firstSplit], m.attributes[firstSplit].values[i]))

for i in range(0, len(sets)):
    subSets = []
    splitOn = getBestAttribute(sets[i], m.attributes, available)
    print("Second split = "+str(splitOn))
    for j in range(0, len(m.attributes[splitOn].values)):
        subSets.append(d.select(sets[i], m.attributes[splitOn], m.attributes[splitOn].values[j]))
    for s in subSets:
        print(d.mostCommon(s))

    print("----")

Example #35
def main():
	print ("Entropy monk1")
	entropy1 = tree.entropy(data.monk1)
	print (entropy1)
	print ("\n")

	print ("Entropy monk2")
	entropy2 = tree.entropy(data.monk2)
	print (entropy2)
	print ("\n")

	print ("Entropy monk3")
	entropy3 = tree.entropy(data.monk3)
	print (entropy3)
	print ("\n")

	informationGain(data)

	#COMPUTING ENTROPY FOR SUBSET: it is 0 because every monk1 sample with a5 = 1 is positive, so the subset is pure
	monk1Tree = tree.buildTree(data.monk1, data.attributes)
	#draw.drawTree(monk1Tree)
	#print(tree.bestAttribute(data.monk3, data.attributes))
	subSet = tree.select(data.monk1, data.attributes[4], 1)

	# newEntropy = tree.entropy(subSet)
	# print ("SubSet")
	# print (newEntropy)
	#END

	n = 0
	sumList = np.array([0.0] * 6)
	l1 = []
	l2 = []
	l3 = []
	l4 = []
	l5 = []
	l6 = []

	for x in range(100):
		errorList = np.array(pruneTree(data.monk1, data.monk1test))
		sumList += errorList
		l1.append(errorList[0])
		l2.append(errorList[1])
		l3.append(errorList[2])
		l4.append(errorList[3])
		l5.append(errorList[4])
		l6.append(errorList[5])

	finalList = sumList/100
	stdDevList = [np.std(l1),np.std(l2),np.std(l3),np.std(l4), np.std(l5),np.std(l6)]  

	print(finalList)
	print(stdDevList)

	line1, = plt.plot(finalList, label="Monk1 means", marker='o')
	# Create a legend for the first line.
	first_legend = plt.legend(handles=[line1], loc=1)

	x = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
	# create an index for each tick position
	xi = [i for i in range(0, len(x))]

	plt.xticks(xi, x)
	plt.ylabel('Mean Errors')
	plt.xlabel('Fractions')
	plt.show()
Example #36
"--Answer to Assignment 2"
print(informationGain[2], "\n")

# print(t.bestAttribute(m.monk1, m.attributes))

""" 
Attribute a5 has the largest information gain meaning that it reduces the 
uncertainty the most. Thus, it should be used for splitting at the root node.
"""


"5 BUILDING DECISION TREES"
sel = []
for i in range(4):  # splits the data into subsets according to attribute a5
    sel.append(t.select(m.monk1, m.attributes[4], m.attributes[4].values[i]))

# print(sel)
sub = []
mC = []
for subset in sel:
    for i in [0, 1, 2, 3, 5]:
        sub.append(t.averageGain(subset, m.attributes[i]))
    mC.append(t.mostCommon(subset))

    # print(sub)
    sub = []

"Highest information gain on second level of the tree # 2 - A4 , 3 - A6 , 4 - A1 #"

"""Assignment 3"""
Example #37
import monkdata as m
import dtree as dtree

foo = dtree.select(m.monk1, m.attributes[4], 3)
print '-- information gain in the monk-1 subset where a5 = 3: --'
print 'a_1: ' + str(dtree.averageGain(foo, m.attributes[0]))
print 'a_2: ' + str(dtree.averageGain(foo, m.attributes[1]))
print 'a_3: ' + str(dtree.averageGain(foo, m.attributes[2]))
print 'a_4: ' + str(dtree.averageGain(foo, m.attributes[3]))
print 'a_6: ' + str(dtree.averageGain(foo, m.attributes[5]))

foo = dtree.select(m.monk1, m.attributes[4], 1)
print '-- majority class of the monk-1 subset where a5 = 1: --'
print dtree.mostCommon(foo)
Example #38
def averageGain(dataset, attribute):
    "Expected reduction in entropy from splitting dataset on attribute"
    weighted = 0
    for value in attribute.values:
        subset = select(dataset, attribute, value)
        # entropy(subset) measures how predictable the class label is once
        # the attribute is fixed to this value; weight it by the subset size
        weighted += entropy(subset) * len(subset)
    # the lower the weighted subset entropy, the higher the information gain,
    # so the better this attribute is as a split
    return entropy(dataset) - weighted / len(dataset)


def select(dataset, attribute, value):
    "Return subset of data samples where the attribute has the given value"
    return [x for x in dataset if x.attribute[attribute] == value]


#test for how select function works
for v in m.attributes[0].values:
    subset = dtree.select(m.monk1, m.attributes[0], v)

#information Gain calculation

Data = [m.monk1, m.monk2, m.monk3]  # plain list; the datasets have different lengths
'''
Dictionary where keys are the IG values and values are the attribute names 
'''
for i in range(3):
    IG = {}  # reset per dataset so the max is taken within a single dataset
    for j in range(len(m.attributes)):
        IG[dtree.averageGain(Data[i], m.attributes[j])] = m.attributes[j].name
    #max IG
    maxIGKey = max(IG.keys())
    #the attribute with the highest IG for this dataset, e.g. A5 for monk1
    maxIGValue = IG[maxIGKey]
Example #39
__author__ = 'swebo_000'

import monkdata as m
import dtree as d
#import drawtree

print("3. Building decision tree: \n")

print("Subset division of MONK-1 at attribute 5: \n")
subsets = []
for x in range(0, len(m.attributes[4].values)):
    subsets.append(d.select(m.monk1, m.attributes[4], x+1))

for set in subsets:
    gain = 0
    maxgain = 0
    bestatr = 0
    print("Value: %d" % (subsets.index(set) + 1))
    print("Most common: " + str(d.mostCommon(set)))
    for x in range(0, len(m.attributes)):
        gain = d.averageGain(set, m.attributes[x])
        print("Attribute A%d: %f" % (x + 1, gain))
        if gain > maxgain:
            maxgain = gain
            bestatr = x
    print("Attribute with best information gain: A%d \n" % (bestatr + 1))


print("MONK-1:")
Example #40
def splittingHighestAttribute():
    a5 = m.attributes[4]
    for set in trainingset:
        for attributeValue in a5.values:
            subset = d.select(set.dataset, a5, attributeValue)
            getAverageInformationGain(subset, set.name + " on A5 = " + str(attributeValue))
Example #41
#     ag3.append(gain3)

# print(ag1) #a5
# print(ag2) #a5
# print(ag3) #a2 a5

#**********************************
# Assignment 5

a = dtree.bestAttribute(mdata.monk1, mdata.attributes)
attributesLeft = [x for x in mdata.attributes if x != a]
#print(a,attributesLeft) #a5

subsets = []
for v in a.values:
    temp = dtree.select(mdata.monk1, a, v)
    subsets.append(temp)

ag_in2level = []
subsets_ag = []
#print(len(a.values))
for subset in subsets:
    for i in range(len(attributesLeft)):
        gain1 = dtree.averageGain(subset, attributesLeft[i])
        ag_in2level.append(gain1)
    subsets_ag.append(ag_in2level)
    ag_in2level = []
#print(subsets_ag)


Example #42
# output print
print '-------- Assignment 3 --------'
print 'information gain:'
print 'monk 1 (a1->a6): ' + str(np.transpose(Ga_m1))
print 'monk 2 (a1->a6): ' + str(np.transpose(Ga_m2))
print 'monk 3 (a1->a6): ' + str(np.transpose(Ga_m3))
print ''


## Assignment 5: build decision tree
Ga_m11 = np.empty([6,4], dtype = float)
en_m11 = np.empty([1,4], dtype = float)

for i in range(0,4):
    en_m11[0,i] = d.entropy(d.select(m.monk1, m.attributes[4], (i+1)))
    for j in range(0,6):
        Ga_m11[j,i] = d.averageGain(d.select(m.monk1, m.attributes[4], (i+1)), m.attributes[j])

# majority class
#mc = d.mostCommon(d.select(m.monk1, m.attributes[4], 1))

t1 = d.buildTree(m.monk1, m.attributes)
t2 = d.buildTree(m.monk2, m.attributes)
t3 = d.buildTree(m.monk3, m.attributes)

# output print
print '-------- Assignment 5 --------'
print 'decision tree of monk1:'
print(t1)
print 'train set error: ' + str(round((1-d.check(t1, m.monk1))*100, 2)) + '%'
Example #43
    Ga_m3[i] = d.averageGain(m.monk3, m.attributes[i])

# output print
print '-------- Assignment 3 --------'
print 'information gain:'
print 'monk 1 (a1->a6): ' + str(np.transpose(Ga_m1))
print 'monk 2 (a1->a6): ' + str(np.transpose(Ga_m2))
print 'monk 3 (a1->a6): ' + str(np.transpose(Ga_m3))
print ''

## Assignment 5: build decision tree
Ga_m11 = np.empty([6, 4], dtype=float)
for i in range(0, 4):
    for j in range(0, 6):
        Ga_m11[j, i] = d.averageGain(
            d.select(m.monk1, m.attributes[4], (i + 1)), m.attributes[j])

# majority class
#mc = d.mostCommon(d.select(m.monk1, m.attributes[4], 1))

t1 = d.buildTree(m.monk1, m.attributes)
t2 = d.buildTree(m.monk2, m.attributes)
t3 = d.buildTree(m.monk3, m.attributes)

# output print
print '-------- Assignment 5 --------'
print 'decision tree of monk1:'
print(t1)
print 'train set error: ' + str(round(
    (1 - d.check(t1, m.monk1)) * 100, 2)) + '%'
print(" ")
print("Information gain for the MONK3 dataset")
for i in range(0, 6):
    print(" Info Gain ", m.attributes[i], ":",
          d.averageGain(monk3, m.attributes[i]))
print(" ")

print("#----------------Assignment 5 ----------------#")
print(" ")
# Splitting the tree for MONK1 data
print("#---- For MONK1 dataset -----#")
for i in (1, 2, 3, 4):
    for j in range(0, 6):
        print("Information gain for split for A5 at value ", i, " at",
              m.attributes[j], " :",
              d.averageGain(d.select(monk1, A5, i), m.attributes[j]))
    print(" ")

print(" ")
# Build Tree using PyQT graph

# MONK1 Tree
#draw.drawTree(d.buildTree(monk1, m.attributes))
print("Classification error for dataset [in fraction]")
print("MONK1 with train data ",
      (1 - d.check(d.buildTree(monk1, m.attributes), monk1)))
print("MONK1 with test  data ",
      (1 - d.check(d.buildTree(monk1, m.attributes), monktest1)))
# MONK2 Tree
#draw.drawTree(d.buildTree(monk2, m.attributes))
print("MONK2 with train data ",
Example #45
    for i in range(len(m.attributes)):
        if i not in exclude:
            attribute_key = "A" + str(i + 1)
            attribute = m.attributes[i]
            avg_gain = d.averageGain(dataset, attribute)
            avg_gain_dict[attribute_key] = avg_gain

    return avg_gain_dict


d.selected_attribute = "A5"
print(
    "\nAssignment 5.1 a) - Split monk1 into subsets according to selected attribute {}\n"
    .format(d.selected_attribute))
idx = int(d.selected_attribute[-1]) - 1
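# note: True == 1 in Python, so the select below returns the a5 = 1 subset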
subset_A5_true = d.select(m.monk1, m.attributes[idx], True)
subset_A12346 = [x for x in m.monk1 if x not in subset_A5_true]

print("\nAssignment 5.1 b) - Where do we find the highest average gain?")
IG_dict_A12346 = get_avg_gain_dict_exclude(subset_A12346, exclude=[idx])
IG_dict_A12346 = sorted(IG_dict_A12346.items(),
                        key=lambda kv: kv[1],
                        reverse=True)
# print("\n", IG_dict_A12346)
for key, val in IG_dict_A12346:
    print(key, ":    ", val)
print()

d.selected_attribute = "A1"
print(
    "\nAssignment 5.1 c) - Split into further subsets according to selected attribute {}\n"
    .format(d.selected_attribute))
Example #46
import monkdata as m
import dtree as dt
import drawtree as draw

entropy = dt.entropy(m.monk1)
best_gain = 0
best_attribute = m.attributes[0]  # fallback so the name is always bound
for attribute in m.attributes:
    gain = dt.averageGain(m.monk1, attribute)
    if gain > best_gain:
        best_gain = gain
        best_attribute = attribute


# majority class for each value of the best attribute
values = {v: dt.mostCommon(dt.select(m.monk1, best_attribute, v)) for v in best_attribute.values}
print(best_attribute, values)
draw.drawTree(dt.buildTree(m.monk1, m.attributes, 2))