Beispiel #1
0
def main():
    """Depth sweep of ID3 decision trees on the 'Cars' dataset.

    For each impurity heuristic (gini / entropy / majority error) a tree
    is grown at every depth from 1 to 6 and its training and test
    accuracy are printed.
    """
    trainset, trainRaw = makeData(training, labels)
    testset, testRaw = makeData(testing, labels)
    # The label is the last column of each raw example.
    trainLabels = [row[-1] for row in trainRaw]
    testLabels = [row[-1] for row in testRaw]

    print("Running the decision tree algorithm on the 'Cars' dataset.")

    for heuristic in ('gini', 'entropy', 'ME'):
        for depth in range(1, 7):
            tree = id3.id3(trainset, labels, label_attr, labels[-1], depth,
                           heuristic, None)

            trainPred = [id3.predict(tree, sample, labels)
                         for sample in trainset]
            testPred = [id3.predict(tree, sample, labels)
                        for sample in testset]

            trainAcc = id3.accuracy(trainPred, trainLabels)
            testAcc = id3.accuracy(testPred, testLabels)

            print("Decision tree of depth", depth, "using", heuristic,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)
Beispiel #2
0
def main():
    """Fit an ID3 tree on the 'close1' dataset and report accuracy before
    and after kNN-based anomaly filtering of the training set.

    Reads two semicolon-separated CSV files, trains once on the raw
    training set, then retrains on a filtered copy; prints train/test
    accuracy for both runs.
    """
    # Show full frames when printing (no column/width truncation).
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)

    train_set = pd.read_csv('data/close1.csv', sep=';')
    test_set = pd.read_csv('data/close1_test2.csv', sep=';')

    # Bug fix: these were Python 2 `print` statements — a SyntaxError
    # under Python 3, which the rest of this function targets (print()
    # calls below). Converted to the function form.
    print(train_set.to_html(classes='table'))
    print(test_set.to_html(classes='table'))

    tree = {}
    target_field = 'Доступность'  # target column ("availability")
    # Every other column is a candidate splitting attribute.
    fields = list(train_set.columns.values)
    fields.remove(target_field)

    id3.fit(train_set, tree, target_field, fields)

    # id3.accuracy returns a (ratio, correct, total) style tuple — it is
    # star-unpacked into the three format slots.
    print('Train Set Accuracy: {} ({}/{})'.format(*id3.accuracy(tree, train_set, target_field)))
    print('Test Set Accuracy: {} ({}/{})'.format(*id3.accuracy(tree, test_set, target_field)))
    # plot_tree(tree, u'Decision Tree')

    # Retrain after removing anomalies detected by kNN.
    filtered_ts = knn.filter_anomalies(train_set, 'Тип объекта', 3, 0.15)
    id3.fit(filtered_ts, tree, target_field, fields)

    print('Train Set Accuracy: {} ({}/{})'.format(*id3.accuracy(tree, filtered_ts, target_field)))
    print('Test Set Accuracy: {} ({}/{})'.format(*id3.accuracy(tree, test_set, target_field)))
Beispiel #3
0
def main():
    """AdaBoost experiment on the bank dataset.

    Boosted ensembles of several sizes are trained and their train/test
    accuracy printed; finally a 100-round ensemble is trained and every
    individual stump's accuracy is reported.
    """
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    # Binarize numeric attributes around the training-set medians.
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    # Start each example at uniform weight.
    for example in trainset:
        example['weight'] = 1 / len(trainset)

    trainLabels = [row[-1] for row in trainraw]
    testLabels = [row[-1] for row in testraw]

    train = []
    test = []
    for rounds in [1, 2, 4, 8, 16, 20, 21]:
        # Reset weights to uniform before every boosting run.
        for example in trainset:
            example['weight'] = 1 / len(trainset)
        ensemble = boost(trainset, labels, label_attr, labels[-1], rounds,
                         trainLabels)
        trainPred = [boostGuess(ensemble, entry, labels,
                                label_attr['outcome'])
                     for entry in trainset]
        testPred = [boostGuess(ensemble, entry, labels,
                               label_attr['outcome'])
                    for entry in testset]

        trainAcc = accuracy(trainPred, trainLabels)
        # NOTE(review): accuracies far below chance are reported as their
        # complement — presumably the ensemble's sign convention can come
        # out inverted; confirm against boost()'s label encoding.
        if trainAcc < 0.3:
            trainAcc = 1 - trainAcc
        testAcc = accuracy(testPred, testLabels)
        if testAcc < 0.3:
            testAcc = 1 - testAcc
        train.append(trainAcc)
        test.append(testAcc)
        print("Boosted decision tree with", rounds,
              'iterations, has a training accuracy of', trainAcc)
        print("Boosted decision tree with", rounds,
              'iterations, has a testing accuracy of', testAcc)

    # One long run: score each stump of the ensemble on its own.
    stumps = boost(trainset, labels, label_attr, labels[-1], 100,
                   trainLabels)
    stumpTrainAcc = []
    stumpTestAcc = []
    for stump in stumps:
        predsTrain = [predict(stump['tree'], entry, labels)
                      for entry in trainset]
        predsTest = [predict(stump['tree'], entry, labels)
                     for entry in testset]
        stumpTrainAcc.append(accuracy(predsTrain, trainLabels))
        stumpTestAcc.append(accuracy(predsTest, testLabels))
    print('Training stumps', stumpTrainAcc)
    print('Testing stumps', stumpTestAcc)
Beispiel #4
0
def main():
    """Depth sweep of ID3 trees on the bank dataset, with and without
    'unknown' replacement.

    Two passes are run: first treating 'unknown' as an ordinary attribute
    value, then on copies where unknowns are replaced (via fixUnknown /
    replaceUnknown). Each pass tries every heuristic at depths 1-16 and
    prints the training and test accuracy.
    """
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    # Binarize numeric attributes around the training-set medians.
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    # Build parallel copies with 'unknown' values replaced.
    fix = fixUnknown(trainset, labels)
    trainsetU = replaceUnknown(trainset, labels, fix)
    testsetU = replaceUnknown(testset, labels, fix)

    trainLabels = [row[-1] for row in trainraw]
    testLabels = [row[-1] for row in testraw]

    print(
        "Running decision tree algorithm on the bank dataset with unknown values"
    )
    heuristics = ['gini', 'entropy', 'ME']
    for heuristic in heuristics:
        for depth in range(1, 17):
            tree = id3(trainset, labels, label_attr, labels[-1], depth,
                       heuristic, None)

            trainPred = [predict(tree, row, labels) for row in trainset]
            testPred = [predict(tree, row, labels) for row in testset]

            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)

            print("Decision tree of depth", depth, "using", heuristic,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)

    print(
        "Running decision tree algorithm on the bank dataset with unknown's replaced"
    )
    print("\n \n \n \n \n")

    for heuristic in heuristics:
        for depth in range(1, 17):
            tree = id3(trainsetU, labels, label_attr, labels[-1], depth,
                       heuristic, None)

            trainPred = [predict(tree, row, labels) for row in trainsetU]
            testPred = [predict(tree, row, labels) for row in testsetU]

            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)

            print("Decision tree of depth", depth, "using", heuristic,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)
Beispiel #5
0
def main():
    """Fit an ID3 tree on the 'close1' dataset and report accuracy before
    and after kNN-based anomaly filtering of the training set.

    Reads two semicolon-separated CSV files, trains once on the raw
    training set, then retrains on a filtered copy; prints train/test
    accuracy for both runs.
    """
    # Show full frames when printing (no column/width truncation).
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)

    train_set = pd.read_csv('data/close1.csv', sep=';')
    test_set = pd.read_csv('data/close1_test2.csv', sep=';')

    # Bug fix: these were Python 2 `print` statements — a SyntaxError
    # under Python 3, which the rest of this function targets (print()
    # calls below). Converted to the function form.
    print(train_set.to_html(classes='table'))
    print(test_set.to_html(classes='table'))

    tree = {}
    target_field = 'Доступность'  # target column ("availability")
    # Every other column is a candidate splitting attribute.
    fields = list(train_set.columns.values)
    fields.remove(target_field)

    id3.fit(train_set, tree, target_field, fields)

    # id3.accuracy returns a (ratio, correct, total) style tuple — it is
    # star-unpacked into the three format slots.
    print('Train Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, train_set, target_field)))
    print('Test Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, test_set, target_field)))
    # plot_tree(tree, u'Decision Tree')

    # Retrain after removing anomalies detected by kNN.
    filtered_ts = knn.filter_anomalies(train_set, 'Тип объекта', 3, 0.15)
    id3.fit(filtered_ts, tree, target_field, fields)

    print('Train Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, filtered_ts, target_field)))
    print('Test Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, test_set, target_field)))
Beispiel #6
0
def main_slow():
    """Bagging / random-forest experiments on the bank dataset.

    Part 1: for attribute-subset sizes k in {2, 4, 6}, trains bagged
    ensembles of increasing size and prints train/test accuracy.
    Part 2: trains 100 bags of 1000 trees each, then estimates the
    bias/variance decomposition of (a) single trees — the first tree of
    each bag — and (b) the full bagged predictors, and prints both.

    NOTE(review): left byte-identical — the result depends on the exact
    order in which rand.choices consumes the RNG stream, so statements
    must not be reordered.
    """
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    # Binarize numeric attributes around the training-set medians.
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    # Bagging uses unit weights (unlike boosting's 1/N).
    for element in trainset:
        element['weight'] = 1

    # The label is the last column of each raw example.
    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]
    treenums = [1, 100, 200, 300, 500, 800, 1000]  # ensemble sizes to try
    for k_val in [2, 4, 6]:
        print("When you set your attribute subset values to", k_val,
              "you get the following.")
        trainAcc = []
        testAcc = []
        for num in treenums:
            treelist = []
            for i in range(num):
                # Bootstrap sample (with replacement, same size as trainset).
                newTraining = rand.choices(trainset, k=len(trainset))
                newTree = rand_id3(newTraining, labels, label_attr, labels[-1],
                                   18, 'entropy', None, k_val)
                treelist.append(newTree)
            trainPred = []
            testPred = []
            for entry in trainset:
                thing = bag_guess(treelist, entry, labels,
                                  label_attr['outcome'])
                trainPred.append(thing)
            for entry in testset:
                thing = bag_guess(treelist, entry, labels,
                                  label_attr["outcome"])
                testPred.append(thing)
            trainAcc.append(accuracy(trainPred, trainLabels))
            testAcc.append(accuracy(testPred, testLabels))
            # Prints the accumulated accuracy lists so far, not just the
            # latest value.
            print(trainAcc, 'train accuracy')
            print(testAcc, 'test accuracy')

    # Part 2: 100 independent bags of 1000 trees each.
    tree_preds = []  # list of 100 ensembles (each 1000 trees)
    basics = []      # first tree of each ensemble, used as "single tree"
    for i in range(100):
        train_i = rand.choices(trainset, k=1000)
        treelist_i = []
        for j in range(1000):
            train_j = rand.choices(train_i, k=1000)
            newTree = rand_id3(train_j, labels, label_attr, labels[-1], 18,
                               'entropy', None, 6)
            treelist_i.append(newTree)
#            if j%100 == 0:
#                print("100 more trees from set", i, 'have been trained.  iteration = ',j)
        tree_preds.append(treelist_i)
        basics.append(treelist_i[0])
        print("Tree set", i, "has been trained")

    # Bias/variance of the single-tree predictor over the 100 bags.
    # Predictions are mapped to {0, 1} against the first outcome value.
    singleVar = []
    singleBias = []
    singleMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for tree in basics:
            guess = predict(tree, entry, labels)
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        ave = guess_agg / len(basics)  # mean prediction across bags
        singleMean.append(ave)
        value = 0
        if entry['outcome'] == label_attr['outcome'][0]:
            value = 1
        bias = (value - ave)**2  # squared bias at this test point
        singleBias.append(bias)
        subVar = []
        for h in predictions:
            mini = (h - ave)**2
            subVar.append(mini)
        # Sample variance (Bessel-corrected) across the 100 predictors.
        var = (1 / (len(basics) - 1)) * sum(subVar)
        singleVar.append(var)

    # Same decomposition for the full bagged predictors.
    bagVar = []
    bagBias = []
    bagMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for trees in tree_preds:
            guess = bag_guess(trees, entry, labels, label_attr['outcome'])
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        # NOTE(review): divides by len(basics) rather than len(tree_preds);
        # both are 100 here so the value is the same — confirm intent.
        ave = guess_agg / len(basics)
        bagMean.append(ave)
        value = 0
        if entry['outcome'] == label_attr['outcome'][0]:
            value = 1
        bias = (value - ave)**2
        bagBias.append(bias)
        subVar = []
        for h in predictions:
            mini = (h - ave)**2
            subVar.append(mini)
        # NOTE(review): same len(basics) vs len(tree_preds) question as above.
        var = (1 / (len(basics) - 1)) * sum(subVar)
        bagVar.append(var)

    # Average over test points; MSE decomposes as bias^2 + variance.
    sVariance = mean(singleVar)
    sBias = mean(singleBias)
    sMSE = sBias + sVariance
    print("The bias and the variance of the single trees are: Variance:",
          sVariance, 'Bias:', sBias, "and the general squared error is:", sMSE)
    bVariance = mean(bagVar)
    bBias = mean(bagBias)
    bMSE = bBias + bVariance
    print("The bias and the variance of the bagged trees are: Variance:",
          bVariance, 'Bias:', bBias, "and the general squared error is:", bMSE)
Beispiel #7
0
# Learning-curve experiment: grow the number of training instances and
# compare pruned vs. unpruned ID3 accuracy, averaged over 5 train/test
# splits. Relies on trainset, testset, validation_set, default,
# upper_limit and outputfile_tr defined earlier in the file (not visible
# in this chunk), and the script may continue past the last line shown.

# NOTE(review): step is a tenth of the FIRST split's size — presumably
# each trainset[experiment] has the same length; confirm upper_limit
# is consistent with this.
step_size = len(trainset[0]) // 10

for length in range(10, upper_limit, step_size):
    print('Number of Training Instances:', length)
    outputfile_tr.write('Number of Training Instances: ' + str(length) + '\n')

    pruned_accuracies = []
    unpruned_accuracies = []

    # Average each measurement over 5 independent splits.
    for experiment in range(5):
        train = trainset[experiment][:length]  # truncated training prefix
        test = testset[experiment]

        # Train, prune against the validation set, then score on test.
        tree = id3.ID3(train, default)
        id3.prune(tree, validation_set)
        acc = id3.accuracy(tree, test)
        pruned_accuracies.append(acc)

        # Retrain from scratch without pruning for the baseline.
        tree = id3.ID3(train, default)
        acc = id3.accuracy(tree, test)
        unpruned_accuracies.append(acc)

    avg_pruned_accuracies = sum(pruned_accuracies) / len(pruned_accuracies)
    avg_unpruned_accuracies = sum(unpruned_accuracies) / len(
        unpruned_accuracies)

    print('  Accuracy for Pruned tree: ' + str(avg_pruned_accuracies))
    print('Accuracy for Unpruned tree: ' + str(avg_unpruned_accuracies))

    outputfile_tr.write('  Accuracy for Pruned tree: ' +
                        str(avg_pruned_accuracies) + '\n')