Example #1
0
def k_fold_cross_validation(data_set, k, pruning=False):
    """Train and evaluate one decision tree per fold of *data_set*.

    Args:
        data_set: 2-D numpy array; the last column holds integer-coded
            class labels (converted to characters via ``chr``).
        k: number of folds.
        pruning: when True, each tree is pruned on a validation split and
            pre-/post-pruning confusion-matrix summaries are printed.

    Returns:
        Tuple ``(accuracy, best_tree, trees)``: per-fold accuracies, the
        tree with the highest single-fold accuracy, and all k trees.
    """
    accuracy = np.zeros(k)
    best_tree = cls.DecisionTreeClassifier()
    max_accuracy = 0
    trees = []
    pre_prune_matrices = []
    post_prune_matrices = []
    # One evaluator instance, used consistently (the original mixed
    # instance calls with unbound class calls and evaluated each fold twice).
    evaluator = ev.Evaluator()

    for i in range(1, k + 1):
        # Split data into testing/training (plus validation when pruning).
        split = split_set(data_set, k, i, pruning)
        testing = split[0]
        training = split[1]
        training_x = training[:, :-1]
        # Labels are stored as integer codes; convert to characters once.
        training_y = [chr(code) for code in training.T[-1]]
        testing_y = [chr(code) for code in testing.T[-1]]

        # Train this fold's tree.
        tree = cls.DecisionTreeClassifier()
        tree.train(training_x, training_y)
        trees.append(tree)

        if pruning:
            # Record pre-pruning performance before pruning mutates the tree.
            predictions = tree.predict(testing)
            pre_prune_matrices.append(
                evaluator.confusion_matrix(predictions, testing_y))
            validation = split[2]
            tree.prune(
                (validation[:, :-1], [chr(code) for code in validation[:, -1]]))

        predictions = tree.predict(testing)

        # Evaluation metrics — computed exactly once per fold.
        confusion = evaluator.confusion_matrix(predictions, testing_y)
        post_prune_matrices.append(confusion)
        accuracy[i - 1] = evaluator.accuracy(confusion)

        # Save tree with best accuracy.
        if accuracy[i - 1] > max_accuracy:
            best_tree = tree
            max_accuracy = accuracy[i - 1]

    if pruning:
        print("Pre pruning metrics")
        analyseListOfConfMatrix(pre_prune_matrices)
        print("Post pruning results")
        analyseListOfConfMatrix(post_prune_matrices)

    return accuracy, best_tree, trees
Example #2
0
    def k_fold_cv(self, x, y, k, scoring):
        """Run k-fold cross validation and score each fold's model.

        Args:
            x: feature matrix (samples as rows).
            y: label vector, same length as x.
            k: number of folds; clamped to len(y) and must end up >= 2.
            scoring: one of "accuracy", "precision", "recall" or "f1"
                (for the last three, the macro-average returned by the
                corresponding evaluator method is used).

        Returns:
            Tuple ``(mean_score, std_score, top_model)`` where top_model
            is the classifier from the best-scoring fold.
        """
        # Cannot have more folds than samples.
        k = min(k, len(y))

        if k < 2:
            print("k must be at least 2 as the minimum number of splits is 2")
            exit(0)

        splits = self.k_split(x, y, k)
        scores = np.array([])
        top_eval = 0
        top_model = None

        for idx, fold in enumerate(splits):
            # Held-out fold: transposed so the last row is the label row.
            x_test = fold.T[:-1]
            y_test = fold.T[-1]

            # Stack every other fold into the training set.
            # (The original also built an unused `test_set` array.)
            train_set = None
            for i in range(len(splits)):
                if i != idx:
                    if train_set is not None:
                        train_set = np.vstack((train_set, splits[i]))
                    else:
                        train_set = splits[i]

            x_train = train_set.T[:-1]
            y_train = train_set.T[-1]

            model = cl.DecisionTreeClassifier()
            model.train(x_train.T, y_train)
            conf = self.confusion_matrix(model.predict(x_test.T), y_test)
            avg = None

            if scoring == "accuracy":
                avg = self.accuracy(conf)
            elif scoring == "precision":
                _, avg = self.precision(conf)
            elif scoring == "recall":
                _, avg = self.recall(conf)
            elif scoring == "f1":
                _, avg = self.f1_score(conf)
            else:
                print(
                    "Invalid scoring metric. Please enter accuracy, precision, recall or f1"
                )
                exit(0)

            # BUG FIX: the original assigned `top_nodel` (typo), so the best
            # model was never stored and this method always returned None.
            if top_eval < avg:
                top_eval = avg
                top_model = model

            scores = np.append(scores, avg)

        return np.mean(scores), np.std(scores), top_model
Example #3
0
    def print_eval(train_data, test_data):
        """Train a tree on *train_data*, evaluate it against *test_data*
        and print the confusion matrix plus the standard metrics."""
        # Parse the training file and fit the tree.
        parsed = dr.parseFile(train_data)
        tree = cp.DecisionTreeClassifier()
        tree.train(parsed[0], parsed[1])

        # Parse the test file and predict on the merged representation.
        truth = dr.parseFile(test_data)
        xtruth, ytruth = truth[0], truth[1]
        merged = dr.mergeAttributesAndCharacteristics(xtruth, ytruth)
        predictions = tree.predict(merged)

        # Report every evaluation metric against the ground-truth labels.
        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(ytruth, predictions)
        print("Confusion" + "\n" + str(confusion))
        print("Accuracy: " + str(evaluator.accuracy(confusion)))
        print("Recall: " + str(evaluator.recall(confusion)))
        print("Precision: " + str(evaluator.precision(confusion)))
        print("F1score: " + str(evaluator.f1_score(confusion)))
Example #4
0
repeats = 3

# Dataset file locations (kept at module level; other code may reference them).
pathToSimple1 = './data/simple1.txt'
pathToSimple2 = './data/simple2.txt'
pathToTest = './data/test.txt'
pathToToy = './data/toy.txt'
pathToToy2 = './data/toy2.txt'
pathToFull = './data/train_full.txt'
pathToNoisy = './data/train_noisy.txt'
pathToSub = './data/train_sub.txt'
pathToValid = './data/validation.txt'

dataset = ds.ClassifierDataset()
dataset.initFromFile(pathToFull)

dtc = cs.DecisionTreeClassifier()

# Profile `repeats` training runs per dataset, sorted by internal time.
# (The original repeated this load/print/profile stanza three times verbatim.)
for label, path in (("FULL", pathToFull), ("SUB", pathToSub),
                    ("NOISY", pathToNoisy)):
    dataset.initFromFile(path)
    if label != "FULL":
        # Visual separator between dataset sections, as before.
        print("\n\n\n=====\n\n\n\n")
    print(label)
    for i in range(repeats):
        # cProfile.run evaluates the command string in __main__'s globals,
        # so it sees the module-level `dtc` and `dataset` rebound above.
        cProfile.run('dtc.train(dataset.attrib, dataset.labels)', None, 'time')
Example #5
0
        return np.mean(scores), np.std(scores), top_model


if __name__ == "__main__":
    # Evaluate a previously serialised model on the held-out test set.
    # Renamed `eval` -> `evaluator` (shadowed the builtin) and
    # `pickle` -> `model_path` (shadowed the stdlib module name).
    evaluator = Evaluator()
    pickles = [
        "data/model_full.pickle", "data/model_noisy.pickle",
        "data/model_sub.pickle"
    ]
    models = [
        cl.DecisionTreeClassifier(),
        cl.DecisionTreeClassifier(),
        cl.DecisionTreeClassifier()
    ]
    test = cl.Dataset("data/test.txt")
    # Index 2 selects the model serialised from train_sub.txt.
    model, model_path = models[2], pickles[2]
    model.deserialise_model(model_path)
    preds = model.predict(test.features)
    confusion = evaluator.confusion_matrix(preds, test.labels)
    print(confusion)
    print()
    print(evaluator.accuracy(confusion))
    print()
    print(evaluator.precision(confusion))
    print()
    print(evaluator.recall(confusion))
Example #6
0
    #                  Question 3.3
    # 10-fold cross validation over the full training data; returns per-fold
    # accuracies, the single best tree, and all k trained trees.
    k = 10
    accuracy, best_tree, k_trees = k_fold_cross_validation(full_data, k)

    # Print Accuracies and Standard Deviations for Question 3.3
    print("Accuracy: " + str(round(accuracy.mean(), 4)))
    print("Standard Deviation: " + str(round(accuracy.std(), 4)))

    # Question 3.4
    # Split features from the label column; labels are integer-coded and
    # converted to characters via chr (same convention as the k-fold helper).
    x = full_data[:, :-1]
    y = [chr(i) for i in full_data.T[-1]]
    testing_y = [chr(i) for i in test_data.T[-1]]

    # Train tree on train_full.txt
    full_trained = cls.DecisionTreeClassifier()
    full_trained.train(x, y)

    # Generate predictions
    # NOTE(review): predict receives the full test_data (labels included),
    # unlike train which gets features only — presumably predict ignores the
    # label column; confirm against DecisionTreeClassifier.predict.
    full_predict = full_trained.predict(test_data)
    cross_predict = best_tree.predict(test_data)

    # Print results
    print_results(full_predict, testing_y, "Fully Trained")
    print_results(cross_predict, testing_y, "K-Fold Trained")

    #                    Question 3.5

    # Get predictions for each tree trained in 3.3, k_trees
    # (combined into a single prediction per sample, per k_decision_trees).
    k_predict = k_decision_trees(test_data, k, k_trees)
    print_results(k_predict, testing_y, "K-Fold Mode Predict")