Example #1
0
def prune(node, dataset):
    """
    Recursively attempts reduced-error pruning below ``node``

    Children are processed first. Once both children of ``node`` are
    leaves, the node is tentatively turned into a leaf carrying its majority
    label; the change is undone if accuracy on ``dataset`` gets worse.

    Parameters
    ----------
    node : Node
        the node being considered for pruning
    dataset: Dataset
        the dataset being used for pruning

    Notes
    -----
    Accuracy is measured through the module-level name ``tree`` (it is not a
    parameter of this function), so that name must be bound before calling.
    """
    if node.is_leaf():
        return

    def children_are_leaves():
        # The right child is checked before the left one.
        return node.right.is_leaf() and node.left.is_leaf()

    # Work bottom-up: let the children collapse before judging this node.
    if not children_are_leaves():
        if node.left.is_node():
            prune(node.left, dataset)
        if node.right.is_node():
            prune(node.right, dataset)

    # Re-check, because the recursive calls may have collapsed the children.
    if not children_are_leaves():
        return

    labels = dataset.labels
    features = dataset.attributes
    evaluator = Evaluator()

    def measure_accuracy():
        predicted = tree.predict(features)
        matrix = evaluator.confusion_matrix(predicted, labels)
        return evaluator.accuracy(matrix)

    baseline = measure_accuracy()

    # Remember everything that is about to be overwritten.
    saved_left_label = node.left.label
    saved_right_label = node.right.label
    saved_rule = node.rule

    # Tentatively prune: the node takes its majority label and loses its rule.
    node.label = node.majority_label
    node.left.label = None
    node.right.label = None
    node.rule = None

    pruned = measure_accuracy()

    # Keep the prune unless accuracy got strictly worse.
    if pruned < baseline:
        node.left.label = saved_left_label
        node.right.label = saved_right_label
        node.label = None
        node.rule = saved_rule
Example #2
0
def cross_validation(x, y, k):
    """
    Train and score one decision tree per fold.

    The data is partitioned by ``data_split``; each fold in turn is held out
    for validation while a fresh classifier is trained on the other folds.

    Returns
    -------
    tuple
        ``(accuracy, classifiers)``: an array with one accuracy per fold and
        an object array holding the classifier trained for each fold.
    """
    x_folds, y_folds = data_split(x, y, k)
    fold_accuracy = np.zeros(k)
    fold_models = np.empty(k, dtype=object)

    for fold in range(k):
        # Held-out fold used for validation.
        x_held_out = x_folds[fold]
        y_held_out = y_folds[fold]

        # Remaining folds flattened into one training set; the reshape sizes
        # every remaining fold like the held-out one.
        n_train_rows = (k - 1) * x_held_out.shape[0]
        x_fit = np.delete(x_folds, fold, 0).reshape(n_train_rows,
                                                    x_held_out.shape[1])
        y_fit = np.delete(y_folds, fold, 0).reshape(n_train_rows, 1)

        # Train on the training slice.
        model = DecisionTreeClassifier()
        model = model.train(x_fit, y_fit)
        fold_models[fold] = model

        # Predict the held-out fold and score it.
        predicted = model.predict(x_held_out)
        evaluator = Evaluator()
        matrix = evaluator.confusion_matrix(predicted, y_held_out)
        fold_accuracy[fold] = evaluator.accuracy(matrix)

    return fold_accuracy, fold_models
Example #3
0
 def getAccuracy(self):
     """Return the classifier's accuracy on the stored validation data."""
     scorer = Evaluator()
     predicted = self.decisionTreeClassifier.predict(self.validationAttrib)
     matrix = scorer.confusion_matrix(predicted, self.validationLabel)
     accuracy = scorer.accuracy(matrix)
     return accuracy
Example #4
0
def old_test():
    """Print accuracy, precision, recall and F1 for a two-sample example."""
    # data_read("data/toy.txt")
    predicted = ["A", "B"]
    expected = ["A", "A"]
    label_order = ["B", "A"]
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predicted, expected, label_order)

    # Accuracy is shown with 15 decimal places.
    print('{0:.15f}'.format(evaluator.accuracy(confusion)))
    print(evaluator.precision(confusion))
    print(evaluator.recall(confusion))
    print(evaluator.f1_score(confusion))
Example #5
0
    def prune_tree_reduced_error(self, tree, x_val, y_val):
        """
        Repeatedly prune the tree, keeping a prune only when validation
        accuracy does not drop.

        Args:
            tree (dict) - tree to be pruned
            x_val (2D array) - attributes of the validation set, one row per
                sample and one column per attribute
            y_val (1D array) - correct labels for the rows of x_val
        Output:
            tree (dict or str) - the tree after all accepted prunes
        """
        classifier = DecisionTreeClassifier()
        classifier.is_trained = True
        # NOTE(review): the baseline is predicted without passing `tree`,
        # unlike the candidate prediction inside the loop - confirm that
        # predict() falls back to the intended tree.
        baseline_predictions = classifier.predict(x_val)
        evaluator = Evaluator()

        def accuracy_of(predicted):
            matrix = evaluator.confusion_matrix(predicted, y_val)
            return evaluator.accuracy(matrix)

        best_accuracy = accuracy_of(baseline_predictions)
        print("Results on Validation set")
        print("Original Accuracy: ", best_accuracy)

        pruned_something = True
        while pruned_something and isinstance(tree, dict):
            # Prune a deep copy so the current tree survives a rejected prune.
            candidate = copy.deepcopy(tree)
            pruned_something, candidate, tree = self.prune(candidate, tree)
            if not pruned_something:
                continue

            # Accept the candidate when it is at least as accurate.
            candidate_accuracy = accuracy_of(
                classifier.predict(x_val, candidate))
            if candidate_accuracy >= best_accuracy:
                best_accuracy = candidate_accuracy
                tree = copy.deepcopy(candidate)

        print("New Accuracy: ", best_accuracy)
        return tree
Example #6
0
def cross_validation(k, filename):
    """
    Performs k-fold cross validation on a dataset

    The file is read as comma-separated text, shuffled in place and split
    into ``k`` folds. Each fold is used once as the test set for a tree
    trained on the remaining folds. The last column holds the label; all
    other columns are converted to ``int`` attributes.

    Parameters
    ----------
    k : int
        number of folds the dataset is split into
    filename : string
        name of the file (inside ``./data/``) to load the dataset from

    Returns
    -------
    list
        the accuracy obtained on each fold
    float
        mean of the per-fold accuracies
    """
    file_path = "./data/" + filename
    dataset = np.loadtxt(file_path, dtype=str, delimiter=',')
    np.random.shuffle(dataset)
    subsets = np.array_split(dataset, k)

    accuracies = []

    for i in range(k):
        # array_split returns a list whose folds may differ in length, so
        # join list slices instead of np.delete(subsets, ...): the latter
        # first coerces the list into one array, which fails for unequal
        # folds on recent NumPy versions.
        train = np.concatenate(subsets[:i] + subsets[i + 1:])
        train_att = train[:, :-1].astype(int)
        train_labels = train[:, -1]

        test = subsets[i]
        test_att = test[:, :-1].astype(int)
        test_labels = test[:, -1]

        tree = DecisionTreeClassifier()
        tree = tree.train(train_att, train_labels)
        prediction = tree.predict(test_att)

        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(prediction, test_labels)
        accuracies.append(evaluator.accuracy(confusion))

    # Mean accuracy over the folds (the name is kept for existing readers).
    global_error_estimate = np.mean(accuracies)
    # Kept from the original: changes NumPy's global float print format.
    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})

    return accuracies, global_error_estimate
Example #7
0
def print_stats(predictions, y_test):
    """Print the confusion matrix and the metrics derived from it."""
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predictions, y_test)

    # Every metric is computed before anything is printed.
    report = [
        ("confusion", confusion),
        ("accuracy", evaluator.accuracy(confusion)),
        ("precision", evaluator.precision(confusion)),
        ("recall", evaluator.recall(confusion)),
        ("f1", evaluator.f1_score(confusion)),
    ]

    for title, value in report:
        print(title, value)
Example #8
0
def main():
    """
    Train a decision tree on a small hard-coded dataset, then print its
    predictions, confusion matrix, accuracy and per-class / macro-averaged
    precision, recall and F1 on a hard-coded test set.
    """
    print("Loading the training dataset...")
    train_x = np.array([
        [5, 7, 1],
        [4, 6, 2],
        [4, 6, 3],
        [1, 3, 1],
        [2, 1, 2],
        [5, 2, 6],
    ])
    train_y = np.array(["A", "A", "A", "C", "C", "C"])

    print("Training the decision tree...")
    classifier = DecisionTreeClassifier()
    classifier = classifier.train(train_x, train_y)

    print("Loading the test set...")
    test_x = np.array([
        [1, 6, 3],
        [0, 5, 5],
        [1, 5, 0],
        [2, 4, 2],
    ])
    test_y = np.array(["A", "A", "C", "C"])

    predictions = classifier.predict(test_x)
    print(f"Predictions: {predictions}")

    # Display names for the per-class rows printed below.
    class_names = ["A", "C"]

    print("Evaluating test predictions...")
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predictions, test_y)

    print("Confusion matrix:")
    print(confusion)

    accuracy = evaluator.accuracy(confusion)
    print()
    print(f"Accuracy: {accuracy}")

    # Each metric call yields (per-class values, macro average).
    per_class_p, macro_p = evaluator.precision(confusion)
    per_class_r, macro_r = evaluator.recall(confusion)
    per_class_f, macro_f = evaluator.f1_score(confusion)

    print()
    print("Class: Precision, Recall, F1")
    rows = zip(per_class_p, per_class_r, per_class_f)
    for idx, (prec, rec, f_score) in enumerate(rows):
        print(f"{class_names[idx]}: {prec:.2f}, {rec:.2f}, {f_score:.2f}")

    print()
    print(f"Macro-averaged Precision: {macro_p:.2f}")
    print(f"Macro-averaged Recall: {macro_r:.2f}")
    print(f"Macro-averaged F1: {macro_f:.2f}")
Example #9
0
def test_DecisionTreeClassifier(dataset_filename: str = "toy.txt",
                                should_load_file=False):
    """
    Train a tree on ``data/<dataset_filename>``, save and visualize it, then
    print this project's metrics next to the reference metric functions for
    ``data/test.txt``.

    When ``should_load_file`` is true the classifier is constructed with the
    path of a previously saved tree file.
    """
    # train
    # The last four characters of the name are dropped (e.g. ".txt").
    stem = dataset_filename[:-4]
    started_at = time.time()
    tree_path = "tree_" + stem + ".obj"
    saved_tree_file = tree_path if should_load_file else None
    cl = DecisionTreeClassifier(saved_tree_file=saved_tree_file)
    dataset = data_read("data/" + dataset_filename)
    unique_lbls = np.unique([entry.label for entry in dataset.entries])
    x, y = dataset.shim_to_arrays()
    cl.train(x, y)
    cl.tree.save_tree(tree_path)
    visualize_tree(cl.tree,
                   save_filename=f"visualize_tree_{stem}.txt",
                   max_depth=8)
    elapsed = time.time() - started_at
    print("duration: ", elapsed)

    # predict
    test_dataset = data_read("data/test.txt")
    x_test, y_test = test_dataset.shim_to_arrays()
    preds = cl.predict(x_test)
    # preds = [random.choice('ACEGOQ')
    #  for _ in range(len(y_test))]  # testing random

    # evaluate
    ev = Evaluator()
    matrix = ev.confusion_matrix(preds, y_test, unique_lbls)
    own_accuracy = '{0:.15f}'.format(ev.accuracy(matrix))
    print("real accuracy: ", accuracy_score(y_test, preds))
    print("\nour calc accuracy: ", own_accuracy)
    print("\n precision:", precision_score(y_test, preds, average="macro"))
    print("\n our precision: ", ev.precision(matrix))
    print("\nreal recall: ", recall_score(y_test, preds, average="macro"))
    print("\n our recall: ", ev.recall(matrix))
    print("\n f1_score", f1_score(y_test, preds, average="macro"))
    print("\n f1_score: ", ev.f1_score(matrix))
    print(matrix)
Example #10
0
    # Ground-truth labels for the four test samples.
    y_test = np.array(["A", "A", "C", "C"])

    predictions = classifier.predict(x_test)
    print("Predictions: {}".format(predictions))

    # Display names for the per-class rows printed below.
    # NOTE(review): assumes the evaluator orders its classes as A, C - confirm.
    classes = ["A", "C"]

    print("Evaluating test predictions...")
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predictions, y_test)

    print("Confusion matrix:")
    print(confusion)

    accuracy = evaluator.accuracy(confusion)
    print()
    print("Accuracy: {}".format(accuracy))

    # Each metric call is unpacked as (per-class values, macro average).
    (p, macro_p) = evaluator.precision(confusion)
    (r, macro_r) = evaluator.recall(confusion)
    (f, macro_f) = evaluator.f1_score(confusion)

    print()
    print("Class: Precision, Recall, F1")
    # One output row per class; classes[i] raises IndexError if the metrics
    # contain more entries than there are names in `classes`.
    for (i, (p1, r1, f1)) in enumerate(zip(p, r, f)):
        print("{}: {:.2f}, {:.2f}, {:.2f}".format(classes[i], p1, r1, f1))

    print()
    print("Macro-averaged Precision: {:.2f}".format(macro_p))
    print("Macro-averaged Recall: {:.2f}".format(macro_r))
Example #11
0
    def calc_stats(self, test_path, path_to_data, plt_title, prune,
                   pruneAggressively):
        """
        Train a tree on ``path_to_data``, optionally prune it, then print and
        plot its metrics on ``test_path``.

        The confusion matrix, accuracy, per-class and macro-averaged
        precision / recall / F1 are printed; the confusion matrix and the
        stacked metric table are passed to the plotting helpers with
        ``plt_title``. Pruning runs only when ``prune == True``, and
        ``pruneAggressively`` is forwarded to ``Prune``.
        """
        # Load training attributes and labels.
        train_set = ClassifierDataset()
        train_set.initFromFile(path_to_data)
        attribs = train_set.attrib
        labels = train_set.labels

        # Load test attributes and labels.
        test_set = ClassifierDataset()
        test_set.initFromFile(test_path)
        test_attribs = test_set.attrib
        test_labels = test_set.labels

        # Train.
        print("TRAINING")
        tree = DecisionTreeClassifier()
        tree.train(attribs, labels)
        print("FINISHED TRAINING")

        # The explicit comparison is kept: only a value equal to True
        # enables pruning.
        if prune == True:
            print("PRUNING")
            validationDataset = ClassifierDataset()
            # NOTE(review): `val_path` is neither a parameter nor a local of
            # this method - confirm it is defined at module level.
            validationDataset.initFromFile(val_path)

            Prune(tree, validationDataset.attrib, validationDataset.labels,
                  pruneAggressively)

            print("FINISHED PRUNING")

        predictions = tree.predict(test_attribs)

        evaluator = Evaluator()
        c_matrix = evaluator.confusion_matrix(predictions, test_labels)
        print(c_matrix)

        class_names = ["A", "C", "E", "G", "O", "Q"]

        # NOTE(review): this run tag is built but never used below.
        run_tag = path_to_data[7:-4]
        if not prune:
            run_tag += "_not_pruned"
        elif pruneAggressively:
            run_tag = run_tag + "_aggressively_pruned"
        else:
            run_tag += "_pruned"

        plot_confusion_matrix(c_matrix, class_names, plt_title)
        print(" ")
        print("Accuracy: " + str(evaluator.accuracy(c_matrix)))
        print(" ")

        precision, macro_p = evaluator.precision(c_matrix)
        recall, macro_r = evaluator.recall(c_matrix)
        f1, macro_f1 = evaluator.f1_score(c_matrix)

        # Append each macro average to its per-class row, then stack the
        # rows in the order precision, recall, F1.
        precision_row = np.append(precision, macro_p)
        recall_row = np.append(recall, macro_r)
        f1_row = np.append(f1, macro_f1)
        performance_matrix = np.vstack((precision_row, recall_row, f1_row))

        print(performance_matrix)
        plot_other_stats(performance_matrix, plt_title)
        '''
        print("Precision: " + str(precision))
        print("Recall: " + str(recall))
        print("F1 Score: " + str(f1))'''

        print(" ")
        print("Macro avg recall:" + str(macro_r))
        print("Macro avg precision:" + str(macro_p))
        print("Macro avg f1:" + str(macro_f1))
        print(" ")
Example #12
0
    print("Loading the test set...")

    filename = "data/test.txt"
    x_test, y_test = classifier.load_data(filename)

    print("\nPredicting on test.txt data with 4 different trees")

    # Load the evaluator and pruning helpers.
    # NOTE(review): `eval` shadows the builtin of the same name.
    eval = Evaluator()
    prune = Pruning()

    print("\nTree 2 unpruned")
    # allow_pickle=True unpickles the file contents - only load trusted files.
    tree_3 = np.load('simple_tree.npy', allow_pickle=True).item()
    # NOTE(review): tree_3 is not passed to predict() here, unlike tree_4
    # below - confirm the classifier's own tree is the one stored in
    # simple_tree.npy.
    predictions = classifier.predict(x_test)
    confusion = eval.confusion_matrix(predictions, y_test)
    accuracy_3 = eval.accuracy(confusion)
    print("number of leaves:", prune.count_leaves(tree_3))
    # Accuracy is printed as a percentage rounded to two decimals.
    print("Tree 2 unpruned Accuracy: " + str(np.round(accuracy_3 * 100, 2)))

    print("\nTree 2 pruned")
    tree_4 = np.load('simple_tree_pruned.npy', allow_pickle=True).item()
    predictions = classifier.predict(x_test, tree_4)
    confusion = eval.confusion_matrix(predictions, y_test)
    accuracy_4 = eval.accuracy(confusion)
    print("number of leaves:", prune.count_leaves(tree_4))
    print("Tree 2 pruned Accuracy: " + str(np.round(accuracy_4 * 100, 2)))

    print("Question 2.3")
    print("Printing the tree")
    classifier.print_tree(tree_3, "Method_2_UnPruned.pdf")
Example #13
0
    # Create and train a tree
    training_dataset = Dataset.load_from_file("train_full.txt")
    tree = DecisionTreeClassifier()
    # train() returns the object used from here on; it is rebound to `tree`.
    tree = tree.train(training_dataset.attributes, training_dataset.labels)
    
    # Print tree before pruning
    tree.print()

    # Evaluate predictions before pruning
    evaluator = Evaluator()
    validation_dataset = Dataset.load_from_file("validation.txt")
    predictions_before = tree.predict(validation_dataset.attributes)
    confusion = evaluator.confusion_matrix(
        predictions_before, validation_dataset.labels)
    accuracy_before = evaluator.accuracy(confusion)
    print(f'Accuracy before: {accuracy_before}')

    # Perform pruning
    # The return value is ignored and the same `tree` object is evaluated
    # again below.
    # NOTE(review): presumably pruning() modifies `tree` in place - confirm.
    pruning(tree, validation_dataset)

    # Print tree after pruning
    tree.print()

    # Evaluate predictions after pruning
    # Both accuracies are measured on the validation dataset that was also
    # passed to pruning().
    predictions_after = tree.predict(validation_dataset.attributes)
    confusion = evaluator.confusion_matrix(
        predictions_after, validation_dataset.labels)
    accuracy_after = evaluator.accuracy(confusion)
    print(f'Accuracy after: {accuracy_after}')
Example #14
0
class DecisionTreePruner(object):
    """
    Greedy, accuracy-driven pruner for a decision tree.

    On every round each prunable node (an internal node whose two children
    are both leaves) is tentatively replaced by a single leaf. The
    replacement giving the highest validation accuracy is made permanent
    when it beats the current accuracy; otherwise pruning stops.

    Parameters
    ----------
    tree
        tree exposing ``root`` and ``predict(features)``
    valid
        validation data exposing ``features`` and ``labels``
    """

    def __init__(self, tree, valid):
        self.tree = tree
        self.valid = valid
        self.eval = Evaluator()

    def _prune_children(self, parent, node, direction):
        """
        Replace ``node`` (the ``direction`` child of ``parent``) by one leaf.

        The leaf predicts whichever of the node's two leaf predictions has
        the larger entry in ``parent.counts``; the right prediction wins
        ties.
        """
        left_pred, right_pred = node.left.prediction, node.right.prediction
        left_count = parent.counts[left_pred]
        right_count = parent.counts[right_pred]

        new_pred = left_pred if left_count > right_count else right_pred
        leaf = LeafNode(str(new_pred), prediction=new_pred)

        if direction == 'L':
            parent.left = leaf
        else:
            parent.right = leaf

    def _unprune_children(self, parent, node, direction):
        """Undo a tentative prune by re-attaching ``node`` to ``parent``."""
        if direction == 'L':
            parent.left = node
        else:
            parent.right = node

    def _get_accuracy(self):
        """Return the tree's current accuracy on the validation data."""
        # Use the validation set stored on the instance; the bare name
        # `valid` is not defined in this scope.
        preds = self.tree.predict(self.valid.features)
        confusion = self.eval.confusion_matrix(preds, self.valid.labels)
        return self.eval.accuracy(confusion)

    def _find_prunable_nodes(self):
        """
        Collect ``(parent, node, direction, depth)`` for every prunable node.

        The search is breadth-first from the root. The root itself is never
        reported because only children of a visited node are tested.
        """
        def prunable(node):
            # A leaf has nothing to collapse; testing this first also avoids
            # reading .left/.right on a leaf.
            if type(node) is LeafNode:
                return False
            return type(node.left) is LeafNode and type(node.right) is LeafNode

        queue = deque([(self.tree.root, 1)])
        prunable_nodes = []
        while queue:
            node, depth = queue.popleft()
            if type(node) is LeafNode:
                continue
            for direction, child in (('L', node.left), ('R', node.right)):
                if prunable(child):
                    prunable_nodes.append((node, child, direction, depth))
                else:
                    queue.append((child, depth + 1))
        return prunable_nodes

    def max_depth(self):
        """Return the depth of the deepest node, counting the root as 0."""
        queue = deque([(self.tree.root, 0)])
        deepest = 0

        while queue:
            node, depth = queue.popleft()
            deepest = max(deepest, depth)
            if type(node) is not LeafNode:
                queue.append((node.left, depth + 1))
                queue.append((node.right, depth + 1))

        return deepest

    def prune_tree(self):
        """
        Prune the tree in place until no single prune improves accuracy.

        The starting accuracy and every improved accuracy are printed.
        """
        unpruned_accuracy = self._get_accuracy()
        print(unpruned_accuracy)
        improved = True

        while improved:
            max_pruned_accuracy = 0
            prunable_nodes = self._find_prunable_nodes()
            # The sort is stable and the comparison below is strict, so the
            # shallowest candidate wins among equally accurate prunes.
            prunable_nodes.sort(key=lambda n: n[3])
            best = None

            for parent, node, direction, _depth in prunable_nodes:
                # Try the prune, measure it, then always restore the node.
                self._prune_children(parent, node, direction)
                pruned_accuracy = self._get_accuracy()

                if pruned_accuracy > max_pruned_accuracy:
                    max_pruned_accuracy = pruned_accuracy
                    best = (parent, node, direction)

                self._unprune_children(parent, node, direction)

            improved = (best is not None
                        and max_pruned_accuracy > unpruned_accuracy)

            if improved:
                parent, best_node, direction = best
                print(max_pruned_accuracy)
                unpruned_accuracy = max_pruned_accuracy

                # Make the best prune of this round permanent.
                self._prune_children(parent, best_node, direction)
Example #15
0
def prune(tree: BinTree):
    """
    Run ten non-aggressive pruning passes over ``tree``.

    Every pass starts at the tree's root node and uses the dataset read from
    ``data/validation.txt``.
    """
    validation = data_read("data/validation.txt")
    features, labels = validation.shim_to_arrays()
    evaluator = Evaluator()

    for attempt in range(1, 11):
        print(f"----prune attempt {attempt}---")
        tree.prune(
            node=tree.root_node,
            og_vld_feats=features,
            og_vld_lbls=labels,
            dataset=validation,
            ev=evaluator,
            is_aggressive=False,
        )


if __name__ == "__main__":
    # Script entry point: build a tree from the noisy training set and
    # report its test accuracy before and after pruning.
    train_file = "train_noisy"
    dataset = data_read(f"data/{train_file}.txt")
    # The second argument is a file name derived from the training set name.
    # NOTE(review): presumably where BinTree saves/loads the tree - confirm.
    tree = BinTree(dataset, f"tree_{train_file}.obj")
    test_dataset = data_read("data/test.txt")

    ev = Evaluator()
    x_test, y_test = test_dataset.shim_to_arrays()
    # predict() is called once per test sample.
    preds = [tree.predict(x) for x in x_test]
    matrix = ev.confusion_matrix(preds, y_test)
    print("test accuracy before pruning:", ev.accuracy(matrix))

    # The same `tree` object is re-evaluated after this call.
    # NOTE(review): presumably prune() modifies it in place - confirm.
    prune(tree)
    preds = [tree.predict(x) for x in x_test]
    matrix = ev.confusion_matrix(preds, y_test)
    print("test accuracy after pruning:", ev.accuracy(matrix))
Example #16
0
def combine_cross_validation(k, filename):
    """
    Trains k trees on cross-validation splits and combines them by voting

    The file is read as comma-separated text, shuffled in place and split
    into ``k`` folds. For each fold a tree is trained on the other folds
    and used to predict the dataset loaded from "test.txt". The final
    prediction for each test sample is the label predicted by the most
    trees; ties go to the label that sorts first.

    Parameters
    ----------
    k : int
        number of folds the dataset is split into (one tree per fold)
    filename : string
        name of the file (inside ``./data/``) to load the dataset from

    Returns
    -------
    accuracy of the majority-vote predictions on "test.txt", as computed by
    ``Evaluator.accuracy``

    Notes
    -----
    The running list of per-tree accuracies on "test.txt" is printed after
    every tree, followed by the winning labels.
    """
    file_path = "./data/" + filename
    dataset = np.loadtxt(file_path, dtype=str, delimiter=',')
    np.random.shuffle(dataset)
    subsets = np.array_split(dataset, k)

    accuracies = []

    # The test set does not depend on the fold, so load it once.
    test = Dataset.load_from_file("test.txt")
    n_test = test.labels.shape[0]
    # Use the builtin `object`: the `np.object` alias no longer exists in
    # current NumPy releases.
    all_predictions = np.zeros((n_test, k), dtype=object)
    won_vote = np.zeros(n_test, dtype=object)

    for i in range(k):
        # array_split returns a list whose folds may differ in length, so
        # join list slices instead of coercing the list through np.delete.
        train = np.concatenate(subsets[:i] + subsets[i + 1:])
        train_att = train[:, :-1].astype(int)
        train_labels = train[:, -1]

        tree = DecisionTreeClassifier()
        tree = tree.train(train_att, train_labels)
        prediction = tree.predict(test.attributes)

        # Column i holds the votes of the i-th tree.
        for index, label in enumerate(prediction):
            all_predictions[index][i] = label

        # Calculate the accuracy of each model and put it into a list.
        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(prediction, test.labels)
        accuracies.append(evaluator.accuracy(confusion))
        print(accuracies)

    # Kept from the original: changes NumPy's global float print format.
    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})

    # Create predictions with the most frequent label from all k models.
    for index, votes in enumerate(all_predictions):
        # Drop cells still holding the initial 0 so only labels are counted.
        votes = np.delete(votes, np.argwhere(votes == 0))

        # Get the label with the highest frequency.
        unique, position = np.unique(votes, return_inverse=True)
        count = np.bincount(position)
        won_vote[index] = unique[count.argmax()]

    # Calculate the accuracy of the combined model.
    print(f'WINNERS: {won_vote}')

    evaluator_w = Evaluator()
    confusion_w = evaluator_w.confusion_matrix(won_vote, test.labels)
    a_w = evaluator_w.accuracy(confusion_w)

    return a_w