Esempio n. 1
0
    def calculate_best_pruned_tree(self, original_tree, trees, x_val, y_val):
        """Select one of the candidate pruned trees using validation data.

        For every candidate ``trees[j]`` with ``j >= 1`` the method computes

            alpha = (original_error - error_j) / (original_leaves - leaves_j)

        where the error rates come from ``self.get_apperent_error_rate`` on
        the validation predictions and the leaf counts come from
        ``self.count_leaves``. The last candidate whose ``alpha`` is strictly
        positive is selected.

        Args:
            original_tree: the unpruned tree used as the baseline.
            trees: indexable collection of candidate trees. ``trees[0]`` is
                never scored; it is returned when no candidate qualifies.
            x_val: validation attributes handed to ``predict``.
            y_val: validation labels handed to ``get_apperent_error_rate``.

        Returns:
            tuple: ``(selected_tree, alpha)``. ``alpha`` is 0 when the
            fallback ``trees[0]`` is returned.
        """
        classifier = DecisionTreeClassifier()
        # The classifier itself holds no tree; every tree is passed to
        # predict() explicitly. Presumably the flag stops predict() from
        # rejecting an untrained instance -- confirm against predict().
        classifier.is_trained = True

        original_predictions = classifier.predict(x_val, original_tree)
        original_error = self.get_apperent_error_rate(original_predictions,
                                                      y_val)
        # Loop-invariant, so it is computed once instead of per candidate.
        original_number_leaves = self.count_leaves(original_tree)

        stored_j = 0
        stored_alpha = 0

        for j in range(1, len(trees)):
            predictions = classifier.predict(x_val, trees[j])
            error = self.get_apperent_error_rate(predictions, y_val)
            leaves_removed = (original_number_leaves -
                              self.count_leaves(trees[j]))
            if leaves_removed == 0:
                # Same number of leaves as the original: alpha is undefined
                # (division by zero), so this candidate cannot be scored.
                continue

            alpha = (original_error - error) / leaves_removed
            # NOTE(review): the original compared (0 - alpha) against a
            # baseline of 0 that was never updated, which is exactly
            # "alpha > 0"; every qualifying candidate therefore overwrites
            # the previous choice. Confirm whether "largest alpha" was meant.
            if alpha > 0:
                stored_j = j
                stored_alpha = alpha

        return trees[stored_j], stored_alpha
    def __crossValidation(self, subsets):
        """Train one classifier per fold and return the best of them.

        Each element of ``subsets`` is used once as the validation fold while
        the remaining folds are concatenated into the training data. Every
        trained classifier is stored together with
        ``[accuracy, macro precision, macro recall, macro F1]`` measured on
        its validation fold, and ``self.getBestModel`` picks the result.

        Args:
            subsets: list of folds; element 0 of a fold holds its features
                and element 1 its labels.

        Returns:
            Whatever ``self.getBestModel`` returns for the candidate list.
        """
        models = []

        for i, validationSet in enumerate(subsets):
            # Use the remaining folds for training. A single concatenate
            # copies the data once, instead of re-copying the accumulated
            # array for every fold as reduce + np.append did.
            training_folds = subsets[:i] + subsets[(i + 1):]
            features = np.concatenate([fold[0] for fold in training_folds],
                                      axis=0)
            labels = np.concatenate([fold[1] for fold in training_folds],
                                    axis=0)

            classifier = DecisionTreeClassifier()
            classifier.train(features, labels)

            # Evaluate the classifier on the held-out fold
            validationFeatures = validationSet[0]
            validationLabels = validationSet[1]
            predictions = classifier.predict(validationFeatures)
            evaluator = Evaluator()
            confusion = evaluator.confusion_matrix(predictions,
                                                   validationLabels)
            accuracy = evaluator.accuracy(confusion)
            # Index 1 of each metric result is the macro-averaged value.
            macroP = evaluator.precision(confusion)[1]
            macroR = evaluator.recall(confusion)[1]
            macroF = evaluator.f1_score(confusion)[1]

            # Add the model to the candidate models
            models.append((classifier, [accuracy, macroP, macroR, macroF]))

        # Return the best model
        return self.getBestModel(models)
Esempio n. 3
0
def cross_validation(k, filename):
    """
    Performs k-fold cross validation on a dataset

    The file is read from ``./data/<filename>`` as comma-separated strings,
    shuffled in place, and split into ``k`` folds. Each fold is used once as
    the test split while a fresh tree is trained on the remaining folds. The
    last column of every row is the label; the other columns are cast to int.

    Parameters
    ----------
    k : int
        number of folds the dataset is split into
    filename : string
        name of the file (inside ``./data/``) to load the dataset from

    Returns
    -------
    list
        accuracy obtained on each fold
    float
        mean of the per-fold accuracies (kept under the original name
        ``global_error_estimate``)

    Notes
    -----
    As a side effect, NumPy's global float print format is set to four
    decimals.
    """
    file_path = "./data/" + filename
    dataset = np.loadtxt(file_path, dtype=str, delimiter=',')
    np.random.shuffle(dataset)
    subsets = np.array_split(dataset, k)

    accuracies = []

    for i in range(k):
        # Join every fold except fold i. Slicing the list keeps this working
        # when the folds have unequal lengths; np.delete would first have to
        # turn the list into one (possibly ragged) array, which current NumPy
        # rejects.
        train = np.concatenate(subsets[:i] + subsets[i + 1:])
        train_att = train[:, :-1].astype(int)
        train_labels = train[:, -1]

        test = subsets[i]
        test_att = test[:, :-1].astype(int)
        test_labels = test[:, -1]

        tree = DecisionTreeClassifier()
        # train() is used for its return value, as in the original code.
        tree = tree.train(train_att, train_labels)
        prediction = tree.predict(test_att)

        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(prediction, test_labels)
        accuracies.append(evaluator.accuracy(confusion))

    global_error_estimate = np.mean(accuracies)
    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})

    return accuracies, global_error_estimate
Esempio n. 4
0
def cross_validation(x, y, k):
    """Run k-fold cross validation and keep every trained classifier.

    The data is partitioned by ``data_split``; each partition serves once as
    the validation fold while the other ``k - 1`` partitions are reshaped
    into a single training set.

    Returns:
        tuple: ``(accuracy, classifiers)`` where ``accuracy`` is a float
        array of length ``k`` and ``classifiers`` an object array holding
        the classifier trained for each fold.
    """
    xpart, ypart = data_split(x, y, k)
    classifiers = np.empty(k, dtype=object)
    accuracy = np.zeros(k)

    for fold in range(k):
        # held-out partition for this round
        xval, yval = xpart[fold], ypart[fold]

        # remaining partitions flattened into one training set; the reshape
        # relies on every partition having as many rows as the held-out one
        n_train_rows = (k - 1) * xval.shape[0]
        xtrain = np.delete(xpart, fold, 0).reshape(n_train_rows,
                                                   xval.shape[1])
        ytrain = np.delete(ypart, fold, 0).reshape(n_train_rows, 1)

        # fit on the training slice (train() supplies the stored object)
        classifiers[fold] = DecisionTreeClassifier().train(xtrain, ytrain)

        # score the held-out partition
        fold_predictions = classifiers[fold].predict(xval)
        evaluator = Evaluator()
        fold_confusion = evaluator.confusion_matrix(fold_predictions, yval)
        accuracy[fold] = evaluator.accuracy(fold_confusion)

    return accuracy, classifiers
Esempio n. 5
0
def main():
    """Train a tree on a six-sample toy set and print its test metrics.

    Prints the predictions for a four-sample test set, the confusion matrix,
    the accuracy, and the per-class and macro-averaged precision, recall
    and F1.
    """
    print("Loading the training dataset...")
    train_attributes = [[5, 7, 1], [4, 6, 2], [4, 6, 3],
                        [1, 3, 1], [2, 1, 2], [5, 2, 6]]
    train_labels = ["A", "A", "A", "C", "C", "C"]
    x, y = np.array(train_attributes), np.array(train_labels)

    print("Training the decision tree...")
    classifier = DecisionTreeClassifier().train(x, y)

    print("Loading the test set...")
    x_test = np.array([[1, 6, 3], [0, 5, 5], [1, 5, 0], [2, 4, 2]])
    y_test = np.array(["A", "A", "C", "C"])

    predictions = classifier.predict(x_test)
    print("Predictions: {}".format(predictions))

    # Row labels for the per-class table printed below.
    classes = ["A", "C"]

    print("Evaluating test predictions...")
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predictions, y_test)
    print("Confusion matrix:")
    print(confusion)

    accuracy = evaluator.accuracy(confusion)
    print()
    print("Accuracy: {}".format(accuracy))

    # Each metric call yields (per-class values, macro average).
    p, macro_p = evaluator.precision(confusion)
    r, macro_r = evaluator.recall(confusion)
    f, macro_f = evaluator.f1_score(confusion)

    print()
    print("Class: Precision, Recall, F1")
    row_template = "{}: {:.2f}, {:.2f}, {:.2f}"
    for i, class_scores in enumerate(zip(p, r, f)):
        print(row_template.format(classes[i], *class_scores))

    print()
    print("Macro-averaged Precision: {:.2f}".format(macro_p))
    print("Macro-averaged Recall: {:.2f}".format(macro_r))
    print("Macro-averaged F1: {:.2f}".format(macro_f))
    def decision_tree_classification(self):
        """Preprocess the data, run decision-tree classification, show results.

        The accuracy and the classification report are shown in message
        boxes; accuracy, report and confusion matrix are also printed.
        """
        # Task-specific preprocessing works through a shared DataPreProcessor
        # instance, whose variables are read back afterwards.
        preprocessor = DataPreProcessor(self.df_variable, self.pd_variable)
        task_preprocessor = DataPreProcessorForTask(preprocessor)
        task_preprocessor.preprocess_for_decision_tree_classification()
        classification_pd_variable = preprocessor.pd_variable
        classification_df_variable = preprocessor.df_variable

        # Classify on the preprocessed df variable and collect the results
        # the classifier exposes as attributes.
        dt_classifier = DecisionTreeClassifier(classification_df_variable)
        dt_classifier.classify()
        accuracy = dt_classifier.accuracy
        classification_report = dt_classifier.classification_report
        confusion_matrix = dt_classifier.confusion_matrix

        # Message boxes for accuracy and report (not the confusion matrix).
        messagebox.showinfo("accuracy", accuracy)
        messagebox.showinfo("classification_report", classification_report)

        for result in (accuracy, classification_report, confusion_matrix):
            print(result)
Esempio n. 7
0
    def prune_tree_reduced_error(self, tree, x_val, y_val):
        """
        Function to accept prunes which do not lower the tree's accuracy on
        the validation set, otherwise ignore them
        Args:
            tree (dict) - tree to be pruned
            x_val (2D array) - 2D array of attributes of validation set where
                each row is a different sample and each column is a different
                attribute
            y_val (1D array) - 1D array of correct labels for x_val validation
                data
        Output:
            tree (dict or str) - tree pruned such that any additional pruning
                would lower predictive accuracy on validation set.
        """
        classifier = DecisionTreeClassifier()
        # The classifier holds no tree of its own; trees are passed to
        # predict() explicitly.
        classifier.is_trained = True
        # Baseline accuracy must be measured on the tree being pruned, so the
        # tree is passed here just as it is for every pruned candidate below.
        predictions = classifier.predict(x_val, tree)
        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(predictions, y_val)
        root_accuracy = evaluator.accuracy(confusion)
        print("Results on Validation set")
        print("Original Accuracy: ", root_accuracy)

        is_pruned = True
        while is_pruned and isinstance(tree, dict):
            # make copy of tree then attempt to prune copy
            tree_copy = copy.deepcopy(tree)
            (is_pruned, tree_copy, tree) = self.prune(tree_copy, tree)
            if is_pruned:
                # compare accuracy of pruned tree to the best so far
                new_predictions = classifier.predict(x_val, tree_copy)
                new_confusion = evaluator.confusion_matrix(new_predictions,
                                                           y_val)
                new_accuracy = evaluator.accuracy(new_confusion)
                if new_accuracy >= root_accuracy:
                    # greater or equal accuracy: adopt the pruned copy
                    root_accuracy = new_accuracy
                    tree = copy.deepcopy(tree_copy)

        print("New Accuracy: ", root_accuracy)
        return tree
Esempio n. 8
0
def test_DecisionTreeClassifier(dataset_filename: str = "toy.txt",
                                should_load_file=False):
    """Train on ``data/<dataset_filename>`` and compare metrics on the test set.

    The trained tree is saved to ``tree_<name>.obj`` and a text visualisation
    to ``visualize_tree_<name>.txt``, where ``<name>`` is the dataset file
    name without its extension. Predictions on ``data/test.txt`` are then
    scored both with the reference metric functions and with the project's
    ``Evaluator`` so the two can be compared side by side.

    Args:
        dataset_filename: training file name inside ``data/``.
        should_load_file: when true, the classifier is constructed with the
            previously saved tree file name.
    """
    import os

    # train
    # splitext handles any extension length (or none), unlike a fixed [:-4].
    extless_filename = os.path.splitext(dataset_filename)[0]
    tree_filename = "tree_" + extless_filename + ".obj"
    start = time.time()
    saved_tree_file = tree_filename if should_load_file else None
    cl = DecisionTreeClassifier(saved_tree_file=saved_tree_file)
    dataset = data_read("data/" + dataset_filename)
    unique_lbls = np.unique([e.label for e in dataset.entries])
    x, y = dataset.shim_to_arrays()
    cl.train(x, y)
    cl.tree.save_tree(tree_filename)
    visualize_tree(cl.tree,
                   save_filename=f"visualize_tree_{extless_filename}.txt",
                   max_depth=8)
    # Covers loading, training, saving and visualising.
    duration = time.time() - start
    print("duration: ", duration)

    # predict
    test_dataset = data_read("data/test.txt")
    x_test, y_test = test_dataset.shim_to_arrays()
    preds = cl.predict(x_test)

    # evaluate: "real" lines use the reference metrics, "our" lines use
    # the project's Evaluator
    ev = Evaluator()
    matrix = ev.confusion_matrix(preds, y_test, unique_lbls)
    print("real accuracy: ", accuracy_score(y_test, preds))
    print("\nour calc accuracy: ", '{0:.15f}'.format(ev.accuracy(matrix)))
    print("\nreal precision: ",
          precision_score(y_test, preds, average="macro"))
    print("\nour precision: ", ev.precision(matrix))
    print("\nreal recall: ", recall_score(y_test, preds, average="macro"))
    print("\nour recall: ", ev.recall(matrix))
    print("\nreal f1_score: ", f1_score(y_test, preds, average="macro"))
    print("\nour f1_score: ", ev.f1_score(matrix))
    print(matrix)
Esempio n. 9
0
import numpy as np

from classification import DecisionTreeClassifier
from data.Dataset import Dataset
from eval import Evaluator

if __name__ == "__main__":
    # Read the full training set through the Dataset helper.
    print("Loading the training dataset...")
    dataset = Dataset()
    dataset.readData("data/train_full.txt")
    x, y = dataset.features, dataset.labels

    # train() supplies the classifier object used from here on.
    print("Training the decision tree...")
    classifier = DecisionTreeClassifier().train(x, y)

    # First rendering of the trained tree, followed by blank spacing.
    classifier.print()
    print("\n")

    # Second rendering, padded with blank lines before and after.
    print("Tree visualisation graphically")
    for _ in range(3):
        print("\n")
    classifier.printImageTree()
    for _ in range(4):
        print("\n")
Esempio n. 10
0
##############################################################################

import numpy as np

from classification import DecisionTreeClassifier
from eval import Evaluator

if __name__ == "__main__":
    # Six training samples with three attributes each, labelled A or C.
    print("Loading the training dataset...")
    train_attributes = [[5, 7, 1], [4, 6, 2], [4, 6, 3],
                        [1, 3, 1], [2, 1, 2], [5, 2, 6]]
    train_labels = ["A", "A", "A", "C", "C", "C"]
    x, y = np.array(train_attributes), np.array(train_labels)

    # train() supplies the classifier object used for prediction.
    print("Training the decision tree...")
    classifier = DecisionTreeClassifier().train(x, y)

    # Four held-out samples with their expected labels.
    print("Loading the test set...")
    x_test = np.array([[1, 6, 3], [0, 5, 5], [1, 5, 0], [2, 4, 2]])
    y_test = np.array(["A", "A", "C", "C"])

    predictions = classifier.predict(x_test)
    print("Predictions: {}".format(predictions))

    classes = ["A", "C"]

    print("Evaluating test predictions...")
    evaluator = Evaluator()
Esempio n. 11
0
    def calc_stats(self, test_path, path_to_data, plt_title, prune,
                   pruneAggressively):
        """Train a tree, optionally prune it, and report test-set statistics.

        Prints and plots the confusion matrix, prints the accuracy, and
        prints and plots a 3-row matrix of precision, recall and F1 (each row
        holding the per-class values followed by the macro average).

        Args:
            test_path: file the test dataset is loaded from.
            path_to_data: file the training dataset is loaded from.
            plt_title: title forwarded to both plotting helpers.
            prune: when truthy, the trained tree is pruned before predicting.
            pruneAggressively: forwarded to ``Prune`` as its last argument.
        """
        # load training and test datasets (attributes and labels)
        d_subset = ClassifierDataset()
        d_subset.initFromFile(path_to_data)
        attribs = d_subset.attrib
        labels = d_subset.labels

        ds_test = ClassifierDataset()
        ds_test.initFromFile(test_path)
        test_attribs = ds_test.attrib
        test_labels = ds_test.labels

        # train and predict
        print("TRAINING")
        tree = DecisionTreeClassifier()
        tree.train(attribs, labels)

        print("FINISHED TRAINING")
        if prune:
            print("PRUNING")
            validationDataset = ClassifierDataset()
            # NOTE(review): val_path is neither a parameter nor a local of
            # this method; it must exist as a module-level name, otherwise
            # this line raises NameError. Confirm where it is defined.
            validationDataset.initFromFile(val_path)

            Prune(tree, validationDataset.attrib, validationDataset.labels,
                  pruneAggressively)

            print("FINISHED PRUNING")

        predictions = tree.predict(test_attribs)

        evaluator = Evaluator()
        c_matrix = evaluator.confusion_matrix(predictions, test_labels)
        print(c_matrix)

        # Class names used as axis labels of the confusion-matrix plot.
        class_labels = ["A", "C", "E", "G", "O", "Q"]
        plot_confusion_matrix(c_matrix, class_labels, plt_title)
        print(" ")
        print("Accuracy: " + str(evaluator.accuracy(c_matrix)))
        print(" ")

        precision, macro_p = evaluator.precision(c_matrix)
        recall, macro_r = evaluator.recall(c_matrix)
        f1, macro_f1 = evaluator.f1_score(c_matrix)

        # Append each macro average to its per-class values so every row of
        # the performance matrix ends with the macro figure.
        p = np.append(precision, macro_p)
        r = np.append(recall, macro_r)
        f1 = np.append(f1, macro_f1)

        performance_matrix = np.vstack((p, r, f1))
        print(performance_matrix)
        plot_other_stats(performance_matrix, plt_title)

        print(" ")
        print("Macro avg recall:" + str(macro_r))
        print("Macro avg precision:" + str(macro_p))
        print("Macro avg f1:" + str(macro_f1))
        print(" ")
if __name__ == "__main__":
    # Six training samples with three attributes each, labelled A or C.
    print("Loading the training dataset...")
    train_attributes = [[5, 7, 1], [4, 6, 2], [4, 6, 3],
                        [1, 3, 1], [2, 1, 2], [5, 2, 6]]
    x = np.array(train_attributes)
    y = np.array(["A", "A", "A", "C", "C", "C"])

    # This variant fits in place; the return value of fit() is not used.
    print("Training the decision tree...")
    classifier = DecisionTreeClassifier()
    classifier.fit(x, y)

    # Four held-out samples with their expected labels.
    print("Loading the test set...")
    test_attributes = [[1, 6, 3], [0, 5, 5], [1, 5, 0], [2, 4, 2]]
    x_test = np.array(test_attributes)
    y_test = np.array(["A", "A", "C", "C"])

    print("Making predictions on the test set...")
    predictions = classifier.predict(x_test)
Esempio n. 13
0
        np.set_printoptions(formatter={'float': '{: 0.4f}'.format})
        macro_f = round(macro_f, 4)

        return (f, macro_f)


# Example usage
if __name__ == "__main__":

    # Load both datasets, then set up the evaluator used for all metrics
    training_dataset = Dataset.load_from_file("train_full.txt")
    test_dataset = Dataset.load_from_file("test.txt")
    evaluator = Evaluator()

    # Fit a tree on the training data and predict the test attributes with
    # the same tree object; the value returned by train() is kept separately
    tree = DecisionTreeClassifier()
    train_attributes = training_dataset.attributes
    train_labels = training_dataset.labels
    trained_tree = tree.train(train_attributes, train_labels)
    prediction = tree.predict(test_dataset.attributes)

    # Confusion matrix of the predictions against the true test labels
    confusion = evaluator.confusion_matrix(prediction, test_dataset.labels)
    print(f'Confusion matrix: {confusion}')

    # Accuracy derived from the confusion matrix
    a = evaluator.accuracy(confusion)
    print(f'Accuracy: {a}')

    # Precision comes back as (per-class values, macro average); only the
    # per-class values are printed here
    p, macro_p = evaluator.precision(confusion)
    print(f'Precision: {p}')
Esempio n. 14
0
    print("accuracy", accuracy)
    print("precision", precision)
    print("recall", recall)
    print("f1", f1)

    return


if __name__ == "__main__":

    # QUESTION 1: read the full training set through the classifier's loader
    print("Question 1")
    print("Loading the data")

    classifier = DecisionTreeClassifier()
    filename = "data/train_full.txt"
    x, y = classifier.load_data(filename)

    # QUESTION 2: train; train() supplies the object used from here on
    print("Question 2")
    print("Training the tree with two different methods")

    print("Training the decision tree...")
    classifier = classifier.train(x, y)

    # The test set is read with the same loader as the training set
    print("Loading the test set...")

    filename = "data/test.txt"
    x_test, y_test = classifier.load_data(filename)

    print("\nPredicting on test.txt data with 4 different trees")
from classification import DecisionTreeClassifier

# Script: train one decision tree per fold and collect each tree's
# predictions on the test set.
# NOTE(review): `reader` and `metrics` are used here but not imported in the
# visible part of this snippet -- confirm they are imported elsewhere.
x_train, y_train = reader.read_from_csv("data/train_full.txt")
x_test, y_test = reader.read_from_csv("data/test.txt")

k = 10  # number of folds, and therefore number of trees trained
seed = 42  # passed to the fold splitter; presumably makes the split repeatable
folds = metrics.split_k_fold(x_train, k, seed)
trees = []  # Holds all k trees trained using train sets selected by folds

# Train one tree per fold

for fold in range(k):

    # Create a new tree, train it and store it in trees collection.
    # train_and_eval_kth receives the untrained tree and returns the value
    # reported below as the validation accuracy; new_tree is used for
    # prediction right after, so it is presumably trained in place.
    new_tree = DecisionTreeClassifier()
    new_tree_accuracy = metrics.train_and_eval_kth(new_tree, folds, x_train,
                                                   y_train, fold)
    trees.append(new_tree)

    # Print out the result for this fold (validation score, then the
    # accuracy of the same tree on the test set)
    print("tree accuracy (validation)", fold, " = ", new_tree_accuracy)
    print("tree accuracy (test)", fold, " = ",
          metrics.accuracy(new_tree.predict(x_test), y_test))

# Collect the test-set predictions of every fold's tree

predictions = []
for tree in trees:
    prediction = tree.predict(x_test)
    predictions.append(prediction)
import numpy as np
import reader
import metrics
from classification import DecisionTreeClassifier

# Script: compare three pruning set-ups by their accuracy on the test set.

# Load data (training, validation and test splits)

x, y = reader.read_from_csv("data/train_full.txt")
x_val, y_val = reader.read_from_csv("data/validation.txt")
x_test, y_test = reader.read_from_csv("data/test.txt")

# 1. Train grid search on prepruning
# The grid covers 3 x 3 combinations of maximum depth and minimum sample
# size. tree_preprune is used for prediction right after the search, so
# grid_search presumably leaves it trained -- confirm in metrics.grid_search.

tree_preprune = DecisionTreeClassifier()
options = {"max_tree_depth": [13, 15, 17], "min_sample_size": [2, 3, 4]}
print(metrics.grid_search(tree_preprune, x, y, x_val, y_val, options))
acc1 = metrics.accuracy(tree_preprune.predict(x_test), y_test)

# 2. Train existing classifier on postpruning (call post prune on 1)
# The same object from step 1 is pruned, so acc2 measures pre-pruning plus
# post-pruning combined.

tree_preprune.prune(x_val, y_val)
acc2 = metrics.accuracy(tree_preprune.predict(x_test), y_test)

# 3. Train new classifier on postpruning
# A fresh tree fitted with default settings, then pruned with the
# validation split.

tree_postprune = DecisionTreeClassifier()
tree_postprune.fit(x, y)
tree_postprune.prune(x_val, y_val)
acc3 = metrics.accuracy(tree_postprune.predict(x_test), y_test)

print("")
Esempio n. 17
0
def combine_cross_validation(k, filename):
    """
    Trains k cross-validation models and combines them by majority vote

    The file is read from ``./data/<filename>`` as comma-separated strings,
    shuffled in place and split into ``k`` folds. For each fold a tree is
    trained on the other ``k - 1`` folds and used to predict the labels of
    the dataset loaded from ``test.txt``. The final prediction for each test
    sample is the label predicted most often across the ``k`` trees.

    Parameters
    ----------
    k : int
        number of folds the dataset is split into (one model per fold)
    filename : string
        name of the file (inside ``./data/``) to load the dataset from

    Returns
    -------
    float
        accuracy of the majority-vote predictions on the test set

    Notes
    -----
    Prints the running list of per-model test accuracies and the winning
    labels. As a side effect, NumPy's global float print format is set to
    four decimals.
    """
    file_path = "./data/" + filename
    dataset = np.loadtxt(file_path, dtype=str, delimiter=',')
    np.random.shuffle(dataset)
    subsets = np.array_split(dataset, k)

    accuracies = []

    # The test set does not change between folds, so it is loaded once.
    test = Dataset.load_from_file("test.txt")
    n_test = test.labels.shape[0]
    # dtype=object: the np.object alias no longer exists in current NumPy.
    # Cells stay 0 until a model writes its predicted label into them.
    all_predictions = np.zeros((n_test, k), dtype=object)
    won_vote = np.zeros(n_test, dtype=object)

    for i in range(k):
        # Join every fold except fold i. Slicing the list keeps this working
        # when the folds have unequal lengths, which np.delete on the list
        # of arrays does not on current NumPy.
        train = np.concatenate(subsets[:i] + subsets[i + 1:])
        train_att = train[:, :-1].astype(int)
        train_labels = train[:, -1]

        tree = DecisionTreeClassifier()
        # train() is used for its return value, as in the original code.
        tree = tree.train(train_att, train_labels)
        prediction = tree.predict(test.attributes)

        # Column i collects model i's prediction for every test sample
        for index, label in enumerate(prediction):
            all_predictions[index][i] = label

        # Accuracy of this single model on the test set
        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(prediction, test.labels)
        accuracies.append(evaluator.accuracy(confusion))
        print(accuracies)

    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})

    # Majority vote per test sample across the k models
    for index, votes in enumerate(all_predictions):

        # Drop cells that still hold the initial 0 (no label written)
        votes = np.delete(votes, np.argwhere(votes == 0))

        # Most frequent label; argmax returns the first maximum, so ties go
        # to the label that comes first in np.unique's sorted output
        unique, position = np.unique(votes, return_inverse=True)
        count = np.bincount(position)
        won_vote[index] = unique[count.argmax()]

    # Accuracy of the combined (voted) predictions
    print(f'WINNERS: {won_vote}')

    evaluator_w = Evaluator()
    confusion_w = evaluator_w.confusion_matrix(won_vote, test.labels)
    a_w = evaluator_w.accuracy(confusion_w)

    return a_w
Esempio n. 18
0
import reader
import numpy as np
import metrics
from classification import DecisionTreeClassifier

# Script: cross-validate a single tree on the full training set, then report
# its metrics on the test set.
x_train, y_train = reader.read_from_csv("data/train_full.txt")
x_test, y_test = reader.read_from_csv("data/test.txt")

# Trees initialisation
tree_full = DecisionTreeClassifier()

# 10-fold cross validation with a fixed seed; returns the mean and the
# standard deviation of the accuracy.
mean_acc, std_dev_acc = \
    metrics.k_cross_val(tree_full, x_train, y_train, k=10, seed=42)

# Q3.2
print('Q3.2')
print(mean_acc, std_dev_acc)

# NOTE(review): tree_full is only ever passed to k_cross_val; this predict
# call assumes k_cross_val leaves it trained -- confirm which training data
# the tree ends up fitted on.
y_pred = tree_full.predict(x_test)

# Every metric helper takes (predictions, true labels) in that order.
test_accuracy = metrics.accuracy(y_pred, y_test)
test_precision = metrics.precision(y_pred, y_test)
test_recall = metrics.recall(y_pred, y_test)
test_f_score = metrics.f_score(y_pred, y_test)
labels, confusion_matrix = metrics.conf_matrix(y_pred, y_test)

# Q3.3
# Only the labels, the confusion matrix and the accuracy are printed;
# precision, recall and F-score are computed above but not shown here.
print('Q3.3')
print(labels)
print(confusion_matrix)
print('Test acc:', test_accuracy)
Esempio n. 19
0
    pruner.prune_tree()

    preds = pruner.tree.predict(test.features)
    confusion = eval.confusion_matrix(preds, test.labels)
    new_accuracy = eval.accuracy(confusion)

    new_depth = pruner.max_depth()

    return old_accuracy, new_accuracy, old_depth, new_depth


if __name__ == "__main__":
    # Attribute names handed to each classifier's constructor.
    headers = [
        "x-box", "y-box", "width", "high", "onpix", "x-bar", "y-bar", "x2bar",
        "y2bar", "xybar", "x2ybr", "xy2br", "x-ege", "xegvy", "y-ege", "yegvx"
    ]

    valid = Dataset("data/validation.txt")
    test = Dataset("data/test.txt")
    # NOTE(review): this name shadows the builtin eval(). The function above
    # calls eval.confusion_matrix / eval.accuracy and may rely on this
    # module-level name, so do not rename it without checking that function.
    eval = Evaluator()

    # Both models are restored from previously saved files rather than
    # trained here. The .pickle extension suggests pickle-based loading --
    # only load model files from a trusted source.
    full_model = DecisionTreeClassifier(headers)
    full_model.deserialise_model("data/model_full.pickle")

    noisy_model = DecisionTreeClassifier(headers)
    noisy_model.deserialise_model("data/model_noisy.pickle")

    # Print what setup() returns for each model, separated by a blank line.
    print(setup(full_model, valid, test))
    print()
    print(setup(noisy_model, valid, test))
Esempio n. 20
0
    print()

    print()
    print("Macro-averaged Precision: {:.2f}".format(macro_p))
    print("Macro-averaged Recall: {:.2f}".format(macro_r))
    print("Macro-averaged F1: {:.2f}".format(macro_f))


if __name__ == "__main__":
    # Each parsed file is indexed below as [0] = attributes, [1] = labels.
    print("Loading the datasets...")
    trainingData = dataReader.parseFile("data/train_full.txt")
    validationData = dataReader.parseFile("data/validation.txt")
    testData = dataReader.parseFile("data/test.txt")

    # train() supplies the classifier object used from here on.
    print("Training the decision tree...")
    classifier = DecisionTreeClassifier().train(trainingData[0],
                                                trainingData[1])

    # Test-set predictions of the unpruned tree.
    predictions = classifier.predict(testData[0])
    print("Pre prunning predictions: {}".format(predictions))

    print("Evaluating test predictions...")
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predictions, testData[1])
    printMetric(confusion)

    # Prune with the whole validation pair, then predict the test set again.
    print("Pruning the decision tree...")
    classifier.prune(validationData)

    predictions = classifier.predict(testData[0])
    print("Post prunning predictions: {}".format(predictions))
Esempio n. 21
0
from classification import DecisionTreeClassifier
from dataset import ClassifierDataset
from prune import Prune
import matplotlib.pyplot as plt

# Paths of the datasets available to this script. Only pathToFull and
# pathToValid are used below; the rest are kept so the training path can be
# switched easily.
pathToSimple1 = './data/simple1.txt'
pathToSimple2 = './data/simple2.txt'
pathToTest = './data/test.txt'
pathToToy = './data/toy.txt'
pathToToy2 = './data/toy2.txt'
pathToFull = './data/train_full.txt'
pathToNoisy = './data/train_noisy.txt'
pathToSub = './data/train_sub.txt'
pathToValid = './data/validation.txt'
pathToExample = './data/example.txt'

# Training data
dataset = ClassifierDataset()
dataset.initFromFile(pathToFull)  # CHANGE PATH HERE

dtc = DecisionTreeClassifier()
dtc.train(dataset.attrib, dataset.labels)

# Validation data; it is loaded unconditionally but only consumed by the
# optional Prune call below.
validationDataset = ClassifierDataset()
validationDataset.initFromFile(pathToValid)

# Uncomment the below line to Prune
# Prune(dtc, validationDataset.attrib, validationDataset.labels, True)

# first arg: Decision Tree Classifier object; second arg: max tree depth, third arg: compact mode
# fourth arg: filename, fifth arg: format
# NOTE(review): TreeVisualiser is not imported in the visible part of this
# snippet -- confirm it is imported elsewhere.
tv = TreeVisualiser(dtc, None, True, 'full_3_prune', 'pdf')
    print(precision)
    print(f"Average Precision: {avg_prec}")
    print("Recall:")
    print(recall)
    print(f"Average Recall: {avg_rec}")
    print("F1_score:")
    print(f1_score)
    print(f"Average F1 Score: {avg_f1}")


# Script: fit one tree on each of three training sets (full, sub, noisy) and
# evaluate all of them on the same test set.
x_train, y_train = reader.read_from_csv("data/train_full.txt")
x_sub, y_sub = reader.read_from_csv("data/train_sub.txt")
x_noisy, y_noisy = reader.read_from_csv("data/train_noisy.txt")
x_test, y_test = reader.read_from_csv("data/test.txt")

# Trees initialisation (one independent classifier per training set)
tree_full = DecisionTreeClassifier()
tree_sub = DecisionTreeClassifier()
tree_noisy = DecisionTreeClassifier()

# Fitting trees
tree_full.fit(x_train, y_train)
tree_sub.fit(x_sub, y_sub)
tree_noisy.fit(x_noisy, y_noisy)

# Evaluation: every tree is scored against the same test split, so the
# results are directly comparable across the three training sets
eval_classifier(tree_full, x_test, y_test)
eval_classifier(tree_sub, x_test, y_test)
eval_classifier(tree_noisy, x_test, y_test)

Esempio n. 23
0
            confusion = evaluator.confusion_matrix(predictions_after, annotation)
            accuracy_after = evaluator.accuracy(confusion)

            # Restore node if accuracy dropped
            if (accuracy_after < accuracy_before):
                node.left.label = temp_label_left
                node.right.label = temp_label_right
                node.label = None
                node.rule = temp_rule

# Example usage
if __name__ == "__main__":

    # Create and train a tree
    training_dataset = Dataset.load_from_file("train_full.txt")
    tree = DecisionTreeClassifier()
    tree = tree.train(training_dataset.attributes, training_dataset.labels)
    
    # Print tree before pruning
    tree.print()

    # Evaluate predictions before pruning
    evaluator = Evaluator()
    validation_dataset = Dataset.load_from_file("validation.txt")
    predictions_before = tree.predict(validation_dataset.attributes)
    confusion = evaluator.confusion_matrix(
        predictions_before, validation_dataset.labels)
    accuracy_before = evaluator.accuracy(confusion)
    print(f'Accuracy before: {accuracy_before}')

    # Perform pruning