def calculate_best_pruned_tree(self, original_tree, trees, x_val, y_val):
    """
    Select one tree from a sequence of pruned candidates using validation data.

    For every candidate trees[j] with j >= 1 an alpha value is computed as

        alpha = (original_error - error_j) / (original_leaves - leaves_j)

    where the errors come from self.get_apperent_error_rate and the leaf
    counts from self.count_leaves. The last candidate whose alpha is strictly
    positive is selected.

    Args:
        original_tree - unpruned tree used as the baseline
        trees (sequence) - candidate trees; trees[0] is never evaluated and is
                           returned when no candidate qualifies
        x_val - validation attributes handed to the classifier's predict
        y_val - correct labels for x_val

    Output:
        (tree, alpha) - the selected tree and the alpha computed for it
                        (alpha is 0 when trees[0] is returned)
    """
    classifier = DecisionTreeClassifier()
    # The classifier is never trained here; it is only used to run predict()
    # against explicitly supplied trees.
    classifier.is_trained = True

    original_predictions = classifier.predict(x_val, original_tree)
    original_error = self.get_apperent_error_rate(original_predictions, y_val)
    # Loop invariant: the baseline leaf count does not depend on the candidate.
    original_number_leaves = self.count_leaves(original_tree)

    stored_j = 0
    stored_alpha = 0
    for j in range(1, len(trees)):
        removed_leaves = original_number_leaves - self.count_leaves(trees[j])
        if removed_leaves == 0:
            # Same number of leaves as the baseline: alpha is undefined, so
            # skip the candidate instead of dividing by zero.
            continue

        predictions = classifier.predict(x_val, trees[j])
        error = self.get_apperent_error_rate(predictions, y_val)
        alpha = (original_error - error) / removed_leaves

        # NOTE(review): the previous code compared (0 - alpha) against a
        # threshold that was never updated from 0, which is exactly
        # "alpha > 0". If the intent was to keep the candidate with the best
        # alpha, the threshold needs to be tracked - confirm.
        if alpha > 0:
            stored_j = j
            stored_alpha = alpha

    return trees[stored_j], stored_alpha
def __crossValidation(self, subsets):
    """
    Train one classifier per fold and return the best one.

    Each element of subsets is a (features, labels) pair. For every fold the
    classifier is trained on the concatenation of all other folds and scored
    on the held-out fold; the candidates are then handed to
    self.getBestModel.

    Args:
        subsets (list) - list of (features, labels) folds

    Output:
        whatever self.getBestModel returns for the list of
        (classifier, [accuracy, macroP, macroR, macroF]) candidates
    """
    models = []
    for i, validationSet in enumerate(subsets):
        # Use the remaining folds for training
        features, labels = zip(*(subsets[:i] + subsets[(i + 1):]))
        # A single concatenate avoids re-copying the accumulated array once
        # per fold, which reduce + np.append did.
        features = np.concatenate(features, axis=0)
        labels = np.concatenate(labels, axis=0)

        classifier = DecisionTreeClassifier()
        classifier.train(features, labels)

        # Evaluate the classifier on the held-out fold
        validationFeatures = validationSet[0]
        validationLabels = validationSet[1]
        predictions = classifier.predict(validationFeatures)

        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(predictions, validationLabels)
        accuracy = evaluator.accuracy(confusion)
        # precision/recall/f1_score return a pair; index 1 is kept as the
        # macro-averaged value.
        macroP = evaluator.precision(confusion)[1]
        macroR = evaluator.recall(confusion)[1]
        macroF = evaluator.f1_score(confusion)[1]

        # Add the model to the candidate models
        models.append((classifier, [accuracy, macroP, macroR, macroF]))

    # Return the best model
    return self.getBestModel(models)
def cross_validation(k, filename):
    """
    Performs k-fold cross validation on a dataset

    The file is read as comma-separated strings, shuffled in place and split
    into k folds. Every column except the last is converted to int and used
    as attributes; the last column is used as the label.

    Parameters
    ----------
    k : int
        number of folds the dataset is split into (at least 2)
    filename : string
        name of the file, relative to ./data/, to load the dataset from

    Returns
    -------
    list
        accuracy obtained on each held-out fold
    mean of those accuracies (kept under the name "global error estimate")

    Raises
    ------
    ValueError
        if k is smaller than 2, since no training data would remain
    """
    if k < 2:
        raise ValueError("cross validation needs at least 2 folds")

    file_path = "./data/" + filename
    dataset = np.loadtxt(file_path, dtype=str, delimiter=',')
    np.random.shuffle(dataset)
    subsets = np.array_split(dataset, k)

    accuracies = []
    for i in range(k):
        # array_split may return folds of different sizes, so the training
        # folds are joined as a list; np.delete on the list would first try
        # to build a ragged array.
        train = np.concatenate(subsets[:i] + subsets[i + 1:])
        train_att = train[:, :-1].astype(int)
        train_labels = train[:, -1]

        test = subsets[i]
        test_att = test[:, :-1].astype(int)
        test_labels = test[:, -1]

        tree = DecisionTreeClassifier()
        tree = tree.train(train_att, train_labels)
        prediction = tree.predict(test_att)

        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(prediction, test_labels)
        accuracies.append(evaluator.accuracy(confusion))

    global_error_estimate = np.mean(accuracies)
    # Global NumPy print setting, left in place for callers that print results.
    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})
    return accuracies, global_error_estimate
def cross_validation(x, y, k):
    """
    Run k-fold cross validation and keep every trained classifier.

    The data is partitioned with data_split; for each fold a new
    DecisionTreeClassifier is trained on the other k - 1 folds and scored on
    the held-out one.

    Returns:
        (accuracy, classifiers) - array of k accuracies and an object array
        holding the k trained classifiers, in fold order
    """
    xpart, ypart = data_split(x, y, k)

    classifiers = np.empty(k, dtype=object)
    accuracy = np.zeros(k)

    for fold in range(k):
        # held-out slice for this round
        xval = xpart[fold]
        yval = ypart[fold]

        # remaining slices flattened into one training set; the reshape
        # relies on every fold having as many rows as the held-out one
        n_train_rows = (k - 1) * xval.shape[0]
        xtrain = np.delete(xpart, fold, 0).reshape((n_train_rows, xval.shape[1]))
        ytrain = np.delete(ypart, fold, 0).reshape((n_train_rows, 1))

        # train() hands back the classifier that is stored for this fold
        classifiers[fold] = DecisionTreeClassifier()
        classifiers[fold] = classifiers[fold].train(xtrain, ytrain)

        # score the held-out slice
        fold_predictions = classifiers[fold].predict(xval)
        evaluator = Evaluator()
        fold_confusion = evaluator.confusion_matrix(fold_predictions, yval)
        accuracy[fold] = evaluator.accuracy(fold_confusion)

    return accuracy, classifiers
def main():
    """Train a decision tree on a small toy dataset and print its test metrics."""
    print("Loading the training dataset...")
    train_x = np.array([
        [5, 7, 1],
        [4, 6, 2],
        [4, 6, 3],
        [1, 3, 1],
        [2, 1, 2],
        [5, 2, 6],
    ])
    train_y = np.array(["A", "A", "A", "C", "C", "C"])

    print("Training the decision tree...")
    model = DecisionTreeClassifier()
    model = model.train(train_x, train_y)

    print("Loading the test set...")
    x_test = np.array([
        [1, 6, 3],
        [0, 5, 5],
        [1, 5, 0],
        [2, 4, 2],
    ])
    y_test = np.array(["A", "A", "C", "C"])

    predicted = model.predict(x_test)
    print("Predictions: {}".format(predicted))

    # Row labels used when printing the per-class table below.
    classes = ["A", "C"]

    print("Evaluating test predictions...")
    scorer = Evaluator()
    matrix = scorer.confusion_matrix(predicted, y_test)
    print("Confusion matrix:")
    print(matrix)

    acc = scorer.accuracy(matrix)
    print()
    print("Accuracy: {}".format(acc))

    per_class_p, macro_p = scorer.precision(matrix)
    per_class_r, macro_r = scorer.recall(matrix)
    per_class_f, macro_f = scorer.f1_score(matrix)

    print()
    print("Class: Precision, Recall, F1")
    for idx, scores in enumerate(zip(per_class_p, per_class_r, per_class_f)):
        print("{}: {:.2f}, {:.2f}, {:.2f}".format(classes[idx], *scores))

    print()
    print("Macro-averaged Precision: {:.2f}".format(macro_p))
    print("Macro-averaged Recall: {:.2f}".format(macro_r))
    print("Macro-averaged F1: {:.2f}".format(macro_f))
def decision_tree_classification(self):
    """
    Preprocess the data held by this object, run the decision tree
    classification on it and report the results.

    The accuracy and the classification report are shown in message boxes;
    accuracy, report and confusion matrix are also printed.
    """
    # Wrap the raw variables, then run the preprocessing steps specific to
    # decision tree classification through the task-level preprocessor.
    preprocessor = DataPreProcessor(self.df_variable, self.pd_variable)
    task_preprocessor = DataPreProcessorForTask(preprocessor)
    task_preprocessor.preprocess_for_decision_tree_classification()

    # Read the variables back from the preprocessor after preprocessing.
    classification_pd_variable = preprocessor.pd_variable
    classification_df_variable = preprocessor.df_variable

    # Classify on the preprocessed DF and collect the results.
    model = DecisionTreeClassifier(classification_df_variable)
    model.classify()
    accuracy = model.accuracy
    classification_report = model.classification_report
    confusion_matrix = model.confusion_matrix

    # Message boxes for accuracy and report (the confusion matrix is only printed).
    for title, content in (("accuracy", accuracy),
                           ("classification_report", classification_report)):
        messagebox.showinfo(title, content)

    for result in (accuracy, classification_report, confusion_matrix):
        print(result)
def prune_tree_reduced_error(self, tree, x_val, y_val):
    """
    Function to accept prunes which do not lower the tree's accuracy on the
    validation set, otherwise ignore them

    Args:
        tree (dict) - tree to be pruned
        x_val (2D array) - 2D array of attributes of validation set where
                           each row is a different sample and each column
                           is a different attribute
        y_val (1D array) - 1D array of correct labels for x_val validation
                           data

    Output:
        tree (dict or str) - tree pruned such that any additional pruning
                             would lower predictive accuracy on validation
                             set.
    """
    classifier = DecisionTreeClassifier()
    # The classifier is never trained here; it only runs predict() against
    # explicitly supplied trees.
    classifier.is_trained = True
    evaluator = Evaluator()

    # Baseline accuracy must be measured on the tree being pruned; the fresh
    # classifier has no tree of its own, so the tree is passed explicitly as
    # it is for the pruned copies below.
    predictions = classifier.predict(x_val, tree)
    confusion = evaluator.confusion_matrix(predictions, y_val)
    root_accuracy = evaluator.accuracy(confusion)

    print("Results on Validation set")
    print("Original Accuracy: ", root_accuracy)

    is_pruned = True
    while is_pruned and isinstance(tree, dict):
        # make copy of tree then attempt to prune copy
        tree_copy = copy.deepcopy(tree)
        (is_pruned, tree_copy, tree) = self.prune(tree_copy, tree)
        if is_pruned:
            # compare accuracy of pruned tree to original
            new_predictions = classifier.predict(x_val, tree_copy)
            new_confusion = evaluator.confusion_matrix(new_predictions, y_val)
            new_accuracy = evaluator.accuracy(new_confusion)
            if new_accuracy >= root_accuracy:
                # if greater or equal accuracy make tree = copy
                root_accuracy = new_accuracy
                tree = copy.deepcopy(tree_copy)

    print("New Accuracy: ", root_accuracy)
    return tree
def test_DecisionTreeClassifier(dataset_filename: str = "toy.txt", should_load_file=False):
    """
    Train a tree on data/<dataset_filename>, save and visualise it, then
    compare the project's Evaluator metrics against the reference metric
    functions on data/test.txt.

    Args:
        dataset_filename - file name inside data/ used for training
        should_load_file - when True, the classifier is constructed with the
                           path of the previously saved tree file
    """
    import os  # local import so the block does not rely on a file-level one

    # train
    # splitext copes with any extension length, not only three-letter ones.
    extless_filename = os.path.splitext(dataset_filename)[0]
    tree_filename = "tree_" + extless_filename + ".obj"

    start = time.time()
    saved_tree_file = tree_filename if should_load_file else None
    cl = DecisionTreeClassifier(saved_tree_file=saved_tree_file)
    dataset = data_read("data/" + dataset_filename)
    unique_lbls = np.unique([e.label for e in dataset.entries])
    x, y = dataset.shim_to_arrays()
    cl.train(x, y)
    cl.tree.save_tree(tree_filename)
    visualize_tree(cl.tree,
                   save_filename=f"visualize_tree_{extless_filename}.txt",
                   max_depth=8)
    duration = time.time() - start
    print("duration: ", duration)

    # predict
    test_dataset = data_read("data/test.txt")
    x_test, y_test = test_dataset.shim_to_arrays()
    preds = cl.predict(x_test)

    # evaluate: reference value first, then the project's own computation
    ev = Evaluator()
    matrix = ev.confusion_matrix(preds, y_test, unique_lbls)
    print("real accuracy: ", accuracy_score(y_test, preds))
    print("\nour calc accuracy: ", str.format('{0:.15f}', ev.accuracy(matrix)))
    print("\n precision:", precision_score(y_test, preds, average="macro"))
    print("\n our precision: ", ev.precision(matrix))
    print("\nreal recall: ", recall_score(y_test, preds, average="macro"))
    print("\n our recall: ", ev.recall(matrix))
    print("\n f1_score", f1_score(y_test, preds, average="macro"))
    print("\n f1_score: ", ev.f1_score(matrix))
    print(matrix)
import numpy as np
from classification import DecisionTreeClassifier
from data.Dataset import Dataset
from eval import Evaluator

# Script entry point: train on the full training set, then print the tree
# both as text and graphically.
if __name__ == "__main__":
    print("Loading the training dataset...")
    dataset = Dataset()
    dataset.readData("data/train_full.txt")
    x, y = dataset.features, dataset.labels

    print("Training the decision tree...")
    classifier = DecisionTreeClassifier()
    classifier = classifier.train(x, y)

    classifier.print()

    # Blank-line padding around the graphical rendering.
    print("\n")
    print("Tree visualisation graphically")
    for _ in range(3):
        print("\n")

    classifier.printImageTree()

    for _ in range(4):
        print("\n")
##############################################################################
import numpy as np
from classification import DecisionTreeClassifier
from eval import Evaluator

# Script entry point: toy example that trains on six samples and predicts
# four test samples.
if __name__ == "__main__":
    print("Loading the training dataset...")
    x = np.array([
        [5, 7, 1],
        [4, 6, 2],
        [4, 6, 3],
        [1, 3, 1],
        [2, 1, 2],
        [5, 2, 6],
    ])
    y = np.array(["A", "A", "A", "C", "C", "C"])

    print("Training the decision tree...")
    classifier = DecisionTreeClassifier()
    classifier = classifier.train(x, y)

    print("Loading the test set...")
    x_test = np.array([
        [1, 6, 3],
        [0, 5, 5],
        [1, 5, 0],
        [2, 4, 2],
    ])
    y_test = np.array(["A", "A", "C", "C"])

    predictions = classifier.predict(x_test)
    print("Predictions: {}".format(predictions))

    # Class names in the order used for reporting.
    classes = ["A", "C"]

    print("Evaluating test predictions...")
    evaluator = Evaluator()
def calc_stats(self, test_path, path_to_data, plt_title, prune, pruneAggressively):
    """
    Train a decision tree, optionally prune it, and report test statistics.

    The tree is trained on the dataset at path_to_data and evaluated on the
    dataset at test_path. The confusion matrix and a matrix of
    precision/recall/F1 values (per class plus macro average) are printed
    and plotted under plt_title.

    Args:
        test_path - path of the test dataset file
        path_to_data - path of the training dataset file
        plt_title - title handed to both plotting helpers
        prune - when truthy, Prune is run on the trained tree first
        pruneAggressively - forwarded to Prune as its last argument
    """
    # load datasets, attributes, labels
    d_subset = ClassifierDataset()
    d_subset.initFromFile(path_to_data)
    attribs = d_subset.attrib
    labels = d_subset.labels

    ds_test = ClassifierDataset()
    ds_test.initFromFile(test_path)
    test_attribs = ds_test.attrib
    test_labels = ds_test.labels

    # train and predict
    print("TRAINING")
    tree = DecisionTreeClassifier()
    tree.train(attribs, labels)
    print("FINISHED TRAINING")

    if prune:
        print("PRUNING")
        validationDataset = ClassifierDataset()
        # NOTE(review): val_path is neither a parameter nor a local of this
        # method; it must exist at module level or this raises NameError -
        # confirm.
        validationDataset.initFromFile(val_path)
        Prune(tree, validationDataset.attrib, validationDataset.labels,
              pruneAggressively)
        print("FINISHED PRUNING")

    predictions = tree.predict(test_attribs)

    evaluator = Evaluator()
    c_matrix = evaluator.confusion_matrix(predictions, test_labels)
    print(c_matrix)

    # Class names used as axis labels of the confusion-matrix plot.
    class_names = ["A", "C", "E", "G", "O", "Q"]
    plot_confusion_matrix(c_matrix, class_names, plt_title)

    print(" ")
    print("Accuracy: " + str(evaluator.accuracy(c_matrix)))
    print(" ")

    precision, macro_p = evaluator.precision(c_matrix)
    recall, macro_r = evaluator.recall(c_matrix)
    f1, macro_f1 = evaluator.f1_score(c_matrix)

    # One row per metric; the macro average is appended as the last column.
    p = np.append(precision, macro_p)
    r = np.append(recall, macro_r)
    f1 = np.append(f1, macro_f1)
    performance_matrix = np.vstack((p, r, f1))
    print(performance_matrix)
    plot_other_stats(performance_matrix, plt_title)

    print(" ")
    print("Macro avg recall:" + str(macro_r))
    print("Macro avg precision:" + str(macro_p))
    print("Macro avg f1:" + str(macro_f1))
    print(" ")
# Script entry point: fit the classifier on a toy dataset and predict a small
# test set.
if __name__ == "__main__":
    print("Loading the training dataset...")
    x = np.array([[5, 7, 1],
                  [4, 6, 2],
                  [4, 6, 3],
                  [1, 3, 1],
                  [2, 1, 2],
                  [5, 2, 6]])
    y = np.array(["A", "A", "A", "C", "C", "C"])

    print("Training the decision tree...")
    classifier = DecisionTreeClassifier()
    classifier.fit(x, y)

    print("Loading the test set...")
    x_test = np.array([[1, 6, 3],
                       [0, 5, 5],
                       [1, 5, 0],
                       [2, 4, 2]])
    y_test = np.array(["A", "A", "C", "C"])

    print("Making predictions on the test set...")
    predictions = classifier.predict(x_test)
np.set_printoptions(formatter={'float': '{: 0.4f}'.format}) macro_f = round(macro_f, 4) return (f, macro_f) # Example usage if __name__ == "__main__": # Create an evaluator and load the datasets evaluator = Evaluator() training_dataset = Dataset.load_from_file("train_full.txt") test_dataset = Dataset.load_from_file("test.txt") # Create a tree, train it and test it on the datasets tree = DecisionTreeClassifier() trained_tree = tree.train(training_dataset.attributes, training_dataset.labels) prediction = tree.predict(test_dataset.attributes) # Compute the confusion matrix confusion = evaluator.confusion_matrix(prediction, test_dataset.labels) print(f'Confusion matrix: {confusion}') # Compute the accuracy a = evaluator.accuracy(confusion) print(f'Accuracy: {a}') # Compute the precision p, macro_p = evaluator.precision(confusion) print(f'Precision: {p}')
print("accuracy", accuracy) print("precision", precision) print("recall", recall) print("f1", f1) return if __name__ == "__main__": #QUESTION 1 print("Question 1") print("Loading the data") filename = "data/train_full.txt" classifier = DecisionTreeClassifier() x, y = classifier.load_data(filename) #QUESTION 2 print("Question 2") print("Training the tree with two different methods") print("Training the decision tree...") classifier = classifier.train(x, y) print("Loading the test set...") filename = "data/test.txt" x_test, y_test = classifier.load_data(filename) print("\nPredicting on test.txt data with 4 different trees")
from classification import DecisionTreeClassifier

# Datasets used by every fold
x_train, y_train = reader.read_from_csv("data/train_full.txt")
x_test, y_test = reader.read_from_csv("data/test.txt")

# Fold configuration
k = 10
seed = 42
folds = metrics.split_k_fold(x_train, k, seed)

# Holds all k trees trained using train sets selected by folds
trees = []

# Train one tree per fold
for fold in range(k):
    # Create a new tree, train it and store it in trees collection
    new_tree = DecisionTreeClassifier()
    new_tree_accuracy = metrics.train_and_eval_kth(
        new_tree, folds, x_train, y_train, fold)
    trees.append(new_tree)

    # Print out the result for this fold
    print("tree accuracy (validation)", fold, " = ", new_tree_accuracy)
    print("tree accuracy (test)", fold, " = ",
          metrics.accuracy(new_tree.predict(x_test), y_test))

# Collect the test-set predictions of every fold's tree
predictions = []
for tree in trees:
    prediction = tree.predict(x_test)
    predictions.append(prediction)
import numpy as np
import reader
import metrics
from classification import DecisionTreeClassifier

# Load training, validation and test data
x, y = reader.read_from_csv("data/train_full.txt")
x_val, y_val = reader.read_from_csv("data/validation.txt")
x_test, y_test = reader.read_from_csv("data/test.txt")

# 1. Grid search over the pre-pruning options, then score on the test set
tree_preprune = DecisionTreeClassifier()
options = {
    "max_tree_depth": [13, 15, 17],
    "min_sample_size": [2, 3, 4],
}
print(metrics.grid_search(tree_preprune, x, y, x_val, y_val, options))
acc1 = metrics.accuracy(tree_preprune.predict(x_test), y_test)

# 2. Post-prune the classifier from step 1 and score it again
tree_preprune.prune(x_val, y_val)
acc2 = metrics.accuracy(tree_preprune.predict(x_test), y_test)

# 3. Fit a fresh classifier, post-prune it and score it
tree_postprune = DecisionTreeClassifier()
tree_postprune.fit(x, y)
tree_postprune.prune(x_val, y_val)
acc3 = metrics.accuracy(tree_postprune.predict(x_test), y_test)

print("")
def combine_cross_validation(k, filename):
    """
    Trains k cross-validation models and combines them by majority vote

    The file is read as comma-separated strings, shuffled in place and split
    into k folds. One tree is trained per fold on the other k - 1 folds.
    Every tree predicts the dataset loaded from "test.txt"; the per-model
    accuracies on that dataset are printed, and the most frequent label per
    test sample is taken as the combined prediction.

    Parameters
    ----------
    k : int
        number of folds the dataset is split into (at least 2)
    filename : string
        name of the file, relative to ./data/, to load the dataset from

    Returns
    -------
    accuracy of the majority-vote predictions on the "test.txt" dataset

    Raises
    ------
    ValueError
        if k is smaller than 2, since no training data would remain
    """
    if k < 2:
        raise ValueError("cross validation needs at least 2 folds")

    file_path = "./data/" + filename
    dataset = np.loadtxt(file_path, dtype=str, delimiter=',')
    np.random.shuffle(dataset)
    subsets = np.array_split(dataset, k)

    # Loaded once: every model is scored on the same test dataset.
    test = Dataset.load_from_file("test.txt")
    n_test = test.labels.shape[0]
    # Plain `object` dtype: the np.object alias no longer exists in NumPy.
    all_predictions = np.zeros((n_test, k), dtype=object)
    won_vote = np.zeros(n_test, dtype=object)

    evaluator = Evaluator()
    accuracies = []
    for i in range(k):
        # array_split may return folds of different sizes, so the training
        # folds are joined as a list rather than through np.delete.
        train = np.concatenate(subsets[:i] + subsets[i + 1:])
        train_att = train[:, :-1].astype(int)
        train_labels = train[:, -1]

        tree = DecisionTreeClassifier()
        tree = tree.train(train_att, train_labels)
        prediction = tree.predict(test.attributes)

        # Column i holds model i's vote for every test sample
        for index, label in enumerate(prediction):
            all_predictions[index][i] = label

        # Accuracy of this single model on the test dataset
        confusion = evaluator.confusion_matrix(prediction, test.labels)
        accuracies.append(evaluator.accuracy(confusion))

    print(accuracies)
    # Global NumPy print setting, left in place for the prints below.
    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})

    # Create predictions with most frequent label from all k models
    for index, votes in enumerate(all_predictions):
        # Drop cells still holding the initial 0 placeholder
        votes = np.delete(votes, np.argwhere(votes == 0))
        # np.unique sorts the labels, so ties go to the smallest label
        unique, position = np.unique(votes, return_inverse=True)
        won_vote[index] = unique[np.bincount(position).argmax()]

    # Calculate the accuracy of the combined model
    print(f'WINNERS: {won_vote}')
    evaluator_w = Evaluator()
    confusion_w = evaluator_w.confusion_matrix(won_vote, test.labels)
    a_w = evaluator_w.accuracy(confusion_w)
    return a_w
import reader
import numpy as np
import metrics
from classification import DecisionTreeClassifier

# Training and test data
x_train, y_train = reader.read_from_csv("data/train_full.txt")
x_test, y_test = reader.read_from_csv("data/test.txt")

# Trees initialisation
tree_full = DecisionTreeClassifier()

# 10-fold cross validation on the training data
mean_acc, std_dev_acc = metrics.k_cross_val(
    tree_full, x_train, y_train, k=10, seed=42)

# Q3.2
print('Q3.2')
print(mean_acc, std_dev_acc)

# Metrics of the same tree on the held-out test set
y_pred = tree_full.predict(x_test)
test_accuracy = metrics.accuracy(y_pred, y_test)
test_precision = metrics.precision(y_pred, y_test)
test_recall = metrics.recall(y_pred, y_test)
test_f_score = metrics.f_score(y_pred, y_test)
labels, confusion_matrix = metrics.conf_matrix(y_pred, y_test)

# Q3.3
print('Q3.3')
print(labels)
print(confusion_matrix)
print('Test acc:', test_accuracy)
pruner.prune_tree() preds = pruner.tree.predict(test.features) confusion = eval.confusion_matrix(preds, test.labels) new_accuracy = eval.accuracy(confusion) new_depth = pruner.max_depth() return old_accuracy, new_accuracy, old_depth, new_depth if __name__ == "__main__": headers = [ "x-box", "y-box", "width", "high", "onpix", "x-bar", "y-bar", "x2bar", "y2bar", "xybar", "x2ybr", "xy2br", "x-ege", "xegvy", "y-ege", "yegvx" ] valid = Dataset("data/validation.txt") test = Dataset("data/test.txt") eval = Evaluator() full_model = DecisionTreeClassifier(headers) full_model.deserialise_model("data/model_full.pickle") noisy_model = DecisionTreeClassifier(headers) noisy_model.deserialise_model("data/model_noisy.pickle") print(setup(full_model, valid, test)) print() print(setup(noisy_model, valid, test))
print() print() print("Macro-averaged Precision: {:.2f}".format(macro_p)) print("Macro-averaged Recall: {:.2f}".format(macro_r)) print("Macro-averaged F1: {:.2f}".format(macro_f)) if __name__ == "__main__": print("Loading the datasets...") trainingData = dataReader.parseFile("data/train_full.txt") validationData = dataReader.parseFile("data/validation.txt") testData = dataReader.parseFile("data/test.txt") print("Training the decision tree...") classifier = DecisionTreeClassifier() classifier = classifier.train(trainingData[0], trainingData[1]) predictions = classifier.predict(testData[0]) print("Pre prunning predictions: {}".format(predictions)) print("Evaluating test predictions...") evaluator = Evaluator() confusion = evaluator.confusion_matrix(predictions, testData[1]) printMetric(confusion) print("Pruning the decision tree...") classifier.prune(validationData) predictions = classifier.predict(testData[0]) print("Post prunning predictions: {}".format(predictions))
from classification import DecisionTreeClassifier
from dataset import ClassifierDataset
from prune import Prune
import matplotlib.pyplot as plt

# Available dataset files
pathToSimple1 = './data/simple1.txt'
pathToSimple2 = './data/simple2.txt'
pathToTest = './data/test.txt'
pathToToy = './data/toy.txt'
pathToToy2 = './data/toy2.txt'
pathToFull = './data/train_full.txt'
pathToNoisy = './data/train_noisy.txt'
pathToSub = './data/train_sub.txt'
pathToValid = './data/validation.txt'
pathToExample = './data/example.txt'

# Train on the selected dataset
dataset = ClassifierDataset()
dataset.initFromFile(pathToFull)  # CHANGE PATH HERE

dtc = DecisionTreeClassifier()
dtc.train(dataset.attrib, dataset.labels)

# Validation data, only needed when pruning is enabled below
validationDataset = ClassifierDataset()
validationDataset.initFromFile(pathToValid)

# Uncomment the below line to Prune
# Prune(dtc, validationDataset.attrib, validationDataset.labels, True)

# Arguments, in order: Decision Tree Classifier object, max tree depth,
# compact mode, filename, format.
# NOTE(review): TreeVisualiser is not imported in the visible part of this
# file - confirm where it comes from.
tv = TreeVisualiser(dtc, None, True, 'full_3_prune', 'pdf')
print(precision) print(f"Average Precision: {avg_prec}") print("Recall:") print(recall) print(f"Average Recall: {avg_rec}") print("F1_score:") print(f1_score) print(f"Average F1 Score: {avg_f1}") x_train, y_train = reader.read_from_csv("data/train_full.txt") x_sub, y_sub = reader.read_from_csv("data/train_sub.txt") x_noisy, y_noisy = reader.read_from_csv("data/train_noisy.txt") x_test, y_test = reader.read_from_csv("data/test.txt") # Trees initialisation tree_full = DecisionTreeClassifier() tree_sub = DecisionTreeClassifier() tree_noisy = DecisionTreeClassifier() # Fitting trees tree_full.fit(x_train, y_train) tree_sub.fit(x_sub, y_sub) tree_noisy.fit(x_noisy, y_noisy) # Evaluation eval_classifier(tree_full, x_test, y_test) eval_classifier(tree_sub, x_test, y_test) eval_classifier(tree_noisy, x_test, y_test)
confusion = evaluator.confusion_matrix(predictions_after, annotation) accuracy_after = evaluator.accuracy(confusion) # Restore node if accuracy dropped if (accuracy_after < accuracy_before): node.left.label = temp_label_left node.right.label = temp_label_right node.label = None node.rule = temp_rule # Example usage if __name__ == "__main__": # Create and train a tree training_dataset = Dataset.load_from_file("train_full.txt") tree = DecisionTreeClassifier() tree = tree.train(training_dataset.attributes, training_dataset.labels) # Print tree before pruning tree.print() # Evaluate predictions before pruning evaluator = Evaluator() validation_dataset = Dataset.load_from_file("validation.txt") predictions_before = tree.predict(validation_dataset.attributes) confusion = evaluator.confusion_matrix( predictions_before, validation_dataset.labels) accuracy_before = evaluator.accuracy(confusion) print(f'Accuracy before: {accuracy_before}') # Perform pruning