def prune(node, dataset):
    """
    Recursively prunes a decision tree classifier (bottom-up).

    Parameters
    ----------
    node : Node
        the node being considered for pruning
    dataset : Dataset
        the dataset being used for pruning; its ``labels`` and
        ``attributes`` are used to score the tree

    Notes
    -----
    Accuracy is measured by calling ``tree.predict``; ``tree`` is not a
    parameter of this function and is resolved from the enclosing scope.
    A prune is kept unless it lowers the accuracy.
    """
    # Leaves cannot be pruned any further.
    if node.is_leaf():
        return

    def children_are_leaves():
        return node.right.is_leaf() and node.left.is_leaf()

    if not children_are_leaves():
        # Prune the subtrees first; that may turn both children into leaves.
        if node.left.is_node():
            prune(node.left, dataset)
        if node.right.is_node():
            prune(node.right, dataset)
        if not children_are_leaves():
            return

    gold_labels = dataset.labels
    samples = dataset.attributes
    scorer = Evaluator()

    def tree_accuracy():
        matrix = scorer.confusion_matrix(tree.predict(samples), gold_labels)
        return scorer.accuracy(matrix)

    accuracy_before = tree_accuracy()

    # Remember what is about to be overwritten so the prune can be undone.
    saved_left_label = node.left.label
    saved_right_label = node.right.label
    saved_rule = node.rule

    # Collapse this node into a leaf carrying its majority label.
    node.label = node.majority_label
    node.left.label = None
    node.right.label = None
    node.rule = None

    accuracy_after = tree_accuracy()

    # Undo the prune only when it made the tree strictly worse.
    if accuracy_after < accuracy_before:
        node.left.label = saved_left_label
        node.right.label = saved_right_label
        node.label = None
        node.rule = saved_rule
def cross_validation(x, y, k):
    """
    Train and score one decision tree per cross-validation fold.

    The data is partitioned with ``data_split``; for every fold a fresh
    ``DecisionTreeClassifier`` is trained on the remaining folds and scored
    on the held-out fold.  The reshape of the training data assumes every
    fold has the same number of rows as the held-out fold.

    Returns
    -------
    accuracy : numpy array of length k with the accuracy of each fold
    classifiers : numpy object array of length k with the trained trees
    """
    x_folds, y_folds = data_split(x, y, k)
    accuracy = np.zeros(k)
    classifiers = np.empty(k, dtype=object)

    for fold in range(k):
        # Held-out fold used for validation
        x_held_out = x_folds[fold]
        y_held_out = y_folds[fold]

        # Every other fold, flattened into one training set
        train_rows = (k - 1) * x_held_out.shape[0]
        x_train = np.delete(x_folds, fold, 0).reshape(
            (train_rows, x_held_out.shape[1]))
        y_train = np.delete(y_folds, fold, 0).reshape((train_rows, 1))

        # Train on the training slice
        model = DecisionTreeClassifier()
        model = model.train(x_train, y_train)
        classifiers[fold] = model

        # Score the predictions for the held-out fold
        fold_predictions = model.predict(x_held_out)
        scorer = Evaluator()
        matrix = scorer.confusion_matrix(fold_predictions, y_held_out)
        accuracy[fold] = scorer.accuracy(matrix)

    return accuracy, classifiers
def getAccuracy(self):
    """Return the accuracy of ``self.decisionTreeClassifier`` on the
    validation data held in ``self.validationAttrib`` /
    ``self.validationLabel``."""
    scorer = Evaluator()
    matrix = scorer.confusion_matrix(
        self.decisionTreeClassifier.predict(self.validationAttrib),
        self.validationLabel,
    )
    return scorer.accuracy(matrix)
def q4confmat(full_dat, noisy_dat):
    """
    Compare the labels of a noisy dataset against the full dataset.

    Every sample of ``noisy_dat`` is looked up in the dictionary returned by
    ``full_dat.getDictionary()`` (key: the sample's attributes joined by
    commas) to obtain its reference label.  The resulting confusion matrix
    and the per-class / macro precision, recall and F1 are printed and
    plotted.

    Samples that are missing from the reference dictionary are reported and
    left out of BOTH label sequences, so the remaining noisy labels stay
    paired with their reference labels.
    """
    ref_dict = full_dat.getDictionary()

    annotations = []  # ground truth labels
    kept_indices = []  # positions of the noisy samples that were matched
    skipped_any = False
    for index, attrib in enumerate(noisy_dat.attrib):
        attrib_key = ','.join(str(v) for v in attrib)
        if attrib_key not in ref_dict:
            print("ERROR: attribString not present!")
            skipped_any = True
            continue
        kept_indices.append(index)
        annotations.append(ref_dict[attrib_key])

    if skipped_any:
        # Drop the labels of the skipped samples to keep both sides aligned.
        noisy_labels = [noisy_dat.labels[index] for index in kept_indices]
    else:
        # Nothing was skipped: pass the labels through untouched.
        noisy_labels = noisy_dat.labels

    evaluator = Evaluator()
    c_matrix = evaluator.confusion_matrix(noisy_labels, annotations)
    print(c_matrix)

    target_names = ["A", "C", "E", "G", "O", "Q"]
    plot_confusion_matrix(c_matrix, target_names, "Noisy vs Full")

    precision, macro_p = evaluator.precision(c_matrix)
    recall, macro_r = evaluator.recall(c_matrix)
    f1, macro_f1 = evaluator.f1_score(c_matrix)

    # One row per metric; the macro average is appended as the last column.
    p = np.append(precision, macro_p)
    r = np.append(recall, macro_r)
    f1 = np.append(f1, macro_f1)
    performance_matrix = np.vstack((p, np.vstack((r, f1))))
    print(performance_matrix)
    plot_other_stats(performance_matrix, "Train_noisy")
def old_test():
    """Smoke-test ``Evaluator`` on a hand-written two-sample example and
    print accuracy, precision, recall and F1."""
    predicted = ["A", "B"]
    actual = ["A", "A"]
    label_order = ["B", "A"]

    scorer = Evaluator()
    matrix = scorer.confusion_matrix(predicted, actual, label_order)

    print('{0:.15f}'.format(scorer.accuracy(matrix)))
    print(scorer.precision(matrix))
    print(scorer.recall(matrix))
    print(scorer.f1_score(matrix))
def prune_tree_reduced_error(self, tree, x_val, y_val):
    """
    Repeatedly prune the tree, keeping every prune that does not lower the
    accuracy on the validation set.

    Args:
        tree (dict) - tree to be pruned
        x_val (2D array) - attributes of the validation set; each row is a
            different sample and each column is a different attribute
        y_val (1D array) - correct labels for the x_val validation data

    Output:
        tree (dict or str) - the tree after pruning stopped, i.e. once
            ``self.prune`` reported no prune or the tree was no longer
            a dict.
    """
    classifier = DecisionTreeClassifier()
    classifier.is_trained = True

    # Baseline accuracy (predict is called without an explicit tree here).
    baseline_predictions = classifier.predict(x_val)
    scorer = Evaluator()
    best_accuracy = scorer.accuracy(
        scorer.confusion_matrix(baseline_predictions, y_val))

    print("Results on Validation set")
    print("Original Accuracy: ", best_accuracy)

    pruned_something = True
    while pruned_something and isinstance(tree, dict):
        # Attempt the prune on a copy so the current tree stays available.
        candidate = copy.deepcopy(tree)
        pruned_something, candidate, tree = self.prune(candidate, tree)
        if not pruned_something:
            continue

        # Score the pruned candidate against the best accuracy so far.
        candidate_predictions = classifier.predict(x_val, candidate)
        candidate_accuracy = scorer.accuracy(
            scorer.confusion_matrix(candidate_predictions, y_val))

        if not (candidate_accuracy >= best_accuracy):
            continue

        # Equal or better accuracy: adopt the pruned candidate.
        best_accuracy = candidate_accuracy
        tree = copy.deepcopy(candidate)

    print("New Accuracy: ", best_accuracy)
    return tree
def cross_validation(k, filename):
    """
    Performs cross validation on a dataset

    The file is loaded from ``./data/``, shuffled in place and split into
    ``k`` folds with ``np.array_split`` (folds may differ in size by one
    row).  For each fold a tree is trained on the other folds and scored on
    the held-out fold.

    Parameters
    ----------
    k : int
        number of folds the dataset is split into (at least 2)
    filename : string
        name of the file to load the dataset

    Returns
    -------
    list
        accuracy of each fold, as returned by ``Evaluator.accuracy``
    float
        mean of the per-fold accuracies

    Raises
    ------
    ValueError
        if ``k`` is smaller than 2 (there would be no training data)
    """
    if k < 2:
        raise ValueError("cross validation needs at least 2 folds")

    file_path = "./data/" + filename
    dataset = np.loadtxt(file_path, dtype=str, delimiter=',')
    np.random.shuffle(dataset)
    subsets = np.array_split(dataset, k)

    accuracies = []
    for i, test in enumerate(subsets):
        # Concatenate the remaining folds directly; this also works when the
        # folds have different lengths (np.delete on the list would require
        # a rectangular array).
        train = np.concatenate(
            [fold for j, fold in enumerate(subsets) if j != i])

        # Last column holds the label, the others the integer attributes.
        train_att = train[:, :-1].astype(int)
        train_labels = train[:, -1]
        test_att = test[:, :-1].astype(int)
        test_labels = test[:, -1]

        tree = DecisionTreeClassifier()
        tree = tree.train(train_att, train_labels)
        prediction = tree.predict(test_att)

        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(prediction, test_labels)
        accuracies.append(evaluator.accuracy(confusion))

    global_error_estimate = np.mean(accuracies)
    # NOTE: changes NumPy's global float print format for the whole process.
    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})
    return accuracies, global_error_estimate
def main():
    """Train a tree on a tiny hard-coded dataset, predict a hard-coded test
    set and print the confusion matrix and the derived metrics."""
    print("Loading the training dataset...")
    x = np.array([
        [5, 7, 1],
        [4, 6, 2],
        [4, 6, 3],
        [1, 3, 1],
        [2, 1, 2],
        [5, 2, 6],
    ])
    y = np.array(["A", "A", "A", "C", "C", "C"])

    print("Training the decision tree...")
    classifier = DecisionTreeClassifier()
    classifier = classifier.train(x, y)

    print("Loading the test set...")
    x_test = np.array([
        [1, 6, 3],
        [0, 5, 5],
        [1, 5, 0],
        [2, 4, 2],
    ])
    y_test = np.array(["A", "A", "C", "C"])

    predictions = classifier.predict(x_test)
    print(f"Predictions: {predictions}")

    classes = ["A", "C"]

    print("Evaluating test predictions...")
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predictions, y_test)

    print("Confusion matrix:")
    print(confusion)

    accuracy = evaluator.accuracy(confusion)
    print()
    print(f"Accuracy: {accuracy}")

    # Each metric returns (per-class values, macro average).
    per_class_p, macro_p = evaluator.precision(confusion)
    per_class_r, macro_r = evaluator.recall(confusion)
    per_class_f, macro_f = evaluator.f1_score(confusion)

    print()
    print("Class: Precision, Recall, F1")
    rows = zip(per_class_p, per_class_r, per_class_f)
    for idx, (p1, r1, f1) in enumerate(rows):
        print(f"{classes[idx]}: {p1:.2f}, {r1:.2f}, {f1:.2f}")

    print()
    print(f"Macro-averaged Precision: {macro_p:.2f}")
    print(f"Macro-averaged Recall: {macro_r:.2f}")
    print(f"Macro-averaged F1: {macro_f:.2f}")
def print_stats(predictions, y_test):
    """Print the confusion matrix, accuracy, precision, recall and F1 of
    ``predictions`` against ``y_test``."""
    scorer = Evaluator()
    confusion = scorer.confusion_matrix(predictions, y_test)

    # Compute every metric before printing anything.
    report = [
        ("confusion", confusion),
        ("accuracy", scorer.accuracy(confusion)),
        ("precision", scorer.precision(confusion)),
        ("recall", scorer.recall(confusion)),
        ("f1", scorer.f1_score(confusion)),
    ]
    for title, value in report:
        print(title, value)
def test_DecisionTreeClassifier(dataset_filename: str = "toy.txt", should_load_file=False):
    """
    Train a tree on ``data/<dataset_filename>``, save and visualise it, then
    compare this project's metrics on ``data/test.txt`` with the reference
    metric functions (``accuracy_score`` etc.).

    When ``should_load_file`` is true the classifier is given the path of a
    previously saved tree (``tree_<name>.obj``).  The file name is expected
    to end in a four-character extension such as ``.txt``.
    """
    # --- train ---
    stem = dataset_filename[:-4]
    tree_path = "tree_" + stem + ".obj"
    started_at = time.time()

    saved_tree_file = tree_path if should_load_file else None
    cl = DecisionTreeClassifier(saved_tree_file=saved_tree_file)

    dataset = data_read("data/" + dataset_filename)
    unique_lbls = np.unique([entry.label for entry in dataset.entries])
    x, y = dataset.shim_to_arrays()
    cl.train(x, y)

    cl.tree.save_tree(tree_path)
    visualize_tree(cl.tree,
                   save_filename="visualize_tree_" + stem + ".txt",
                   max_depth=8)

    elapsed = time.time() - started_at
    print("duration: ", elapsed)

    # --- predict ---
    test_dataset = data_read("data/test.txt")
    x_test, y_test = test_dataset.shim_to_arrays()
    preds = cl.predict(x_test)

    # --- evaluate ---
    ev = Evaluator()
    matrix = ev.confusion_matrix(preds, y_test, unique_lbls)

    print("real accuracy: ", accuracy_score(y_test, preds))
    print("\nour calc accuracy: ", '{0:.15f}'.format(ev.accuracy(matrix)))

    print("\n precision:", precision_score(y_test, preds, average="macro"))
    print("\n our precision: ", ev.precision(matrix))

    print("\nreal recall: ", recall_score(y_test, preds, average="macro"))
    print("\n our recall: ", ev.recall(matrix))

    print("\n f1_score", f1_score(y_test, preds, average="macro"))
    print("\n f1_score: ", ev.f1_score(matrix))

    print(matrix)
if __name__ == "__main__":
    # Each parsed dataset is indexed as [0] = attributes, [1] = labels below.
    print("Loading the datasets...")
    trainingData = dataReader.parseFile("data/train_full.txt")
    validationData = dataReader.parseFile("data/validation.txt")
    testData = dataReader.parseFile("data/test.txt")

    print("Training the decision tree...")
    classifier = DecisionTreeClassifier()
    classifier = classifier.train(trainingData[0], trainingData[1])

    # Score the unpruned tree on the test set.
    predictions = classifier.predict(testData[0])
    print(f"Pre prunning predictions: {predictions}")

    print("Evaluating test predictions...")
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predictions, testData[1])
    printMetric(confusion)

    # Prune on the validation data, then score the test set again.
    print("Pruning the decision tree...")
    classifier.prune(validationData)

    predictions = classifier.predict(testData[0])
    print(f"Post prunning predictions: {predictions}")

    print("Evaluating test predictions...")
    evaluator = Evaluator()
    confusion = evaluator.confusion_matrix(predictions, testData[1])
    printMetric(confusion)

    classifier.plot_tree()
classifier = classifier.train(x, y) print("Loading the test set...") x_test = np.array([[1, 6, 3], [0, 5, 5], [1, 5, 0], [2, 4, 2]]) y_test = np.array(["A", "A", "C", "C"]) predictions = classifier.predict(x_test) print("Predictions: {}".format(predictions)) classes = ["A", "C"] print("Evaluating test predictions...") evaluator = Evaluator() confusion = evaluator.confusion_matrix(predictions, y_test) print("Confusion matrix:") print(confusion) accuracy = evaluator.accuracy(confusion) print() print("Accuracy: {}".format(accuracy)) (p, macro_p) = evaluator.precision(confusion) (r, macro_r) = evaluator.recall(confusion) (f, macro_f) = evaluator.f1_score(confusion) print() print("Class: Precision, Recall, F1") for (i, (p1, r1, f1)) in enumerate(zip(p, r, f)):
def combine_cross_validation(k, filename):
    """
    Trains one tree per cross-validation split and combines them by vote

    The file is loaded from ``./data/``, shuffled in place and split into
    ``k`` folds.  For each fold a tree is trained on the other folds; every
    tree then predicts the dataset loaded from ``test.txt``.  The per-tree
    accuracies are printed, and the label predicted most often for each test
    sample (ties resolved in favour of the label that sorts first) forms the
    combined prediction.

    Parameters
    ----------
    k : int
        number of folds the dataset is split into (at least 2)
    filename : string
        name of the file to load the training dataset

    Returns
    -------
    accuracy of the combined (majority-vote) predictions on ``test.txt``,
    as returned by ``Evaluator.accuracy``

    Raises
    ------
    ValueError
        if ``k`` is smaller than 2 (there would be no training data)
    """
    if k < 2:
        raise ValueError("cross validation needs at least 2 folds")

    file_path = "./data/" + filename
    dataset = np.loadtxt(file_path, dtype=str, delimiter=',')
    np.random.shuffle(dataset)
    subsets = np.array_split(dataset, k)

    # The test set is the same for every model, so it is loaded once.
    test = Dataset.load_from_file("test.txt")
    n_test = test.labels.shape[0]

    # ``object`` replaces the ``np.object`` alias, which NumPy has removed.
    all_predictions = np.zeros((n_test, k), dtype=object)
    won_vote = np.zeros(n_test, dtype=object)

    accuracies = []
    for i in range(k):
        # Concatenate the remaining folds directly; this also works when the
        # folds have different lengths.
        train = np.concatenate(
            [fold for j, fold in enumerate(subsets) if j != i])
        train_att = train[:, :-1].astype(int)
        train_labels = train[:, -1]

        tree = DecisionTreeClassifier()
        tree = tree.train(train_att, train_labels)
        prediction = tree.predict(test.attributes)

        # Column i collects model i's vote for every test sample.
        for index, label in enumerate(prediction):
            all_predictions[index, i] = label

        # Accuracy of this single model on the test set
        evaluator = Evaluator()
        confusion = evaluator.confusion_matrix(prediction, test.labels)
        accuracies.append(evaluator.accuracy(confusion))

    print(accuracies)
    # NOTE: changes NumPy's global float print format for the whole process.
    np.set_printoptions(formatter={'float': '{: 0.4f}'.format})

    # Majority vote across the k models for every test sample
    for index, votes in enumerate(all_predictions):
        # Drop slots still holding the initial 0 (i.e. never filled).
        votes = np.delete(votes, np.argwhere(votes == 0))
        unique, position = np.unique(votes, return_inverse=True)
        won_vote[index] = unique[np.bincount(position).argmax()]

    # Accuracy of the combined model
    print(f'WINNERS: {won_vote}')
    evaluator_w = Evaluator()
    confusion_w = evaluator_w.confusion_matrix(won_vote, test.labels)
    a_w = evaluator_w.accuracy(confusion_w)
    return a_w
def calc_stats(self, test_path, path_to_data, plt_title, prune, pruneAggressively):
    """
    Train a tree, optionally prune it, and report its test-set statistics.

    Args:
        test_path: file the test dataset is initialised from
        path_to_data: file the training dataset is initialised from
        plt_title: title passed to both plotting helpers
        prune: when truthy, the trained tree is pruned before predicting
        pruneAggressively: forwarded to ``Prune`` as its last argument

    Prints the confusion matrix, the accuracy, a matrix with one row each
    for precision / recall / F1 (macro average in the last column) and the
    three macro averages; the confusion matrix and the metric matrix are
    also plotted.  Returns nothing.
    """
    # Load training data
    d_subset = ClassifierDataset()
    d_subset.initFromFile(path_to_data)
    attribs = d_subset.attrib
    labels = d_subset.labels

    # Load test data
    ds_test = ClassifierDataset()
    ds_test.initFromFile(test_path)
    test_attribs = ds_test.attrib
    test_labels = ds_test.labels

    # Train and predict
    print("TRAINING")
    tree = DecisionTreeClassifier()
    tree.train(attribs, labels)
    print("FINISHED TRAINING")

    if prune:
        print("PRUNING")
        validationDataset = ClassifierDataset()
        # NOTE(review): val_path is not defined in this method; presumably a
        # module-level constant - confirm.
        validationDataset.initFromFile(val_path)
        Prune(tree, validationDataset.attrib, validationDataset.labels,
              pruneAggressively)
        print("FINISHED PRUNING")

    predictions = tree.predict(test_attribs)

    evaluator = Evaluator()
    c_matrix = evaluator.confusion_matrix(predictions, test_labels)
    print(c_matrix)

    class_names = ["A", "C", "E", "G", "O", "Q"]
    plot_confusion_matrix(c_matrix, class_names, plt_title)

    print(" ")
    print("Accuracy: " + str(evaluator.accuracy(c_matrix)))
    print(" ")

    precision, macro_p = evaluator.precision(c_matrix)
    recall, macro_r = evaluator.recall(c_matrix)
    f1, macro_f1 = evaluator.f1_score(c_matrix)

    # One row per metric; the macro average is appended as the last column.
    p = np.append(precision, macro_p)
    r = np.append(recall, macro_r)
    f1 = np.append(f1, macro_f1)
    performance_matrix = np.vstack((p, np.vstack((r, f1))))
    print(performance_matrix)
    plot_other_stats(performance_matrix, plt_title)

    print(" ")
    print("Macro avg recall:" + str(macro_r))
    print("Macro avg precision:" + str(macro_p))
    print("Macro avg f1:" + str(macro_f1))
    print(" ")
def prune(tree: BinTree):
    """Run ten non-aggressive pruning passes over ``tree`` from its root,
    using the validation set in ``data/validation.txt``."""
    vld_dataset = data_read("data/validation.txt")
    x_val, y_val = vld_dataset.shim_to_arrays()
    ev = Evaluator()

    for attempt in range(1, 11):
        print(f"----prune attempt {attempt}---")
        tree.prune(
            node=tree.root_node,
            og_vld_feats=x_val,
            og_vld_lbls=y_val,
            dataset=vld_dataset,
            ev=ev,
            is_aggressive=False,
        )


if __name__ == "__main__":
    # Build the tree from the noisy training set.
    train_file = "train_noisy"
    dataset = data_read("data/" + train_file + ".txt")
    tree = BinTree(dataset, "tree_" + train_file + ".obj")

    test_dataset = data_read("data/test.txt")
    ev = Evaluator()
    x_test, y_test = test_dataset.shim_to_arrays()

    # Test accuracy before pruning (samples are predicted one at a time).
    preds = list(map(tree.predict, x_test))
    matrix = ev.confusion_matrix(preds, y_test)
    print("test accuracy before pruning:", ev.accuracy(matrix))

    prune(tree)

    # Test accuracy after pruning.
    preds = list(map(tree.predict, x_test))
    matrix = ev.confusion_matrix(preds, y_test)
    print("test accuracy after pruning:", ev.accuracy(matrix))
class DecisionTreePruner(object):
    """
    Greedy pruner for a decision tree, driven by validation accuracy.

    In every round each node whose two children are both ``LeafNode`` is
    tentatively replaced by a single leaf; the replacement giving the highest
    validation accuracy is made permanent if it beats the current accuracy.
    Pruning stops when no replacement improves the accuracy.
    """

    def __init__(self, tree, valid):
        """Store the tree to prune and the validation set (an object with
        ``features`` and ``labels``) used to score it."""
        self.tree = tree
        self.valid = valid
        self.eval = Evaluator()

    def _prune_children(self, parent, node, direction):
        """Replace ``node`` (the 'L' or 'R' child of ``parent``) by a leaf.

        The leaf predicts whichever of ``node``'s two child predictions has
        the larger entry in ``parent.counts``; on a tie the right child's
        prediction is used.
        """
        left_pred, right_pred = node.left.prediction, node.right.prediction
        left_count = parent.counts[left_pred]
        right_count = parent.counts[right_pred]
        new_pred = left_pred if left_count > right_count else right_pred
        leaf = LeafNode(str(new_pred), prediction=new_pred)
        if direction == 'L':
            parent.left = leaf
        else:
            parent.right = leaf

    def _unprune_children(self, parent, node, direction):
        """Undo ``_prune_children`` by re-attaching ``node`` to ``parent``."""
        if direction == 'L':
            parent.left = node
        else:
            parent.right = node

    def _get_accuracy(self):
        """Return the accuracy of the current tree on the validation set."""
        # Use the validation set stored on the instance; the bare name
        # ``valid`` is not defined inside this method.
        preds = self.tree.predict(self.valid.features)
        confusion = self.eval.confusion_matrix(preds, self.valid.labels)
        return self.eval.accuracy(confusion)

    def _find_prunable_nodes(self):
        """Return ``(parent, node, direction, depth)`` for every non-root
        node whose two children are both leaves, in breadth-first order.

        ``depth`` is the depth of ``parent`` (the root has depth 1).
        """
        def prunable(node):
            # Check the node itself first so a leaf child is never asked
            # for ``left`` / ``right``.
            return (type(node) is not LeafNode
                    and type(node.left) is LeafNode
                    and type(node.right) is LeafNode)

        queue = deque([(self.tree.root, 1)])
        prunable_nodes = []
        while queue:
            node, depth = queue.popleft()
            if type(node) is LeafNode:
                continue
            for direction, child in (('L', node.left), ('R', node.right)):
                if prunable(child):
                    prunable_nodes.append((node, child, direction, depth))
                else:
                    queue.append((child, depth + 1))
        return prunable_nodes

    def max_depth(self):
        """Return the depth of the deepest node (the root has depth 0)."""
        queue = deque([(self.tree.root, 0)])
        max_depth = 0
        while queue:
            node, depth = queue.popleft()
            max_depth = max(max_depth, depth)
            if type(node) is not LeafNode:
                queue.append((node.left, depth + 1))
                queue.append((node.right, depth + 1))
        return max_depth

    def prune_tree(self):
        """Prune ``self.tree`` in place while validation accuracy improves.

        The starting accuracy and each accepted, improved accuracy are
        printed.
        """
        unpruned_accuracy = self._get_accuracy()
        print(unpruned_accuracy)

        improved = True
        while improved:
            max_pruned_accuracy = 0
            prunable_nodes = self._find_prunable_nodes()
            # Shallowest candidates first: on equal accuracy the first
            # candidate tried wins because the comparison below is strict.
            prunable_nodes.sort(key=lambda n: n[3])
            best = None

            for parent, node, direction, depth in prunable_nodes:
                # Try the prune, score it, then restore the original node.
                self._prune_children(parent, node, direction)
                pruned_accuracy = self._get_accuracy()
                if pruned_accuracy > max_pruned_accuracy:
                    max_pruned_accuracy = pruned_accuracy
                    best = (parent, node, direction)
                self._unprune_children(parent, node, direction)

            improved = max_pruned_accuracy > unpruned_accuracy
            if improved:
                # Make the best candidate of this round permanent.
                parent, best_node, direction = best
                print(max_pruned_accuracy)
                unpruned_accuracy = max_pruned_accuracy
                self._prune_children(parent, best_node, direction)
# Evaluate the unpruned and pruned "Tree 2" on the test set and print
# their leaf counts and accuracies.
print("Loading the test set...")
filename = "data/test.txt"
x_test, y_test = classifier.load_data(filename)
print("\nPredicting on test.txt data with 4 different trees")

# Create the evaluator and the pruning helper used below.
# NOTE: the name `eval` shadows Python's builtin eval() from here on.
eval = Evaluator()
prune = Pruning()

print("\nTree 2 unpruned")
tree_3 = np.load('simple_tree.npy', allow_pickle=True).item()
# NOTE(review): tree_3 is not passed to predict() here, whereas the pruned
# case below passes tree_4 explicitly - confirm that predict() without a
# tree argument really evaluates simple_tree.npy.
predictions = classifier.predict(x_test)
confusion = eval.confusion_matrix(predictions, y_test)
accuracy_3 = eval.accuracy(confusion)
print("number of leaves:", prune.count_leaves(tree_3))
# Accuracy is printed as a percentage rounded to two decimals.
print("Tree 2 unpruned Accuracy: " + str(np.round(accuracy_3 * 100, 2)))

print("\nTree 2 pruned")
tree_4 = np.load('simple_tree_pruned.npy', allow_pickle=True).item()
predictions = classifier.predict(x_test, tree_4)
confusion = eval.confusion_matrix(predictions, y_test)
accuracy_4 = eval.accuracy(confusion)
print("number of leaves:", prune.count_leaves(tree_4))
print("Tree 2 pruned Accuracy: " + str(np.round(accuracy_4 * 100, 2)))

print("Question 2.3")
print("Printing the tree")
classifier.print_tree(tree_3, "Method_2_UnPruned.pdf")
# Example usage: train a tree, prune it on the validation set and report
# the validation accuracy before and after pruning.
if __name__ == "__main__":
    # Build the tree from the full training set
    training_dataset = Dataset.load_from_file("train_full.txt")
    tree = DecisionTreeClassifier()
    tree = tree.train(training_dataset.attributes,
                      training_dataset.labels)

    # Show the unpruned tree
    tree.print()

    # Accuracy on the validation set before pruning
    evaluator = Evaluator()
    validation_dataset = Dataset.load_from_file("validation.txt")
    predictions_before = tree.predict(validation_dataset.attributes)
    confusion = evaluator.confusion_matrix(predictions_before,
                                           validation_dataset.labels)
    accuracy_before = evaluator.accuracy(confusion)
    print('Accuracy before: {}'.format(accuracy_before))

    # Prune using the validation set
    pruning(tree, validation_dataset)

    # Show the pruned tree
    tree.print()

    # Accuracy on the validation set after pruning
    predictions_after = tree.predict(validation_dataset.attributes)
    confusion = evaluator.confusion_matrix(predictions_after,
                                           validation_dataset.labels)
    accuracy_after = evaluator.accuracy(confusion)
    print('Accuracy after: {}'.format(accuracy_after))