import numpy as np

# Module aliases assumed from the rest of the repo: `cls` is the decision-tree
# classifier module and `ev` is the evaluation module.


def k_fold_cross_validation(data_set, k, pruning=False):
    accuracy = np.zeros(k)
    best_tree = cls.DecisionTreeClassifier()
    max_accuracy = 0
    trees = []
    prePruneConfMatrix = []
    postPruneConfMatrix = []
    evaluator = ev.Evaluator()

    for i in range(1, k + 1):
        # Split the data into testing and training folds (plus a validation
        # fold when pruning is enabled)
        split = split_set(data_set, k, i, pruning)
        testing = split[0]
        training = split[1]
        training_x = training[:, :-1]
        training_y = [chr(c) for c in training.T[-1]]
        testing_y = [chr(c) for c in testing.T[-1]]

        # Train a fresh tree on this fold's training data
        trees.append(cls.DecisionTreeClassifier())
        trees[i - 1].train(training_x, training_y)
        tree = trees[i - 1]

        if pruning:
            # Record the confusion matrix before pruning, then prune on the
            # held-out validation fold
            predictions = tree.predict(testing)
            prePruneConfMatrix.append(
                evaluator.confusion_matrix(predictions, testing_y))
            validation = split[2]
            tree.prune((validation[:, :-1],
                        [chr(c) for c in validation[:, -1]]))

        # Evaluate on the testing fold
        predictions = tree.predict(testing)
        confusion = evaluator.confusion_matrix(predictions, testing_y)
        postPruneConfMatrix.append(confusion)
        accuracy[i - 1] = evaluator.accuracy(confusion)

        # Keep the tree with the best fold accuracy
        if accuracy[i - 1] > max_accuracy:
            best_tree = trees[i - 1]
            max_accuracy = accuracy[i - 1]

    if pruning:
        print("Pre-pruning metrics")
        analyseListOfConfMatrix(prePruneConfMatrix)
        print("Post-pruning metrics")
        analyseListOfConfMatrix(postPruneConfMatrix)

    return accuracy, best_tree, trees
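# The fold-splitting helper `split_set` used above is defined elsewhere in the
# repo. Below is a minimal sketch of the behaviour the loop relies on, assuming
# `data_set` is a 2-D numpy array with labels in the last column, fold `i` is
# 1-indexed, and (when pruning) one extra fold is held out for validation --
# all assumptions, and the name `split_set_sketch` is hypothetical.
def split_set_sketch(data_set, k, i, pruning=False):
    folds = np.array_split(data_set, k)           # k roughly equal folds
    testing = folds[i - 1]                        # fold i is held out for testing
    rest = [f for j, f in enumerate(folds) if j != i - 1]
    if pruning:
        # Hold out one additional fold as a validation set for pruning
        # (which fold is chosen is an assumption in this sketch)
        validation = rest.pop(0)
        return testing, np.vstack(rest), validation
    return testing, np.vstack(rest)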
def k_fold_cv(self, x, y, k, scoring):
    # Cap k at the number of samples; k-fold needs at least 2 splits
    k = len(y) if k > len(y) else k
    if k < 2:
        print("k must be at least 2 as the minimum number of splits is 2")
        exit(0)

    splits = self.k_split(x, y, k)
    scores = np.array([])
    top_eval = 0
    top_model = None

    for idx, fold in enumerate(splits):
        # Fold `idx` is the test set; stack every other fold into training
        x_test = fold.T[:-1]
        y_test = fold.T[-1]
        train_set = np.array([])
        for i in range(len(splits)):
            if i != idx:
                if len(train_set) > 0:
                    train_set = np.vstack((train_set, splits[i]))
                else:
                    train_set = splits[i]
        x_train = train_set.T[:-1]
        y_train = train_set.T[-1]

        # Train and score a fresh tree on this split
        model = cl.DecisionTreeClassifier()
        model.train(x_train.T, y_train)
        conf = self.confusion_matrix(model.predict(x_test.T), y_test)

        if scoring == "accuracy":
            avg = self.accuracy(conf)
        elif scoring == "precision":
            _, avg = self.precision(conf)
        elif scoring == "recall":
            _, avg = self.recall(conf)
        elif scoring == "f1":
            _, avg = self.f1_score(conf)
        else:
            print("Invalid scoring metric. "
                  "Please enter accuracy, precision, recall or f1")
            exit(0)

        # Track the best-scoring model across folds
        if top_eval < avg:
            top_eval = avg
            top_model = model  # fixed: was misspelled `top_nodel`

        scores = np.append(scores, avg)

    return np.mean(scores), np.std(scores), top_model
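# `self.k_split` is defined elsewhere in the repo. A minimal standalone sketch
# of what the loop above requires of it -- a list of k 2-D folds with the label
# as the last column. The shuffle step and the name `k_split_sketch` are
# assumptions, not confirmed by this file.
def k_split_sketch(x, y, k, seed=0):
    data = np.column_stack((x, y))                # attach labels as last column
    rng = np.random.default_rng(seed)
    rng.shuffle(data)                             # shuffle rows before folding
    return np.array_split(data, k)                # k roughly equal folds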
def print_eval(train_data, test_data):
    # Train a tree on the training file
    data = dr.parseFile(train_data)
    x = data[0]
    y = data[1]
    tree = cp.DecisionTreeClassifier()
    tree.train(x, y)

    # Predict on the test file
    test = dr.parseFile(test_data)
    xtruth = test[0]
    ytruth = test[1]
    test = dr.mergeAttributesAndCharacteristics(xtruth, ytruth)
    predictions = tree.predict(test)

    # Report all evaluation metrics; argument order fixed to match the other
    # call sites in this repo (predictions first, ground truth second)
    e = Evaluator()
    a = e.confusion_matrix(predictions, ytruth)
    print("Confusion\n" + str(a))
    print("Accuracy: " + str(e.accuracy(a)))
    print("Recall: " + str(e.recall(a)))
    print("Precision: " + str(e.precision(a)))
    print("F1 score: " + str(e.f1_score(a)))
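# Example invocation of print_eval, using the dataset paths that appear later
# in this section (a hypothetical call, not part of the original file):
# print_eval('./data/train_full.txt', './data/test.txt')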
import cProfile

repeats = 3
pathToSimple1 = './data/simple1.txt'
pathToSimple2 = './data/simple2.txt'
pathToTest = './data/test.txt'
pathToToy = './data/toy.txt'
pathToToy2 = './data/toy2.txt'
pathToFull = './data/train_full.txt'
pathToNoisy = './data/train_noisy.txt'
pathToSub = './data/train_sub.txt'
pathToValid = './data/validation.txt'

dataset = ds.ClassifierDataset()
dataset.initFromFile(pathToFull)
dtc = cs.DecisionTreeClassifier()

# Profile training time on each training set, `repeats` times each
print("FULL")
for i in range(repeats):
    cProfile.run('dtc.train(dataset.attrib, dataset.labels)', None, 'time')

dataset.initFromFile(pathToSub)
print("\n\n\n=====\n\n\n\n")
print("SUB")
for i in range(repeats):
    cProfile.run('dtc.train(dataset.attrib, dataset.labels)', None, 'time')

dataset.initFromFile(pathToNoisy)
print("\n\n\n=====\n\n\n\n")
print("NOISY")
for i in range(repeats):
    cProfile.run('dtc.train(dataset.attrib, dataset.labels)', None, 'time')
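# cProfile.run prints a full report per repeat. An alternative sketch that
# profiles one training call and sorts the stats programmatically with the
# standard-library pstats module; the helper name `profile_train` is
# hypothetical, the cProfile/pstats API usage is stdlib.
import io
import pstats

def profile_train(dtc, dataset, label):
    pr = cProfile.Profile()
    pr.enable()
    dtc.train(dataset.attrib, dataset.labels)    # the call being profiled
    pr.disable()
    buf = io.StringIO()
    # Sort by internal time and show only the 10 most expensive functions
    pstats.Stats(pr, stream=buf).sort_stats('time').print_stats(10)
    print(label)
    print(buf.getvalue())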
if __name__ == "__main__":
    # ds = cl.Dataset()
    # ds.read("data/toy.txt")
    #
    # eval = Evaluator()
    # print(eval.k_fold_cv(ds.features, ds.labels, 4, "recall"))

    evaluator = Evaluator()
    pickles = [
        "data/model_full.pickle",
        "data/model_noisy.pickle",
        "data/model_sub.pickle"
    ]
    models = [
        cl.DecisionTreeClassifier(),
        cl.DecisionTreeClassifier(),
        cl.DecisionTreeClassifier()
    ]
    test = cl.Dataset("data/test.txt")

    # Load the saved sub-dataset model and evaluate it on the test set
    model, pickle_path = models[2], pickles[2]
    model.deserialise_model(pickle_path)
    preds = model.predict(test.features)
    confusion = evaluator.confusion_matrix(preds, test.labels)
    print(confusion)
    print()
    print(evaluator.accuracy(confusion))
    print()
    print(evaluator.precision(confusion))
    print()
    print(evaluator.recall(confusion))
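# `deserialise_model` is a method of this repo's DecisionTreeClassifier and its
# implementation is not shown here. A minimal sketch of the save/load pair,
# assuming plain pickle round-tripping of the trained tree (an assumption about
# the repo's approach; the `_sketch` names are hypothetical).
import pickle as pk

def serialise_model_sketch(model, path):
    with open(path, 'wb') as f:
        pk.dump(model, f)                        # write the trained model to disk

def deserialise_model_sketch(path):
    with open(path, 'rb') as f:
        return pk.load(f)                        # restore the trained model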
# Question 3.3
k = 10
accuracy, best_tree, k_trees = k_fold_cross_validation(full_data, k)

# Print accuracy and standard deviation for Question 3.3
print("Accuracy: " + str(round(accuracy.mean(), 4)))
print("Standard Deviation: " + str(round(accuracy.std(), 4)))

# Question 3.4
x = full_data[:, :-1]
y = [chr(c) for c in full_data.T[-1]]
testing_y = [chr(c) for c in test_data.T[-1]]

# Train a tree on the whole of train_full.txt
full_trained = cls.DecisionTreeClassifier()
full_trained.train(x, y)

# Generate predictions from the fully trained tree and the best
# cross-validation tree
full_predict = full_trained.predict(test_data)
cross_predict = best_tree.predict(test_data)

# Print results
print_results(full_predict, testing_y, "Fully Trained")
print_results(cross_predict, testing_y, "K-Fold Trained")

# Question 3.5
# Combine the predictions of the k trees trained in 3.3 (see the
# majority-vote sketch below)
k_predict = k_decision_trees(test_data, k, k_trees)
print_results(k_predict, testing_y, "K-Fold Mode Predict")
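# `k_decision_trees` (Question 3.5) is defined elsewhere in the repo. A minimal
# sketch of the majority-vote idea its name and the "Mode Predict" label
# suggest: each of the k trees predicts, and the per-sample mode is returned.
# The voting scheme and the `_sketch` name are assumptions.
from collections import Counter

def k_decision_trees_sketch(test_data, k, k_trees):
    all_preds = [tree.predict(test_data) for tree in k_trees]  # k prediction lists
    # For each test sample, take the most common label across the k trees
    return [Counter(sample_preds).most_common(1)[0][0]
            for sample_preds in zip(*all_preds)]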