def sim3():
    """Run simulation 3: fit a tree (max_depth=5, min_size=1) on dataset3,
    print its structure, and overlay the splits on a 2-D scatter plot."""
    data = dataset_to_float(load_csv('../dataset/dataset3.csv'))
    plot_2D_dataset(data, "Simulation n.3")
    # Fixed hyper-parameters for this demo: max_depth=5, min_size=1.
    model = build_tree(data, 5, 1)
    print("-" * 10 + " Sim.3 TREE " + "-" * 10)
    print_tree(model.root, ['x', 'y'])
    print_tree_separating_2D(model.root)
    show_plot()
def sim4(max_tree_depth=20):
    """Run simulation 4: sweep the maximum tree depth from 1 to
    ``max_tree_depth`` and plot training accuracy against tree
    complexity (node count) on dataset3.

    Args:
        max_tree_depth: Largest maximum-depth value to evaluate
            (default 20, matching the original hard-coded sweep).
    """
    filename = '../dataset/dataset3.csv'
    dataset = dataset_to_float(load_csv(filename))
    train_accuracy = []
    nodes_numbers = []
    # Single shared range: used for training, tick marks, and both curves.
    depths = range(1, max_tree_depth + 1)
    for depth in depths:
        # Train and evaluate on the SAME data — this intentionally shows
        # overfitting as depth grows (min_size fixed at 1).
        tree_size, predicted, expected = decision_tree_prediction_and_size(
            dataset, dataset, depth, 1)
        train_accuracy.append(accuracy_metric(expected, predicted))
        nodes_numbers.append(tree_size)
    plt.figure("Simulation n.4")
    line1, = plt.plot(depths, train_accuracy, 'r', label='Train accuracy')
    line2, = plt.plot(depths, nodes_numbers, 'g',
                      label='Tree complexity(# nodes)')
    plt.legend(handles=[line1, line2], loc=4)  # loc=4: lower right
    plt.xticks(depths)
    plt.xlabel('Maximum Tree Depth')
    plt.draw()
def parkinson_main(n_folds_outer_cross_val, n_folds_inner_cross_val,
                   max_depth, min_size):
    """Nested cross-validation on the Parkinson recording dataset.

    The outer loop estimates generalization accuracy; for each outer
    fold an inner grid search picks the best (max_depth, min_size)
    pair, which is then evaluated on the held-out outer fold.

    Returns the list of outer-fold accuracies.

    Raises:
        ValueError: if either fold count is below 2.
    """
    if n_folds_outer_cross_val < 2 or n_folds_inner_cross_val < 2:
        raise ValueError("Illegal value parameter")

    data = dataset_to_float(load_csv('../dataset/parkinson_recording_data.csv'))
    folds = cross_validation_split(data, n_folds_outer_cross_val)

    scores = []
    # Outer k-fold cross validation
    for fold_idx, held_out in enumerate(folds, start=1):
        # Flatten every fold except the held-out one into the train set.
        training_rows = [row
                         for other in folds if other is not held_out
                         for row in other]
        # Copies of the held-out rows with the label blanked out.
        blind_rows = [list(row[:-1]) + [None] for row in held_out]

        # Inner k-fold cross validation ( grid search )
        best_couple, inner_accuracy = two_grid_search_with_accuracy_return(
            decision_tree, n_folds_inner_cross_val, training_rows,
            max_depth, min_size)

        # Evaluate the selected hyper-parameters on the outer test fold.
        predictions = decision_tree(training_rows, blind_rows,
                                    best_couple[0], best_couple[1])
        truth = [row[-1] for row in held_out]
        outer_accuracy = accuracy_metric(truth, predictions)

        print("-" * 10 + " Outer Fold n. " + str(fold_idx) + " " + "-" * 10)
        print("Best params selected by inner cross validation (max_depth,min_size): "
              + str(best_couple[0]) + " " + str(best_couple[1]))
        print("Best params mean accuracy in the inner cross validation: "
              + str(inner_accuracy))
        print("Best params accuracy in the outer cross validation: "
              + str(outer_accuracy))
        scores.append(outer_accuracy)

    print("-" * 10 + " Final Results " + " " + "-" * 10)
    print("Total Accuracy mean: " + str(mean(scores)))
    print("Total Accuracy std dev: " + str(stdev(scores)))
    return scores
def banknote_main(tr_percentages, number_repetitions, n_folds_2grid_search,
                  max_depth, min_size):
    """Evaluate the decision tree on the banknote dataset for several
    training-split percentages.

    For each percentage the random split + grid-search + test cycle is
    repeated ``number_repetitions`` times; the per-percentage mean and
    standard deviation of the accuracies are printed and plotted.

    Raises:
        ValueError: if ``number_repetitions`` or ``n_folds_2grid_search``
            is below 2.
    """
    if number_repetitions < 2 or n_folds_2grid_search < 2:
        raise ValueError("Illegal value parameter")

    data = dataset_to_float(load_csv('../dataset/data_banknote_authentication.csv'))

    mean_accuracies = []
    std_devs = []
    # For each percentage of training split
    for percentage in tr_percentages:
        accuracies = []
        # Repeat the random split + test validation number_repetitions times.
        for _ in range(number_repetitions):
            train, test = random_training_test_split(data, percentage)
            best = two_grid_search(decision_tree, n_folds_2grid_search,
                                   train, max_depth, min_size)
            predictions = decision_tree(train, test, best[0], best[1])
            truth = [row[-1] for row in test]
            accuracies.append(accuracy_metric(truth, predictions))

        print("-" * 10 + " training split %" + str(percentage) + " " + "-" * 10)
        print("Accuracies of training split %" + str(percentage) + " : "
              + str(accuracies))
        mean_acc = mean(accuracies)
        std_dev = stdev(accuracies)
        print("Accuracy mean: " + str(mean_acc))
        print("Accuracy std dev: " + str(std_dev))
        mean_accuracies.append(mean_acc)
        std_devs.append(std_dev)

    plt.figure("BankNote dataset")
    plot_results(tr_percentages, mean_accuracies, std_devs)
    plt.show()