Esempio n. 1
0
def sim3():
    """Run simulation n.3: fit a tree on dataset3 and visualise the result.

    Reads ``../dataset/dataset3.csv``, passes it through
    ``dataset_to_float``, plots the data, builds a tree with
    ``build_tree(rows, 5, 1)``, prints the tree, draws its 2D separations
    and finally shows the plot.
    """
    rows = dataset_to_float(load_csv('../dataset/dataset3.csv'))
    plot_2D_dataset(rows, "Simulation n.3")

    # NOTE(review): 5 and 1 are presumably max depth and min node size --
    # confirm against build_tree's signature.
    fitted = build_tree(rows, 5, 1)

    rule = "-" * 10
    print(rule + " Sim.3 TREE " + rule)
    # 'x' and 'y' are the feature labels handed to the tree printer.
    print_tree(fitted.root, ['x', 'y'])
    print_tree_separating_2D(fitted.root)
    show_plot()
Esempio n. 2
0
def sim4():
    """Run simulation n.4: plot train accuracy and tree size against depth.

    For each depth limit from 1 to 20 the tree is trained and evaluated on
    the very same dataset (``../dataset/dataset3.csv``), so the accuracy
    collected is the training accuracy. Both curves are drawn on the figure
    named "Simulation n.4"; the function only calls ``plt.draw()`` and does
    not call ``plt.show()`` itself.
    """
    samples = dataset_to_float(load_csv('../dataset/dataset3.csv'))

    depths = range(1, 21)
    accuracy_curve = []
    size_curve = []
    for depth in depths:
        # The same rows serve as train and test set on purpose.
        node_count, guessed, truth = decision_tree_prediction_and_size(
            samples, samples, depth, 1)
        accuracy_curve.append(accuracy_metric(truth, guessed))
        size_curve.append(node_count)

    plt.figure("Simulation n.4")
    (accuracy_line,) = plt.plot(depths, accuracy_curve, 'r',
                                label='Train accuracy')
    (size_line,) = plt.plot(depths, size_curve, 'g',
                            label='Tree complexity(# nodes)')
    plt.legend(handles=[accuracy_line, size_line], loc=4)
    plt.xticks(depths)
    plt.xlabel('Maximum Tree Depth')
    plt.draw()
def parkinson_main(n_folds_outer_cross_val, n_folds_inner_cross_val, max_depth, min_size):
    """Run a nested cross validation of the decision tree on the Parkinson data.

    The dataset ``../dataset/parkinson_recording_data.csv`` is split into
    ``n_folds_outer_cross_val`` folds. For each outer fold, an inner grid
    search selects a ``(max_depth, min_size)`` couple on the remaining folds;
    the tree is then trained on those folds with the selected couple and
    scored on the held-out fold. Per-fold and overall results are printed.

    Args:
        n_folds_outer_cross_val: Number of outer folds; must be at least 2.
        n_folds_inner_cross_val: Number of folds handed to the inner grid
            search; must be at least 2.
        max_depth: Forwarded unchanged to the grid search (presumably the
            candidate depths to explore -- confirm against
            ``two_grid_search_with_accuracy_return``).
        min_size: Forwarded unchanged to the grid search (presumably the
            candidate minimum node sizes -- confirm as above).

    Returns:
        The list of accuracies measured on each outer fold, in fold order.

    Raises:
        ValueError: If either fold count is smaller than 2.
    """
    if n_folds_outer_cross_val < 2 or n_folds_inner_cross_val < 2:
        raise ValueError("Illegal value parameter")

    filename = '../dataset/parkinson_recording_data.csv'
    dataset = load_csv(filename)
    dataset = dataset_to_float(dataset)
    folds = cross_validation_split(dataset, n_folds_outer_cross_val)
    scores = list()

    # Outer k-fold cross validation
    for outer_fold_number, fold in enumerate(folds, start=1):
        # Training rows are all rows of every other fold. The held-out fold
        # is excluded by position (no deep equality scan as with
        # list.remove) and the flattening is a single linear pass instead of
        # the quadratic sum(list_of_lists, []).
        train_set = [
            row
            for position, other_fold in enumerate(folds, start=1)
            if position != outer_fold_number
            for row in other_fold
        ]

        # Test rows are copies with the last column (the label) blanked out,
        # so the original fold still holds the true labels.
        test_set = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)

        # Inner k-fold cross validation ( grid search )
        best_couple, inner_accuracy = two_grid_search_with_accuracy_return(
            decision_tree, n_folds_inner_cross_val, train_set, max_depth, min_size)

        # Evaluate results on outer cross validation test set
        predictions = decision_tree(train_set, test_set, best_couple[0], best_couple[1])
        actual = [row[-1] for row in fold]
        outer_accuracy = accuracy_metric(actual, predictions)

        print("-" * 10 + " Outer Fold n. " + str(outer_fold_number) + " " + "-" * 10)
        print("Best params selected by inner cross validation (max_depth,min_size): "+str(best_couple[0])+" "+str(best_couple[1]))
        print("Best params mean accuracy in the inner cross validation: " + str(inner_accuracy))
        print("Best params accuracy in the outer cross validation: " + str(outer_accuracy))
        scores.append(outer_accuracy)

    print("-" * 10 + " Final Results " + " " + "-" * 10)
    print("Total Accuracy mean: " + str(mean(scores)))
    print("Total Accuracy std dev: " + str(stdev(scores)))
    return scores
Esempio n. 4
0
def banknote_main(tr_percentages, number_repetitions, n_folds_2grid_search,
                  max_depth, min_size):
    """Evaluate the decision tree on the banknote data for several train sizes.

    For each entry of ``tr_percentages`` the dataset
    ``../dataset/data_banknote_authentication.csv`` is randomly split
    ``number_repetitions`` times. On every split a grid search picks the
    tree parameters on the training part, and the resulting tree is scored
    on the test part. Mean and standard deviation of the accuracies are
    printed per percentage and finally plotted with ``plot_results``.

    Args:
        tr_percentages: Iterable of training-split percentages to evaluate.
        number_repetitions: Random splits per percentage; must be at least 2.
        n_folds_2grid_search: Fold count given to ``two_grid_search``; must
            be at least 2.
        max_depth: Forwarded unchanged to ``two_grid_search``.
        min_size: Forwarded unchanged to ``two_grid_search``.

    Raises:
        ValueError: If ``number_repetitions`` or ``n_folds_2grid_search``
            is smaller than 2.
    """
    if number_repetitions < 2 or n_folds_2grid_search < 2:
        raise ValueError("Illegal value parameter")

    samples = dataset_to_float(
        load_csv('../dataset/data_banknote_authentication.csv'))

    rule = "-" * 10
    means = []
    spreads = []
    for share in tr_percentages:
        # One accuracy per independent random train/test split.
        run_scores = []
        for _ in range(number_repetitions):
            fit_rows, held_out = random_training_test_split(samples, share)
            best = two_grid_search(decision_tree, n_folds_2grid_search,
                                   fit_rows, max_depth, min_size)
            guesses = decision_tree(fit_rows, held_out, best[0], best[1])
            truth = [row[-1] for row in held_out]
            run_scores.append(accuracy_metric(truth, guesses))

        share_text = str(share)
        print(rule + " training split %" + share_text + " " + rule)
        print("Accuracies of training split %" + share_text + " : " +
              str(run_scores))

        average = mean(run_scores)
        spread = stdev(run_scores)
        print("Accuracy mean: " + str(average))
        print("Accuracy std dev: " + str(spread))
        means.append(average)
        spreads.append(spread)

    plt.figure("BankNote dataset")
    plot_results(tr_percentages, means, spreads)
    plt.show()