Example #1
    def __init__(self, image_file, label_file, patch_size=(101, 101)):
        # Load the image padded by the patch size and remember its dimensions.
        self.input = prepared_dataset_image(image_file, border=patch_size)
        self.image_size = image_size(prepared_dataset_image(image_file))
        self.patch_size = patch_size
        width, height = self.image_size
        # Build a binary label map with 1.0 at every labelled (col, row) pixel.
        self.output = np.zeros((height, width))
        self.verbose = TT.verbose
        for (col, row, p) in load_csv(label_file):
            self.output[row, col] = 1.0
Example #2
def sim3():
    # Load the third dataset and convert every value to float.
    filename = '../dataset/dataset3.csv'
    dataset = load_csv(filename)
    dataset = dataset_to_float(dataset)
    plot_2D_dataset(dataset, "Simulation n.3")
    # Build a tree (max depth 5, min size 1), then print and plot it.
    tree = build_tree(dataset, 5, 1)
    print("-" * 10 + " Sim.3 TREE " + "-" * 10)
    print_tree(tree.root, ['x', 'y'])
    print_tree_separating_2D(tree.root)
    show_plot()
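These examples rely on two small helpers, load_csv and dataset_to_float, whose definitions are not shown. A minimal sketch of what they might look like, assuming load_csv simply reads the CSV into a list of string rows (Example #2 converts them to floats afterwards; Examples #1 and #4 appear to use a variant that yields numeric (col, row, p) triples):

import csv

# Hypothetical sketch: read a CSV file into a list of rows (lists of strings),
# skipping empty lines. The real helper may differ.
def load_csv(filename):
    with open(filename, newline='') as f:
        return [row for row in csv.reader(f) if row]

# Hypothetical sketch: convert every value in every row to float.
def dataset_to_float(dataset):
    return [[float(value) for value in row] for row in dataset]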
Example #3
def main():
    start_season = 1995
    end_season = 2020
    regular_season = True

    # Only need to update drafts once a year
    update_drafts = False
    if update_drafts:
        drafts = util.get_drafts(start_season, end_season)  # Takes a loonnnggg time to run
        util.save_csv("drafts.csv", drafts)
    else:
        drafts = util.load_csv("drafts.csv")

    update_players_and_rosters = True
    if update_players_and_rosters:
        players = get_career_stats(start_season, end_season, regular_season)
        util.save_csv("players.csv", players)

        rosters = util.get_rosters()
        util.save_csv("rosters.csv", rosters)
    else:
        players = util.load_csv("players.csv")
        rosters = util.load_csv("rosters.csv")

    drafts = update_team_names(drafts)
    drafts = drafts.sort_values(["team.name", "year", "round"], ascending=[1, 0, 1])

    # Merge all data into one dataframe
    drafts['name_lower'] = drafts['prospect.fullName'].str.lower()
    players['name_lower'] = players['playerName'].str.lower()
    rosters['name_lower'] = rosters['fullName'].str.lower()
    draft_data = pd.merge(drafts, players, how="left", on="name_lower", sort=False, suffixes=("", "_x"))
    draft_data = pd.merge(draft_data, rosters, how="left", on="name_lower", sort=False, suffixes=("", "_y"))

    # Update positions and set statuses for each filter in the visualization, then drop unneeded columns.
    draft_data = set_statuses(draft_data)
    draft_data = clean_data(draft_data)
    draft_data = reduce_columns(draft_data)

    util.save_csv("draft_data.csv", draft_data)
Example #4
    def positive(self):
        # Return the cached positive samples if they were already collected.
        if hasattr(self, '_positive'):
            return self._positive, self._positive_size

        TT.debug("Collecting positive samples.")
        self._positive = {}
        self._positive_size = 0
        self._positive_expanded = {}
        for data_file, label_file in self.files:
            labels = load_csv(os.path.join(self.root_path, label_file))
            self._positive[data_file] = labels
            self._positive_size += len(labels)
            self._positive_expanded[data_file] = {}
            for col, row, p in labels:
                self._positive_expanded[data_file][index_at_pixel(col=col, row=row, size=self.image_size)] = p
        TT.debug("Found", self._positive_size, "positive samples.")
        return self._positive, self._positive_size
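index_at_pixel is not defined in these snippets. A plausible sketch, assuming it maps a (col, row) pixel coordinate to a flat index and that size unpacks as (width, height), as in Example #1:

# Hypothetical sketch of index_at_pixel: flatten a (col, row) coordinate,
# assuming size is (width, height) as in Example #1.
def index_at_pixel(col, row, size):
    width, height = size
    return row * width + col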
Example #5
def sim4():
    filename = '../dataset/dataset3.csv'
    dataset = load_csv(filename)
    dataset = dataset_to_float(dataset)
    train_accuracy = list()
    nodes_numbers = list()
    # Fit trees of increasing maximum depth on the whole dataset and record
    # the training accuracy and tree size (number of nodes) at each depth.
    for i in range(1, 21):
        tree_size, predicted, expected = decision_tree_prediction_and_size(
            dataset, dataset, i, 1)
        acc = accuracy_metric(expected, predicted)
        train_accuracy.append(acc)
        nodes_numbers.append(tree_size)
    x = range(1, 21)
    plt.figure("Simulation n.4")
    line1, = plt.plot(x, train_accuracy, 'r', label='Train accuracy')
    line2, = plt.plot(x, nodes_numbers, 'g', label='Tree complexity(# nodes)')
    plt.legend(handles=[line1, line2], loc=4)
    plt.xticks(x)
    plt.xlabel('Maximum Tree Depth')
    plt.draw()
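accuracy_metric is used here and in the following examples but not shown. A minimal sketch, assuming it returns classification accuracy as a percentage (it may just as well return a fraction):

# Hypothetical sketch: share of predictions that match the actual labels,
# expressed as a percentage.
def accuracy_metric(actual, predicted):
    correct = sum(1 for a, p in zip(actual, predicted) if a == p)
    return correct / float(len(actual)) * 100.0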
Example #6
def parkinson_main(n_folds_outer_cross_val, n_folds_inner_cross_val, max_depth, min_size):

    if n_folds_outer_cross_val < 2 or n_folds_inner_cross_val < 2:
        raise ValueError("Illegal value parameter")

    filename = '../dataset/parkinson_recording_data.csv'
    dataset = load_csv(filename)
    dataset = dataset_to_float(dataset)
    folds = cross_validation_split(dataset, n_folds_outer_cross_val)
    scores = list()
    outer_fold_number = 0

    # Outer k-fold cross validation
    for fold in folds:
        outer_fold_number += 1
        # Prepare train and test set
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None

        # Inner k-fold cross validation ( grid search )
        best_couple, inner_accuracy = two_grid_search_with_accuracy_return(decision_tree, n_folds_inner_cross_val, train_set, max_depth, min_size)
        # Evaluate results on outer cross validation test set
        predictions = decision_tree(train_set, test_set, best_couple[0], best_couple[1])
        actual = [row[-1] for row in fold]
        outer_accuracy = accuracy_metric(actual, predictions)
        print("-" * 10 + " Outer Fold n. " + str(outer_fold_number) + " " + "-" * 10)
        print("Best params selected by inner cross validation (max_depth,min_size): "+str(best_couple[0])+" "+str(best_couple[1]))
        print("Best params mean accuracy in the inner cross validation: " + str(inner_accuracy))
        print("Best params accuracy in the outer cross validation: " + str(outer_accuracy))
        scores.append(outer_accuracy)

    print("-" * 10 + " Final Results " + " " + "-" * 10)
    print("Total Accuracy mean: " + str(mean(scores)))
    print("Total Accuracy std dev: " + str(stdev(scores)))
    return scores
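cross_validation_split is also not shown. A plausible sketch, assuming it shuffles the rows and partitions them into n_folds folds of equal size, dropping any remainder rows:

import random

# Hypothetical sketch: shuffle the dataset and cut it into n_folds folds of
# equal size (remainder rows are dropped).
def cross_validation_split(dataset, n_folds):
    rows = list(dataset)
    random.shuffle(rows)
    fold_size = len(rows) // n_folds
    return [rows[i * fold_size:(i + 1) * fold_size] for i in range(n_folds)]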
Example #7
def banknote_main(tr_percentages, number_repetitions, n_folds_2grid_search,
                  max_depth, min_size):

    if number_repetitions < 2 or n_folds_2grid_search < 2:
        raise ValueError("Illegal value parameter")

    filename = '../dataset/data_banknote_authentication.csv'
    dataset = load_csv(filename)
    dataset = dataset_to_float(dataset)
    mean_accuracies = list()
    std_devs = list()
    # For each percentage of training split
    for percentage in tr_percentages:
        accuracies = list()
        # Repeat the random split and test-set validation number_repetitions times for each split percentage
        for run in range(1, number_repetitions + 1):
            train, test = random_training_test_split(dataset, percentage)
            result = two_grid_search(decision_tree, n_folds_2grid_search,
                                     train, max_depth, min_size)
            predictions = decision_tree(train, test, result[0], result[1])
            actual = [row[-1] for row in test]
            accuracy = accuracy_metric(actual, predictions)
            accuracies.append(accuracy)
        print("-" * 10 + " training split %" + str(percentage) + " " +
              "-" * 10)
        print("Accuracies of training split %" + str(percentage) + " : " +
              str(accuracies))
        mean_acc = mean(accuracies)
        std_dev = stdev(accuracies)
        print("Accuracy mean: " + str(mean_acc))
        print("Accuracy std dev: " + str(std_dev))
        mean_accuracies.append(mean_acc)
        std_devs.append(std_dev)

    plt.figure("BankNote dataset")
    plot_results(tr_percentages, mean_accuracies, std_devs)
    plt.show()
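random_training_test_split is not shown either. A minimal sketch, assuming percentage is given on a 0-100 scale (as the printed messages suggest) and denotes the share of rows that go into the training set:

import random

# Hypothetical sketch: shuffle the rows and put `percentage` percent of them
# into the training set, the rest into the test set.
def random_training_test_split(dataset, percentage):
    rows = list(dataset)
    random.shuffle(rows)
    cut = int(len(rows) * percentage / 100.0)
    return rows[:cut], rows[cut:]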