def __init__(self, image_file, label_file, patch_size=(101, 101)):
    """Load one dataset image and its CSV labels into dense arrays.

    Args:
        image_file: path of the source image, loaded twice — once padded
            with a ``patch_size`` border (``self.input``) and once
            unpadded to measure the true image size.
        label_file: CSV of labelled points; each row unpacks as
            ``(col, row, p)``.
        patch_size: (width, height) of the sampling patch, also used as
            the border for the padded input image.
    """
    self.input = prepared_dataset_image(image_file, border=patch_size)
    self.image_size = image_size(prepared_dataset_image(image_file))
    self.patch_size = patch_size
    w, h = self.image_size
    # Dense label mask, indexed [row, col] (numpy convention).
    self.output = np.zeros((h, w))
    self.verbose = TT.verbose
    # Mark every labelled pixel; the third CSV field is ignored here
    # (presumably a probability/weight — TODO confirm against load_csv).
    for c, r, _p in load_csv(label_file):
        self.output[r, c] = 1.0
def sim3():
    """Simulation 3: fit a depth-5 tree on dataset 3 and visualize it.

    Loads the 2-D dataset, plots the points, builds a tree with
    max_depth=5 / min_size=1, prints its structure, and overlays the
    decision boundaries on the plot.
    """
    path = '../dataset/dataset3.csv'
    data = dataset_to_float(load_csv(path))
    plot_2D_dataset(data, "Simulation n.3")
    tree = build_tree(data, 5, 1)
    header = "-" * 10 + " Sim.3 TREE " + "-" * 10
    print(header)
    print_tree(tree.root, ['x', 'y'])
    print_tree_separating_2D(tree.root)
    show_plot()
def main(*, refresh_drafts=False, refresh_players=True):
    """Build draft_data.csv by merging draft, career-stat and roster data.

    The original code gated the slow scraping steps behind hard-coded
    ``if False:`` / ``if True:`` blocks that had to be edited by hand;
    they are now keyword-only flags whose defaults preserve the previous
    behavior exactly.

    Args:
        refresh_drafts: re-scrape drafts (very slow; only needed once a
            year). When False, load the cached drafts.csv.
        refresh_players: re-fetch career stats and rosters and cache
            them; when False, load the cached CSVs.
    """
    start_season = 1995
    end_season = 2020
    regular_season = True
    # Drafts only need to be updated once a year — the fetch takes a long time.
    if refresh_drafts:
        drafts = util.get_drafts(start_season, end_season)
        util.save_csv("drafts.csv", drafts)
    else:
        drafts = util.load_csv("drafts.csv")
    if refresh_players:
        players = get_career_stats(start_season, end_season, regular_season)
        util.save_csv("players.csv", players)
        rosters = util.get_rosters()
        util.save_csv("rosters.csv", rosters)
    else:
        players = util.load_csv("players.csv")
        rosters = util.load_csv("rosters.csv")
    drafts = update_team_names(drafts)
    drafts = drafts.sort_values(["team.name", "year", "round"],
                                ascending=[1, 0, 1])
    # Merge all data into one dataframe. Join on lower-cased names because
    # capitalization differs between the three sources.
    drafts['name_lower'] = drafts['prospect.fullName'].str.lower()
    players['name_lower'] = players['playerName'].str.lower()
    rosters['name_lower'] = rosters['fullName'].str.lower()
    draft_data = pd.merge(drafts, players, how="left", on="name_lower",
                          sort=False, suffixes=("", "_x"))
    draft_data = pd.merge(draft_data, rosters, how="left", on="name_lower",
                          sort=False, suffixes=("", "_y"))
    # Update positions and set statuses for each filter in the visualization,
    # then get rid of unneeded columns.
    draft_data = set_statuses(draft_data)
    draft_data = clean_data(draft_data)
    draft_data = reduce_columns(draft_data)
    util.save_csv("draft_data.csv", draft_data)
def positive(self):
    """Collect, cache and return the positive (labelled) samples.

    Scans every (data_file, label_file) pair in ``self.files``, loading
    the label CSV for each. Results are memoized on the instance.

    Returns:
        tuple: ``(positive, positive_size)`` where ``positive`` maps
        data_file -> list of label rows and ``positive_size`` is the
        total number of labels. Also populates
        ``self._positive_expanded`` (data_file -> {pixel index: p}).
    """
    # Fast path: already collected on a previous call.
    if hasattr(self, '_positive'):
        return self._positive, self._positive_size
    TT.debug("Collecting positive samples.")
    self._positive = {}
    self._positive_size = 0
    self._positive_expanded = {}
    for data_file, label_file in self.files:
        labels = load_csv(os.path.join(self.root_path, label_file))
        self._positive[data_file] = labels
        self._positive_size += len(labels)
        self._positive_expanded[data_file] = {}
        for col, row, p in labels:
            key = index_at_pixel(col=col, row=row, size=self.image_size)
            self._positive_expanded[data_file][key] = p
    TT.debug("Found", self._positive_size, "positive samples.")
    # BUG FIX: the original ended with `return self.positive`, which yields
    # the bound method itself (or needlessly re-enters if this is a
    # property). Return the cached tuple directly, matching the fast path.
    return self._positive, self._positive_size
def sim4():
    """Simulation 4: train accuracy vs. tree complexity over depths 1-20.

    NOTE(review): this reuses dataset3.csv just like sim3 — presumably
    intentional, but worth confirming. For each max depth the tree is
    trained and evaluated on the full dataset, then accuracy and node
    count are plotted against depth.
    """
    data = dataset_to_float(load_csv('../dataset/dataset3.csv'))
    depths = range(1, 21)
    accuracies = []
    node_counts = []
    for depth in depths:
        size, predicted, expected = decision_tree_prediction_and_size(
            data, data, depth, 1)
        accuracies.append(accuracy_metric(expected, predicted))
        node_counts.append(size)
    plt.figure("Simulation n.4")
    acc_line, = plt.plot(depths, accuracies, 'r', label='Train accuracy')
    cmpx_line, = plt.plot(depths, node_counts, 'g',
                          label='Tree complexity(# nodes)')
    plt.legend(handles=[acc_line, cmpx_line], loc=4)
    plt.xticks(depths)
    plt.xlabel('Maximum Tree Depth')
    plt.draw()
def parkinson_main(n_folds_outer_cross_val, n_folds_inner_cross_val,
                   max_depth, min_size):
    """Nested cross-validation of a decision tree on the Parkinson data.

    The outer loop estimates generalization accuracy; for each outer
    fold an inner grid search picks the best (max_depth, min_size) pair,
    which is then evaluated on the held-out fold.

    Args:
        n_folds_outer_cross_val: outer CV fold count (>= 2).
        n_folds_inner_cross_val: inner CV fold count (>= 2).
        max_depth: candidate max-depth values for the grid search.
        min_size: candidate min-size values for the grid search.

    Returns:
        list: accuracy of each outer fold.

    Raises:
        ValueError: if either fold count is below 2.
    """
    if n_folds_outer_cross_val < 2 or n_folds_inner_cross_val < 2:
        raise ValueError("Illegal value parameter")
    data = dataset_to_float(
        load_csv('../dataset/parkinson_recording_data.csv'))
    folds = cross_validation_split(data, n_folds_outer_cross_val)
    scores = []
    for fold_no, fold in enumerate(folds, start=1):
        # Training rows: every fold except the current one, flattened.
        remaining = list(folds)
        remaining.remove(fold)
        train_rows = sum(remaining, [])
        # Test rows are copies of the fold with the class column blanked.
        test_rows = []
        for row in fold:
            blanked = list(row)
            blanked[-1] = None
            test_rows.append(blanked)
        # Inner k-fold cross validation (grid search).
        best_params, inner_acc = two_grid_search_with_accuracy_return(
            decision_tree, n_folds_inner_cross_val, train_rows,
            max_depth, min_size)
        # Evaluate the chosen params on the outer held-out fold.
        predictions = decision_tree(train_rows, test_rows,
                                    best_params[0], best_params[1])
        truth = [row[-1] for row in fold]
        outer_acc = accuracy_metric(truth, predictions)
        print("-" * 10 + " Outer Fold n. " + str(fold_no) + " " + "-" * 10)
        print("Best params selected by inner cross validation (max_depth,min_size): " + str(best_params[0]) + " " + str(best_params[1]))
        print("Best params mean accuracy in the inner cross validation: " + str(inner_acc))
        print("Best params accuracy in the outer cross validation: " + str(outer_acc))
        scores.append(outer_acc)
    print("-" * 10 + " Final Results " + " " + "-" * 10)
    print("Total Accuracy mean: " + str(mean(scores)))
    print("Total Accuracy std dev: " + str(stdev(scores)))
    return scores
def banknote_main(tr_percentages, number_repetitions, n_folds_2grid_search,
                  max_depth, min_size):
    """Accuracy of a grid-searched tree vs. training-split size (banknote).

    For every percentage in ``tr_percentages`` the dataset is randomly
    split ``number_repetitions`` times; each run grid-searches
    (max_depth, min_size) on the training part and scores on the test
    part. Mean and standard deviation per percentage are printed and
    plotted.

    Raises:
        ValueError: if ``number_repetitions`` or ``n_folds_2grid_search``
            is below 2.
    """
    if number_repetitions < 2 or n_folds_2grid_search < 2:
        raise ValueError("Illegal value parameter")
    data = dataset_to_float(
        load_csv('../dataset/data_banknote_authentication.csv'))
    mean_accuracies = []
    std_devs = []
    for percentage in tr_percentages:
        run_accs = []
        # Repeat the random split / evaluation to get stable statistics.
        for _ in range(number_repetitions):
            train, test = random_training_test_split(data, percentage)
            params = two_grid_search(decision_tree, n_folds_2grid_search,
                                     train, max_depth, min_size)
            predictions = decision_tree(train, test, params[0], params[1])
            truth = [row[-1] for row in test]
            run_accs.append(accuracy_metric(truth, predictions))
        print("-" * 10 + " training split %" + str(percentage) + " " + "-" * 10)
        print("Accuracies of training split %" + str(percentage) + " : " + str(run_accs))
        acc_mean = mean(run_accs)
        acc_std = stdev(run_accs)
        print("Accuracy mean: " + str(acc_mean))
        print("Accuracy std dev: " + str(acc_std))
        mean_accuracies.append(acc_mean)
        std_devs.append(acc_std)
    plt.figure("BankNote dataset")
    plot_results(tr_percentages, mean_accuracies, std_devs)
    plt.show()