# Consolidated imports for the evaluation functions below. The dataset
# loaders (load_compas, load_adult, ...) and the plotting/temp-file helpers
# are assumed to come from this project's own modules.
import os
import pickle
import random
import time
from collections import defaultdict
from multiprocessing import Lock, Process

import numpy
import pandas as pd
import matplotlib.pyplot as plt
from cycler import cycler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit


def run_eval(dataset, iterations):
    suffixes = ['Adaboost', 'AdaFair', 'SMOTEBoost', 'Fish et al.']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "dutch":
        X, y, sa_index, p_Group, x_control = load_dutch_data()
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "credit":
        X, y, sa_index, p_Group, x_control = load_credit()
    elif dataset == "diabetes":
        X, y, sa_index, p_Group, x_control = load_diabetes()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        raise ValueError("unknown dataset: " + dataset)

    create_temp_files(dataset, suffixes)

    threads = []
    # One lock per method so that worker processes from different iterations
    # can safely append to the same per-method temp file.
    mutex = [Lock() for _ in range(8)]

    print(dataset)
    random.seed(int(time.time()))

    for iteration in range(iterations):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=.5, random_state=iteration)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Spawn one training process per method for this split.
            for proc in range(len(suffixes)):
                threads.append(Process(target=train_classifier,
                                       args=(X_train, X_test, y_train, y_test,
                                             sa_index, p_Group,
                                             dataset + suffixes[proc],
                                             mutex[proc], proc, 200, 1, dataset)))
            break  # n_splits=1, so only one split per iteration

    for process in threads:
        process.start()
    for process in threads:
        process.join()

    # Each worker pickles its results to a per-method file; collect them.
    results = []
    for suffix in suffixes:
        with open(dataset + suffix, 'rb') as infile:
            results.append(pickle.load(infile).performance)

    plot_my_results_sp(results, suffixes, "Images/StatisticalParity/" + dataset, dataset)
    delete_temp_files(dataset, suffixes)
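
# A minimal driver sketch (an assumption, not part of the original script):
# the dataset keys are the ones the if/elif chain above accepts, and the
# iteration count is illustrative.
if __name__ == '__main__':
    for fairness_dataset in ("compass-gender", "adult-gender", "bank", "kdd"):
        run_eval(fairness_dataset, iterations=10)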

def load_datasets(dataset, names):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset in ("landsatM", "musk2", "spliceM", "semeion_orig", "waveformM"):
        X, y, cl_names = load_mat_data(dataset)
    else:
        # Fall back to the imbalanced-learn benchmark collection.
        from imblearn import datasets
        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(data['data'].shape[1])]
        X = data['data']
        y = data['target']
        y[y != 1] = 0

    names.add(dataset)

    # Summary: number of samples, number of features, and the
    # negative-to-positive class ratio rounded to two decimals.
    output = [X.shape[0],
              X.shape[1],
              float(format(len(y[y != 1]) / sum(y[y == 1]), '.2f'))]
    return output

def get_dataset(dataset):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset in ("landsatM", "musk2", "spliceM", "semeion_orig", "waveformM"):
        X, y, cl_names = load_mat_data(dataset)
    elif dataset not in ['bloob', 'circle', 'moon']:
        # Anything else falls back to the imbalanced-learn benchmark
        # collection. The synthetic 'bloob'/'circle'/'moon' sets are not
        # produced here and are expected to be handled by the caller.
        from imblearn import datasets
        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(data['data'].shape[1])]
        X = data['data']
        y = data['target']
        y[y != 1] = 0
    return X, y, cl_names
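
# A minimal sketch (an assumption, not project API) of how get_dataset pairs
# with a stratified split; example_split is an illustrative helper name.
def example_split(dataset="adult", test_size=0.5):
    X, y, cl_names = get_dataset(dataset)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)
    train_index, test_index = next(sss.split(X, y))
    return X[train_index], X[test_index], y[train_index], y[test_index]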

def run_eval(dataset, base_learners, methods):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset in ("landsatM", "musk2", "spliceM", "semeion_orig", "waveformM"):
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets
        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(data['data'].shape[1])]
        X = data['data']
        y = data['target']
        y[y != 1] = 0

    # Train all methods in parallel; each worker pickles its scores to temp/.
    processes = []
    for method in methods:
        p = Process(target=train_classifier,
                    args=(X, y, base_learners, method, cl_names))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    list_of_scores = []
    for method in methods:
        with open('temp/' + method, 'rb') as filehandle:
            list_of_scores.append(pickle.load(filehandle))

    # Turn signed confidence scores into margins: multiplying by the
    # {+1, -1} labels makes the sign encode correctness.
    y[y != 1] = -1
    for idx in range(len(list_of_scores)):
        list_of_scores[idx] = numpy.array(list_of_scores[idx]) * y

    overall_confs = []
    positive_confs = []
    negative_confs = []
    for conf in list_of_scores:
        overall_confs.append(conf)
        positive_confs.append(conf[y == 1])
        negative_confs.append(conf[y == -1])

    num_bins = 40
    fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))
    plt.rcParams.update({'font.size': 12})

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'dimgray', 'peru',
              'hotpink', 'tomato']
    default_cycler = (cycler(color=colors) +
                      cycler(linestyle=['-', (0, (1, 1)), '--', '-.',
                                        (0, (5, 10)), (0, (5, 1)), '-',
                                        (0, (1, 1)), '--', '-.', (0, (5, 10))]))
    ax1.set_prop_cycle(default_cycler)
    ax2.set_prop_cycle(default_cycler)
    ax3.set_prop_cycle(default_cycler)

    ax1.set_title("Positive CDF")
    ax1.grid(True)
    ax1.set_xlim(-1, 1)
    ax2.set_xlim(-1, 1)
    ax3.set_xlim(-1, 1)

    output = defaultdict(list)

    for idx in range(len(positive_confs)):
        pos_conf = positive_confs[idx]
        # `normed` was removed from numpy.histogram; `density` is its replacement.
        counts_positives, bin_edges_positives = numpy.histogram(
            pos_conf, bins=num_bins, density=True)
        cdf_positives = numpy.cumsum(counts_positives)
        ax1.plot(bin_edges_positives[:-1], cdf_positives / cdf_positives[-1],
                 label=methods[idx])
        output[methods[idx]].append(bin_edges_positives[:-1])
        output[methods[idx]].append(cdf_positives)

    ax1.set_xlabel("Margin")
    ax1.set_ylabel("Cumulative Distribution")
    ax1.axhline(0, color='black')
    ax1.axvline(0, color='black')

    ax2.grid(True)
    ax2.axhline(0, color='black')
    ax2.axvline(0, color='black')
    ax2.set_title("Negative CDF")
    for idx in range(len(negative_confs)):
        if idx == 0:
            ax2.set_ylabel("Cumulative Distribution")
            ax2.set_xlabel("Margin")
        neg_conf = negative_confs[idx]
        counts_negatives, bin_edges_negatives = numpy.histogram(
            neg_conf, bins=num_bins, density=True)
        cdf_negatives = numpy.cumsum(counts_negatives)
        ax2.plot(bin_edges_negatives[:-1], cdf_negatives / cdf_negatives[-1],
                 label=methods[idx])
        output[methods[idx]].append(bin_edges_negatives[:-1])
        output[methods[idx]].append(cdf_negatives)

    ax3.grid(True)
    ax3.axhline(0, color='black')
    ax3.axvline(0, color='black')
    ax3.set_title("Overall CDF")
    for idx in range(len(overall_confs)):
        if idx == 0:
            ax3.set_ylabel("Cumulative Distribution")
            ax3.set_xlabel("Margin")
        over_conf = overall_confs[idx]
        counts_overall, bin_edges_overall = numpy.histogram(
            over_conf, bins=num_bins, density=True)
        cdf_overall = numpy.cumsum(counts_overall)
        ax3.plot(bin_edges_overall[:-1], cdf_overall / cdf_overall[-1],
                 label=methods[idx])
        output[methods[idx]].append(bin_edges_overall[:-1])
        output[methods[idx]].append(cdf_overall)

    plt.legend(loc='upper center', bbox_to_anchor=(-0.7, 1.305), ncol=5)
    if not os.path.exists("Images/cdf_plots/" + dataset):
        os.makedirs("Images/cdf_plots/" + dataset)
    plt.savefig("Images/cdf_plots/" + dataset + "/cdf_" + str(base_learners) + ".png",
                bbox_inches='tight', dpi=200)
    return output

def run_eval(dataset, baseL, methods):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset in ("landsatM", "musk2", "spliceM", "semeion_orig", "waveformM"):
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets
        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(data['data'].shape[1])]
        X = data['data']
        y = data['target']
        y[y != 1] = 0

    print("===============-- " + dataset + " --===============")

    processes = []
    for method in methods:
        p = Process(target=train_and_predict, args=(X, y, baseL, method))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # Each worker pickles its predictions; aggregate the per-method stats.
    list_of_dicts = []
    for method in methods:
        with open('temp_preds_adaac/' + method, 'rb') as filehandle:
            list_of_dicts.append(update_stats(pickle.load(filehandle)))

    plot_amort_vs_non_amort(methods, list_of_dicts, baseL,
                            "Images/Amort_vs_non_amort/" + dataset + "/")
    return list_of_dicts

def run_eval(dataset, folds, iterations, baseL, methods):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset == "rain_aus":
        X, y, cl_names = load_rain_aus()
    elif dataset in ("landsatM", "musk2", "spliceM", "semeion_orig", "waveformM"):
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets
        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(data['data'].shape[1])]
        X = data['data']
        y = data['target']
        y[y != 1] = 0

    # One-line summary: attribute count, positives, negatives, class ratio.
    unique_attr = set(i.split("?")[0] for i in cl_names)
    print(dataset + "\t" + str(len(unique_attr)) + "\t" +
          str(f'{sum(y[y == 1]):,}') + "\t" +
          str(f'{len(y[y != 1]):,}') + "\t1:" +
          str(format(len(y[y != 1]) / sum(y[y == 1]), '.2f')))

    # One performance dict and one resource-stats dict per method,
    # each keyed by the number of weak learners.
    list_of_dicts = []
    list_of_dicts_stats = []
    for _ in range(len(methods)):
        list_of_dicts.append(defaultdict(dict))
        list_of_dicts_stats.append(defaultdict(dict))
    for weak_learners in baseL:
        for item in list_of_dicts:
            item[weak_learners] = defaultdict(list)
        for item in list_of_dicts_stats:
            item[weak_learners] = defaultdict(list)

    for samples in range(iterations):
        sss = StratifiedKFold(n_splits=folds, shuffle=True,
                              random_state=int(time.time()))
        for weak_learners in baseL:
            print("iteration=", samples, " weak learners=", weak_learners)
            for train_index, test_index in sss.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                processes = []
                for method in methods:
                    p = Process(target=train_and_predict,
                                args=(X_train, y_train, X_test,
                                      weak_learners, method, cl_names))
                    p.start()
                    processes.append(p)
                for p in processes:
                    p.join()

                for index, method in enumerate(methods):
                    with open('temp_preds/' + method, 'rb') as filehandle:
                        list_of_dicts[index] = update_performance_stats(
                            calculate_performance(y_test, pickle.load(filehandle)),
                            list_of_dicts[index],
                            weak_learners)
                    with open('temp_preds/stats_' + method, 'rb') as filehandle:
                        list_of_dicts_stats[index] = update_resource_stats(
                            pickle.load(filehandle),
                            list_of_dicts_stats[index],
                            weak_learners,
                            method)

    plot_single_dataset(methods, list_of_dicts,
                        "Images/Performance/" + dataset + "/", baseL)
    plot_resource_stats_time(methods, list_of_dicts_stats,
                             "Images/Performance/" + dataset + "/Resource/", baseL)
    plot_resource_stats_scores(methods, list_of_dicts_stats,
                               "Images/Performance/" + dataset + "/Resource/", baseL)
    return list_of_dicts, list_of_dicts_stats

def run_eval(dataset, base_learners, methods):
    if dataset == "wilt":
        X, y, cl_names = load_wilt()
    elif dataset == "adult":
        X, y, cl_names = load_adult()
    elif dataset == "diabetes":
        X, y, cl_names = load_diabetes()
    elif dataset == "phoneme":
        X, y, cl_names = load_phoneme()
    elif dataset == "mushroom":
        X, y, cl_names = load_mushroom()
    elif dataset == "electricity":
        X, y, cl_names = load_electricity()
    elif dataset == "speeddating":
        X, y, cl_names = load_speed_dating()
    elif dataset == "credit":
        X, y, cl_names = load_credit()
    elif dataset == "eeg_eye":
        X, y, cl_names = load_eeg_eye()
    elif dataset == "spam":
        X, y, cl_names = load_spam()
    elif dataset == "skin":
        X, y, cl_names = load_skin()
    elif dataset == "bank":
        X, y, cl_names = load_bank()
    elif dataset == "kdd":
        X, y, cl_names = load_kdd()
    elif dataset in ("landsatM", "musk2", "spliceM", "semeion_orig", "waveformM"):
        X, y, cl_names = load_mat_data(dataset)
    else:
        from imblearn import datasets
        data = datasets.fetch_datasets()[dataset]
        cl_names = ["feature_" + str(i) for i in range(data['data'].shape[1])]
        X = data['data']
        y = data['target']
        y[y != 1] = 0

    processes = []
    for method in methods:
        p = Process(target=train_classifier,
                    args=(X, y, base_learners, method, cl_names))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # Collect per-method feature importances from the pickled models.
    raw_data = dict()
    f_num = 0
    for method in methods:
        with open('temp_features/' + method, 'rb') as filehandle:
            model = pickle.load(filehandle)
            raw_data[method] = model.feature_importances_
            f_num = len(model.feature_importances_)

    # One stacked bar per method, one segment per feature.
    index = ["Feature " + str(k) for k in range(1, f_num + 1)]
    df = pd.DataFrame(raw_data, index=index)
    df = df.transpose()

    ax = df.plot.bar(stacked=True, alpha=0.75, rot=25)
    ax.set_ylabel("Feature importance")
    ax.set_xlabel("Methods")
    ax.legend(loc='center left', bbox_to_anchor=(0.1, 1.07), ncol=3)
    ax.figure.savefig('Images/features/' + dataset + '.png',
                      bbox_inches='tight', dpi=200)
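
# The same if/elif loader chain recurs in every run_eval above. A hedged
# refactoring sketch: a dispatch table over the loaders used in this file.
# _load_by_name is a new, illustrative helper, not existing project API.
_DATASET_LOADERS = {
    "wilt": load_wilt,
    "adult": load_adult,
    "diabetes": load_diabetes,
    "phoneme": load_phoneme,
    "mushroom": load_mushroom,
    "electricity": load_electricity,
    "speeddating": load_speed_dating,
    "credit": load_credit,
    "eeg_eye": load_eeg_eye,
    "spam": load_spam,
    "skin": load_skin,
    "bank": load_bank,
    "kdd": load_kdd,
}


def _load_by_name(dataset):
    if dataset in _DATASET_LOADERS:
        return _DATASET_LOADERS[dataset]()
    if dataset in ("landsatM", "musk2", "spliceM", "semeion_orig", "waveformM"):
        return load_mat_data(dataset)
    # Fall back to the imbalanced-learn benchmark collection, mirroring
    # the else-branches above.
    from imblearn import datasets
    data = datasets.fetch_datasets()[dataset]
    cl_names = ["feature_" + str(i) for i in range(data['data'].shape[1])]
    X = data['data']
    y = data['target']
    y[y != 1] = 0
    return X, y, cl_names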