# Shared imports for the evaluation scripts below; the dataset loaders,
# AdaFair, and the plotting helpers are repo-local modules and are assumed
# to be imported alongside these.
import copy
import pickle
import random
import time
from multiprocessing import Lock, Process

import matplotlib.pyplot as plt
import numpy
from sklearn.model_selection import (ShuffleSplit, StratifiedShuffleSplit,
                                     train_test_split)


# Per-round analysis: trains a single AdaFair (CSB1) model and plots its
# performance, objective, and theta over the boosting rounds.
def run_eval(dataset):
    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    random.seed(1)
    rounds = 500

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    classifier = AdaFair(n_estimators=rounds, saIndex=sa_index, saValue=p_Group,
                         CSB="CSB1", use_validation=True, debug=True,
                         X_test=X_test, y_test=y_test)
    classifier.fit(X_train, y_train)

    plot_per_round(rounds, classifier.performance, classifier.objective,
                   classifier.theta,
                   'Images/' + dataset + '_per_round_analysis.png')
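# A usage sketch for the per-round analysis above; the __main__ guard and the
# chosen dataset key are illustrative, not part of the repo's driver code.
if __name__ == '__main__':
    run_eval("adult-gender")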
def run_eval(dataset):
    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    base_learners = 200
    no_cumul = train_classifier(X, y, sa_index, p_Group, 0, base_learners)
    cumul = train_classifier(X, y, sa_index, p_Group, 1, base_learners)
    plot_costs_per_round("Images/Costs/" + dataset, no_cumul, cumul)
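# The dataset dispatch above recurs in every run_eval variant in this file.
# A minimal refactoring sketch: the helper name `load_dataset` is hypothetical,
# while the loaders and their (X, y, sa_index, p_Group, x_control) return
# contract are the ones used throughout this file.
def load_dataset(dataset):
    loaders = {
        "compass-gender": lambda: load_compas("sex"),
        "compass-race": lambda: load_compas("race"),
        "adult-gender": lambda: load_adult("sex"),
        "adult-race": lambda: load_adult("race"),
        "dutch": load_dutch_data,
        "bank": load_bank,
        "credit": load_credit,
        "diabetes": load_diabetes,
        "kdd": load_kdd,
    }
    if dataset not in loaders:
        raise ValueError("unknown dataset: " + dataset)
    return loaders[dataset]()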
# Single vs. accumulated fairness costs: trains the two AdaFair variants in
# worker processes and plots their performance side by side.
def run_eval(dataset, iterations):
    suffixes = ['NC AdaFair', 'AdaFair']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "dutch":
        X, y, sa_index, p_Group, x_control = load_dutch_data()
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    create_temp_files(dataset, suffixes)

    threads = []
    mutex = []
    for lock in range(0, 2):
        mutex.append(Lock())

    random.seed(int(time.time()))

    for iter in range(0, iterations):
        start = time.time()
        sss = ShuffleSplit(n_splits=2, test_size=0.5)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Only the second configuration (suffixes[1], 'AdaFair') is
            # launched in this run.
            for proc in range(1, 2):
                threads.append(Process(target=train_classifier,
                                       args=(X_train, X_test, y_train, y_test,
                                             sa_index, p_Group,
                                             dataset + suffixes[proc],
                                             mutex[proc], proc, 200)))

        for process in threads:
            process.start()
        for process in threads:
            process.join()
        threads = []

        print("elapsed time = " + str(time.time() - start))

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix, 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()

    plot_my_results(results, suffixes, "Images/" + dataset + "_single_vs_accum", dataset)
    delete_temp_files(dataset, suffixes)
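# The workers above hand results back through pickled temp files guarded by a
# per-worker Lock; train_classifier is assumed to write its result under its
# lock. A minimal self-contained illustration of that handoff pattern (the
# worker body and file name are hypothetical, not the repo's train_classifier):
def _worker(outfile, mutex, value):
    with mutex:  # serialize access to the shared file
        with open(outfile, 'wb') as f:
            pickle.dump(value, f)

if __name__ == '__main__':
    lock = Lock()
    p = Process(target=_worker, args=('tmp_result', lock, 42))
    p.start()
    p.join()
    with open('tmp_result', 'rb') as f:
        print(pickle.load(f))  # -> 42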
# Competitor comparison: AdaFair CSB1/CSB2 against AdaBoost and Zafar et al.
# (the Zafar branch is currently disabled below).
def run_eval(dataset, iterations):
    # suffixes = ['Zafar et al.', 'Adaboost', 'AdaFair', 'SMOTEBoost']
    suffixes = ['Zafar et al.', 'Adaboost', 'AdaFair CSB2', 'AdaFair CSB1']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    create_temp_files(dataset, suffixes)

    # Init parameters for the method of Zafar et al. (default settings).
    tau = 3.0
    mu = 1.2
    cons_type = 4
    # list() so the keys can be indexed under Python 3; only the disabled
    # Zafar branch below uses this.
    sensitive_attrs = list(x_control.keys())
    loss_function = "logreg"
    EPS = 1e-6

    # sensitive_attrs_to_cov_thresh = {sensitive_attrs[0]: {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}, 2: {0: 0, 1: 0}}}
    sensitive_attrs_to_cov_thresh = {0: {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}, 2: {0: 0, 1: 0}}}
    cons_params = {"cons_type": cons_type,
                   "tau": tau,
                   "mu": mu,
                   "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}

    threads = []
    mutex = []
    for lock in range(0, 8):
        mutex.append(Lock())

    random.seed(int(time.time()))

    for iter in range(0, iterations):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for proc in range(0, 4):
                # Only proc == 3 (AdaFair CSB1) is launched in this run.
                if proc < 3:
                    time.sleep(1)
                    continue
                if proc > 0:
                    # deepcopy so each worker gets its own training arrays.
                    threads.append(Process(target=train_classifier, args=(
                        copy.deepcopy(X_train), X_test, copy.deepcopy(y_train), y_test,
                        sa_index, p_Group, dataset + suffixes[proc], mutex[proc],
                        proc, 500, 1)))
                # elif proc == 0:
                #     temp_x_control_train = defaultdict(list)
                #     temp_x_control_test = defaultdict(list)
                #
                #     temp_x_control_train[sensitive_attrs[0]] = x_control[sensitive_attrs[0]][train_index]
                #     temp_x_control_test[sensitive_attrs[0]] = x_control[sensitive_attrs[0]][test_index]
                #
                #     x_zafar_train, y_zafar_train, x_control_train = ut.conversion(X[train_index], y[train_index], dict(temp_x_control_train), 1)
                #     x_zafar_test, y_zafar_test, x_control_test = ut.conversion(X[test_index], y[test_index], dict(temp_x_control_test), 1)
                #
                #     threads.append(Process(target=train_zafar, args=(x_zafar_train, y_zafar_train, x_control_train,
                #                                                      x_zafar_test, y_zafar_test, x_control_test,
                #                                                      cons_params, loss_function, EPS,
                #                                                      dataset + suffixes[proc], mutex[proc],
                #                                                      sensitive_attrs)))
            break

        for process in threads:
            process.start()
        for process in threads:
            process.join()
        threads = []

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix, 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()

    plot_my_results(results, suffixes, "Images/" + dataset, dataset)
    delete_temp_files(dataset, suffixes)
# Equal-opportunity evaluation: AdaBoost, AdaFair, and SMOTEBoost (only the
# AdaFair worker is launched below; the full competitor loop is commented out).
def run_eval(dataset, iterations):
    suffixes = ['Adaboost', 'AdaFair', 'SMOTEBoost']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "dutch":
        X, y, sa_index, p_Group, x_control = load_dutch_data()
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "credit":
        X, y, sa_index, p_Group, x_control = load_credit()
    elif dataset == "diabetes":
        X, y, sa_index, p_Group, x_control = load_diabetes()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    create_temp_files(dataset, suffixes)

    threads = []
    mutex = []
    for lock in range(0, 8):
        mutex.append(Lock())

    print(dataset)
    random.seed(int(time.time()))

    for iter in range(0, iterations):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=iter)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # for proc in range(0, 3):
            #     threads.append(Process(target=train_classifier, args=(
            #         X_train, X_test, y_train, y_test, sa_index, p_Group,
            #         dataset + suffixes[proc], mutex[proc], proc, 500, 1, dataset)))
            threads.append(Process(target=train_classifier,
                                   args=(X_train, X_test, y_train, y_test,
                                         sa_index, p_Group, dataset + suffixes[1],
                                         mutex[1], 1, 500, 1, dataset)))
            break

    for process in threads:
        process.start()
    for process in threads:
        process.join()

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix, 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()

    plot_my_results(results, suffixes, "Images/EqualOpportunity/" + dataset, dataset)
    delete_temp_files(dataset, suffixes)
# Impact of the confidence parameter c: sweeps a grid of c values for AdaFair
# and compares against the no-confidence variant.
def run_eval(dataset, iterations):
    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "dutch":
        X, y, sa_index, p_Group, x_control = load_dutch_data()
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    suffixes = ['AdaFair NoConf.', 'AdaFair']
    random.seed(int(time.time()))
    base_learners = 500
    steps = numpy.arange(0, 1.001, step=0.2)

    create_temp_files(dataset, suffixes, steps)

    threads = []
    mutex = []
    for lock in range(0, 2):
        mutex.append(Lock())

    for iteration in range(0, iterations):
        start = time.time()
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=iteration)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            for c in steps:
                threads.append(Process(target=train_classifier,
                                       args=(X_train, X_test, y_train, y_test,
                                             sa_index, p_Group, dataset + suffixes[1],
                                             mutex[1], base_learners, c)))
            break

        for process in threads:
            process.start()
        for process in threads:
            process.join()
        threads = []

        print("elapsed time = " + str(time.time() - start))

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix + "_dm", 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()

    plot_results_of_c_impact(results[0], results[1], steps, "Images/Impact_c/", dataset)
    delete_temp_files(dataset, suffixes)
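# For reference, the c grid swept above: numpy.arange excludes its stop value,
# which is why the stop is written 1.001 rather than 1.0.
#
#   >>> numpy.arange(0, 1.001, step=0.2)
#   array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])
#
# An equivalent grid without the floating-point endpoint trick (a suggestion,
# not what the script uses):
#
#   steps = numpy.linspace(0.0, 1.0, num=6)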
# Margin CDFs: compares the cumulative margin distributions of AdaBoost,
# AdaFair NoConf, and AdaFair on the positive and negative class.
def run_eval(dataset):
    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    base_learners = 200
    adaboost, adaboost_weights, init_weights = train_classifier(X, y, sa_index, p_Group, 0, base_learners)
    csb1, csb1_weights, temp = train_classifier(X, y, sa_index, p_Group, 1, base_learners)
    csb2, csb2_weights, temp = train_classifier(X, y, sa_index, p_Group, 2, base_learners)

    # Signed margins: positive where the prediction agrees with the label.
    adaboost *= y
    csb1 *= y
    csb2 *= y

    csb1_positives = csb1[y == 1]
    csb1_negatives = csb1[y == -1]
    csb2_positives = csb2[y == 1]
    csb2_negatives = csb2[y == -1]
    adaboost_positives = adaboost[y == 1]
    adaboost_negatives = adaboost[y == -1]

    num_bins = 50
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
    # fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(14, 3))
    plt.rcParams.update({'font.size': 11})

    ax1.set_title("Positive CDF")
    ax1.grid(True)

    # density=True replaces the `normed` argument removed from recent NumPy.
    counts_ada_positives, bin_edges_ada_positives = numpy.histogram(adaboost_positives, bins=num_bins, density=True)
    cdf_ada_positives = numpy.cumsum(counts_ada_positives)
    ax1.plot(bin_edges_ada_positives[1:], cdf_ada_positives / cdf_ada_positives[-1], c='blue', label='AdaBoost')

    counts_csb1_positives, bin_edges_csb1_positives = numpy.histogram(csb1_positives, bins=num_bins, density=True)
    cdf_csb1_positives = numpy.cumsum(counts_csb1_positives)
    ax1.plot(bin_edges_csb1_positives[1:], cdf_csb1_positives / cdf_csb1_positives[-1], c='green', linestyle='-.', label='AdaFair NoConf')

    counts_csb2_positives, bin_edges_csb2_positives = numpy.histogram(csb2_positives, bins=num_bins, density=True)
    cdf_csb2_positives = numpy.cumsum(counts_csb2_positives)
    ax1.plot(bin_edges_csb2_positives[1:], cdf_csb2_positives / cdf_csb2_positives[-1], c='red', linestyle='--', label='AdaFair')

    ax1.legend(loc='best')
    ax1.set_xlabel("Margin")
    ax1.set_ylabel("Cumulative Distribution")
    ax1.axhline(0, color='black')
    ax1.axvline(0, color='black')

    ax2.grid(True)
    ax2.axhline(0, color='black')
    ax2.axvline(0, color='black')
    ax2.set_title("Negative CDF")

    counts_ada_negatives, bin_edges_ada_negatives = numpy.histogram(adaboost_negatives, bins=num_bins, density=True)
    cdf_ada_negatives = numpy.cumsum(counts_ada_negatives)
    ax2.plot(bin_edges_ada_negatives[1:], cdf_ada_negatives / cdf_ada_negatives[-1], c='blue', label='AdaBoost')
    ax2.set_ylabel("Cumulative Distribution")
    ax2.set_xlabel("Margin")

    counts_csb1_negatives, bin_edges_csb1_negatives = numpy.histogram(csb1_negatives, bins=num_bins, density=True)
    cdf_csb1_negatives = numpy.cumsum(counts_csb1_negatives)
    ax2.plot(bin_edges_csb1_negatives[1:], cdf_csb1_negatives / cdf_csb1_negatives[-1], c='green', linestyle='-.', label='AdaFair NoConf')

    counts_csb2_negatives, bin_edges_csb2_negatives = numpy.histogram(csb2_negatives, bins=num_bins, density=True)
    cdf_csb2_negatives = numpy.cumsum(counts_csb2_negatives)
    ax2.plot(bin_edges_csb2_negatives[1:], cdf_csb2_negatives / cdf_csb2_negatives[-1], c='red', linestyle='--', label='AdaFair')

    ax2.legend(loc='best')

    # index = numpy.arange(4)
    # bar_width = 0.2
    #
    # adaboost_weights = adaboost_weights.split(",")
    # init_weights = init_weights.split(",")
    # csb1_weights = csb1_weights.split(",")
    # csb2_weights = csb2_weights.split(",")
    #
    # ax3.set_title("Weights per group")
    # ax3.set_ylabel("(%)")
    #
    # prot_pos = [float(init_weights[4]), float(adaboost_weights[4]), float(csb1_weights[4]), float(csb2_weights[4])]
    # non_prot_pos = [float(init_weights[5]), float(adaboost_weights[5]), float(csb1_weights[5]), float(csb2_weights[5])]
    # prot_neg = [float(init_weights[6]), float(adaboost_weights[6]), float(csb1_weights[6]), float(csb2_weights[6])]
    # non_prot_neg = [float(init_weights[7]), float(adaboost_weights[7]), float(csb1_weights[7]), float(csb2_weights[7])]
    #
    # ax3.bar(index, prot_pos, label='Prot. Pos.', edgecolor='black', width=bar_width)
    # ax3.bar(index, non_prot_pos, label='Non-Prot. Pos.', bottom=prot_pos, edgecolor='red', width=bar_width)
    # ax3.bar(index, prot_neg, label='Prot. Neg.', bottom=[i + j for i, j in zip(prot_pos, non_prot_pos)], edgecolor='green', width=bar_width)
    # ax3.bar(index, non_prot_neg, label='Non-Prot. Neg.', bottom=[i + j + z for i, j, z in zip(prot_pos, non_prot_pos, prot_neg)], edgecolor='blue', width=bar_width)
    #
    # ax3.set_xticks([0, 1, 2, 3])
    # ax3.grid(True)
    # ax3.set_xticklabels(['Initial Weights', 'AdaBoost', 'AdaFair NoConf.', 'AdaFair'])
    # ax3.legend(loc='best', fancybox=True, framealpha=0.1)
    # plt.yticks(numpy.arange(0, 1.0001, step=0.1))
    # ax3.set_ylim([0.48, 0.52])
    # plt.rcParams.update({'font.size': 9})

    fig.tight_layout()
    # Save before show(): saving afterwards can write an empty figure with
    # interactive backends.
    plt.savefig("Images/cdf_" + dataset + ".png")
    plt.show()
# Same margin-CDF analysis as above, with the figure saved under the "_sp"
# suffix and without the (disabled) per-group weight chart.
def run_eval(dataset):
    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    base_learners = 200
    adaboost, adaboost_weights, init_weights = train_classifier(X, y, sa_index, p_Group, 0, base_learners)
    csb1, csb1_weights, temp = train_classifier(X, y, sa_index, p_Group, 1, base_learners)
    csb2, csb2_weights, temp = train_classifier(X, y, sa_index, p_Group, 2, base_learners)

    # Signed margins: positive where the prediction agrees with the label.
    adaboost *= y
    csb1 *= y
    csb2 *= y

    csb1_positives = csb1[y == 1]
    csb1_negatives = csb1[y == -1]
    csb2_positives = csb2[y == 1]
    csb2_negatives = csb2[y == -1]
    adaboost_positives = adaboost[y == 1]
    adaboost_negatives = adaboost[y == -1]

    num_bins = 50
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
    plt.rcParams.update({'font.size': 11})

    ax1.set_title("Positive CDF")
    ax1.grid(True)

    # density=True replaces the `normed` argument removed from recent NumPy.
    counts_ada_positives, bin_edges_ada_positives = numpy.histogram(adaboost_positives, bins=num_bins, density=True)
    cdf_ada_positives = numpy.cumsum(counts_ada_positives)
    ax1.plot(bin_edges_ada_positives[1:], cdf_ada_positives / cdf_ada_positives[-1], c='blue', label='AdaBoost')

    counts_csb1_positives, bin_edges_csb1_positives = numpy.histogram(csb1_positives, bins=num_bins, density=True)
    cdf_csb1_positives = numpy.cumsum(counts_csb1_positives)
    ax1.plot(bin_edges_csb1_positives[1:], cdf_csb1_positives / cdf_csb1_positives[-1], c='green', linestyle='-.', label='AdaFair NoConf')

    counts_csb2_positives, bin_edges_csb2_positives = numpy.histogram(csb2_positives, bins=num_bins, density=True)
    cdf_csb2_positives = numpy.cumsum(counts_csb2_positives)
    ax1.plot(bin_edges_csb2_positives[1:], cdf_csb2_positives / cdf_csb2_positives[-1], c='red', linestyle='--', label='AdaFair')

    ax1.legend(loc='best')
    ax1.set_xlabel("Margin")
    ax1.set_ylabel("Cumulative Distribution")
    ax1.axhline(0, color='black')
    ax1.axvline(0, color='black')

    ax2.grid(True)
    ax2.axhline(0, color='black')
    ax2.axvline(0, color='black')
    ax2.set_title("Negative CDF")

    counts_ada_negatives, bin_edges_ada_negatives = numpy.histogram(adaboost_negatives, bins=num_bins, density=True)
    cdf_ada_negatives = numpy.cumsum(counts_ada_negatives)
    ax2.plot(bin_edges_ada_negatives[1:], cdf_ada_negatives / cdf_ada_negatives[-1], c='blue', label='AdaBoost')
    ax2.set_ylabel("Cumulative Distribution")
    ax2.set_xlabel("Margin")

    counts_csb1_negatives, bin_edges_csb1_negatives = numpy.histogram(csb1_negatives, bins=num_bins, density=True)
    cdf_csb1_negatives = numpy.cumsum(counts_csb1_negatives)
    ax2.plot(bin_edges_csb1_negatives[1:], cdf_csb1_negatives / cdf_csb1_negatives[-1], c='green', linestyle='-.', label='AdaFair NoConf')

    counts_csb2_negatives, bin_edges_csb2_negatives = numpy.histogram(csb2_negatives, bins=num_bins, density=True)
    cdf_csb2_negatives = numpy.cumsum(counts_csb2_negatives)
    ax2.plot(bin_edges_csb2_negatives[1:], cdf_csb2_negatives / cdf_csb2_negatives[-1], c='red', linestyle='--', label='AdaFair')

    ax2.legend(loc='best')

    fig.tight_layout()
    plt.savefig("Images/cdf_" + dataset + "_sp.png")
    plt.show()
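# The histogram/cumsum pattern above is repeated six times per figure. A
# factored sketch (the helper name plot_margin_cdf is hypothetical; it assumes
# the signed margin arrays computed above):
def plot_margin_cdf(ax, margins, num_bins=50, **plot_kwargs):
    # Normalized cumulative distribution of the margins.
    counts, bin_edges = numpy.histogram(margins, bins=num_bins, density=True)
    cdf = numpy.cumsum(counts)
    ax.plot(bin_edges[1:], cdf / cdf[-1], **plot_kwargs)

# e.g.: plot_margin_cdf(ax1, adaboost_positives, c='blue', label='AdaBoost')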