def run_nfr_cv(n_dim, batch_size, n_iter, C, alpha, emd_method=emd_samples):
    """Train a single NFR (adversarial fair representation) model on the
    global data and return the output of train_rep."""
    global X, P, y, df, X_test

    # Convert the global data to tensors once.
    X = torch.tensor(X).float()
    P = torch.tensor(P).long()

    # Train-test split; note this overwrites the global X_test.
    data_train, data_test = split_data_np(
        (X.data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test

    # NFR.
    model_nfr = FairRep(len(X[0]), n_dim)
    return train_rep(model_nfr, 0.01, X, P, n_iter, c_iter=10,
                     batch_size=batch_size, alpha=alpha, C_reg=C)
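# A minimal usage sketch for run_nfr_cv. The hyperparameter values below are
# illustrative assumptions, not settings from the original experiments, and
# the globals X, P, y, df are assumed to already hold the loaded dataset:
#
#   history = run_nfr_cv(n_dim=10, batch_size=128, n_iter=1000, C=0.1,
#                        alpha=1.0)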
def test_in_one(n_dim, batch_size, n_iter, C, alpha, compute_emd=True,
                k_nbrs=3, emd_method=emd_samples):
    """Train AE, AE-P, and NFR representations, then evaluate logistic
    regression on the original features and on each representation."""
    global X, P, y, df, X_test
    reps = {}
    y_hats = {}

    # Convert the global data to tensors once.
    X = torch.tensor(X).float()
    P = torch.tensor(P).long()

    # Train-test split; note this overwrites the global X_test.
    data_train, data_test = split_data_np(
        (X.data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    X_train_no_p = X_train[:, :-1]
    X_test_no_p = X_test[:, :-1]
    X_u = X[P == 1]
    X_n = X[P == 0]

    # AE: plain autoencoder on all features.
    model_ae = FairRep(len(X[0]), n_dim)
    train_rep(model_ae, 0.01, X, P, n_iter, 10, batch_size, alpha=0,
              C_reg=0, compute_emd=compute_emd, adv=False, verbose=True)

    # AE_P: autoencoder on the features without the protected attribute
    # (assumed to be the last column, as in the evaluation code below).
    model_ae_P = FairRep(len(X[0]) - 1, n_dim - 1)
    train_rep(model_ae_P, 0.01, X[:, :-1], P, n_iter, 10, batch_size,
              alpha=0, C_reg=0, compute_emd=compute_emd, adv=False,
              verbose=True)

    # NFR: adversarially trained fair representation.
    model_nfr = FairRep(len(X[0]), n_dim)
    train_rep(model_nfr, 0.01, X, P, n_iter, 10, batch_size, alpha=alpha,
              C_reg=0, compute_emd=compute_emd, adv=True, verbose=True)

    results = {}
    print('begin testing.')
    X_ori_np = X.data.cpu().numpy()

    # Original: logistic regression on the raw features.
    model_name = 'compas_Original'
    print('logistic regression on the original...')
    lin_model, y_test_scores, performance = get_model_preds(
        X_train, y_train, P_train, X_test, y_test, P_test, model_name)
    y_hats[model_name] = get_preds_on_full_dataset(X, lin_model)
    reps[model_name] = None
    print(X_train.shape, X_test.shape)
    save_decision_boundary_plot(np.concatenate((X_train, X_test)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)),
                                model_name)
    performance.append(emd_method(X_n, X_u))
    performance.append(
        get_consistency(X.data.cpu().numpy(), lin_model, n_neighbors=k_nbrs))
    performance.append(stat_diff(X.data.cpu().numpy(), P, lin_model))
    performance.append(equal_odds(X.data.cpu().numpy(), y, P, lin_model))
    # make_cal_plot(X.data.cpu().numpy(), y, P, lin_model, model_name)
    results[model_name] = performance
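    # Note on layout: each `performance` list in this function is positional,
    # starting with the classification metrics (from evaluate_performance_sim,
    # or whatever get_model_preds returns), followed by the appended fairness
    # metrics in order: EMD between the protected groups, consistency,
    # statistical difference, and equal odds.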
    # Original-P: logistic regression on the raw features minus the
    # protected attribute.
    model_name = 'compas_Original-P'
    print('logistic regression on the original-P')
    lin_model, y_test_scores, performance = get_model_preds(
        X_train_no_p, y_train, P_train, X_test_no_p, y_test, P_test,
        model_name)
    y_hats[model_name] = get_preds_on_full_dataset(X[:, :-1], lin_model)
    reps[model_name] = None
    save_decision_boundary_plot(np.concatenate((X_train_no_p, X_test_no_p)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)),
                                model_name)
    performance.append(emd_method(X_n[:, :-1], X_u[:, :-1]))
    print('calculating consistency...')
    performance.append(
        get_consistency(X[:, :-1].data.cpu().numpy(), lin_model,
                        n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X[:, :-1].data.cpu().numpy(), P, lin_model))
    performance.append(
        equal_odds(X[:, :-1].data.cpu().numpy(), y, P, lin_model))
    # make_cal_plot(X[:, :-1].data.cpu().numpy(), y, P, lin_model, model_name)
    results[model_name] = performance

    # AE: logistic regression on the autoencoder representation.
    model_name = 'compas_AE'
    U_0 = model_ae.encoder(X[P == 0]).data
    U_1 = model_ae.encoder(X[P == 1]).data
    U = model_ae.encoder(X).data
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on AE...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    save_decision_boundary_plot(np.concatenate((X_train, X_test)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)),
                                model_name)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    y_hats[model_name] = get_preds_on_full_dataset(U, lin_model)
    reps[model_name] = U

    def calc_perf(y_test, y_test_scores, P_test, U_0, U_1, U_np, lin_model,
                  X_test, model_name):
        """Evaluate a fitted linear model on a learned representation:
        classification metrics plus EMD, consistency, statistical
        difference, and equal odds."""
        print('logistic regression evaluation...')
        performance = list(
            evaluate_performance_sim(y_test, y_test_scores, P_test))
        print('calculating emd...')
        performance.append(emd_method(U_0, U_1))
        print('calculating consistency...')
        performance.append(
            get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                            based_on=X_ori_np))
        print('calculating stat diff...')
        performance.append(stat_diff(X_test, P_test, lin_model))
        print('calculating equal odds...')
        performance.append(equal_odds(X_test, y_test, P_test, lin_model))
        # make_cal_plot(X_test, y_test, P_test, lin_model, model_name)
        return performance

    performance = calc_perf(y_test, y_test_scores, P_test, U_0, U_1, U_np,
                            lin_model, X_test, model_name)
    results[model_name] = performance

    # AE_P: logistic regression on the autoencoder representation learned
    # without the protected attribute.
    model_name = 'compas_AE_P'
    U_0 = model_ae_P.encoder(X[:, :-1][P == 0]).data
    U_1 = model_ae_P.encoder(X[:, :-1][P == 1]).data
    U = model_ae_P.encoder(X[:, :-1]).data
    print('ae-p emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on AE-P...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    save_decision_boundary_plot(np.concatenate((X_train, X_test)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)),
                                model_name)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    y_hats[model_name] = get_preds_on_full_dataset(U, lin_model)
    reps[model_name] = U
    performance = calc_perf(y_test, y_test_scores, P_test, U_0, U_1, U_np,
                            lin_model, X_test, model_name)
    results[model_name] = performance
    # NFR: logistic regression on the adversarially trained representation.
    model_name = 'compas_NFR'
    U_0 = model_nfr.encoder(X[P == 0]).data
    U_1 = model_nfr.encoder(X[P == 1]).data
    U = model_nfr.encoder(X).data
    print('nfr emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on NFR...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    save_decision_boundary_plot(np.concatenate((X_train, X_test)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)),
                                model_name)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    y_hats[model_name] = get_preds_on_full_dataset(U, lin_model)
    reps[model_name] = U
    performance = calc_perf(y_test, y_test_scores, P_test, U_0, U_1, U_np,
                            lin_model, X_test, model_name)
    results[model_name] = performance
    return results, y_hats, reps
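# An illustrative call of test_in_one, assuming the globals X, P, y, df
# already hold the COMPAS data; the hyperparameter values are placeholder
# assumptions, not the settings used in the original experiments:
#
#   results, y_hats, reps = test_in_one(n_dim=10, batch_size=128,
#                                       n_iter=1000, C=0.1, alpha=1.0)
#   for name, perf in results.items():
#       print(name, perf)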
def test_in_one_simple(n_dim, batch_size, n_iter, C, alpha, compute_emd=True,
                       k_nbrs=3, emd_method=emd_samples):
    """Lighter variant of test_in_one: same training loop, but no plots, no
    stored predictions or representations, and no equal-odds metric."""
    global X, P, y

    # Convert the global data to tensors once.
    X = torch.tensor(X).float()
    P = torch.tensor(P).long()
    X_u = X[P == 1]
    X_n = X[P == 0]

    # AE.
    model_ae = FairRep(len(X[0]), n_dim)
    # model_ae.cuda()
    train_rep(model_ae, 0.01, X, P, n_iter, 10, batch_size, alpha=0,
              C_reg=0, compute_emd=compute_emd, adv=False, verbose=True)

    # AE_P.
    model_ae_P = FairRep(len(X[0]) - 1, n_dim - 1)
    # model_ae_P.cuda()
    train_rep(model_ae_P, 0.01, X[:, :-1], P, n_iter, 10, batch_size,
              alpha=0, C_reg=0, compute_emd=compute_emd, adv=False,
              verbose=True)

    # NFR.
    model_nfr = FairRep(len(X[0]), n_dim)
    # model_nfr.cuda()
    train_rep(model_nfr, 0.01, X, P, n_iter, 10, batch_size, alpha=alpha,
              C_reg=0, compute_emd=compute_emd, adv=True, verbose=True)

    results = {}
    print('begin testing.')
    X_ori_np = X.data.cpu().numpy()

    # Original.
    data_train, data_test = split_data_np(
        (X.data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on the original...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(X_n, X_u))
    print('calculating consistency...')
    performance.append(
        get_consistency(X.data.cpu().numpy(), lin_model, n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X.data.cpu().numpy(), P, lin_model))
    results['Original'] = performance
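    # Note: for a fitted binary scikit-learn LogisticRegression, the manual
    # scoring above is equivalent to lin_model.predict_proba(X_test)[:, 1];
    # the explicit sigmoid just makes the linear score visible.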
    # Original-P.
    data_train, data_test = split_data_np(
        (X[:, :-1].data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on the original-P')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(X_n[:, :-1], X_u[:, :-1]))
    print('calculating consistency...')
    performance.append(
        get_consistency(X[:, :-1].data.cpu().numpy(), lin_model,
                        n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X[:, :-1].data.cpu().numpy(), P, lin_model))
    results['Original-P'] = performance

    # AE.
    U_0 = model_ae.encoder(X[P == 0]).data
    U_1 = model_ae.encoder(X[P == 1]).data
    U = model_ae.encoder(X).data
    print('ae emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on AE...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['AE'] = performance

    # AE_P.
    U_0 = model_ae_P.encoder(X[:, :-1][P == 0]).data
    U_1 = model_ae_P.encoder(X[:, :-1][P == 1]).data
    U = model_ae_P.encoder(X[:, :-1]).data
    print('ae-p emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on AE-P...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['AE_P'] = performance

    # NFR.
    U_0 = model_nfr.encoder(X[P == 0]).data
    U_1 = model_nfr.encoder(X[P == 1]).data
    U = model_nfr.encoder(X).data
    print('nfr emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on NFR...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['NFR'] = performance

    return results
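# A small hypothetical helper, not part of the original pipeline: the
# performance lists returned above are positional, so a printer makes
# side-by-side comparison of methods easier. Metric names are caller-supplied
# because evaluate_performance_sim's exact outputs are not shown here.
def print_results(results, metric_names=None):
    """Print each method's performance list, one metric per line."""
    for method, perf in results.items():
        names = metric_names or ['metric_%d' % i for i in range(len(perf))]
        print(method)
        for name, value in zip(names, perf):
            print('  %s: %s' % (name, value))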