def reduce_and_predict(df1, df2, epsilon):
    """Train a fairness-constrained classifier on df1 and predict for df2.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training rows; must contain label column 'Y' and protected
        attribute column 'A'.
    df2 : pandas.DataFrame
        Test rows with the same columns as df1.
    epsilon : float
        Fairness slack forwarded to ``red.expgrad`` as ``eps``.

    Returns
    -------
    numpy.ndarray
        float64 predictions (scores) for df2, in row order.
    """
    # Concatenate then split back by position so both frames share any
    # cleaning applied to the combined frame.
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)

    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    dataX = train_data.drop(columns=['Y', 'A'])
    dataA = train_data['A']
    dataY = train_data['Y']
    X_test = test_data.drop(columns=['Y', 'A'])

    learner = LogisticRegression()
    results_tuple = red.expgrad(dataX, dataA, dataY, learner, eps=epsilon)
    best_class = results_tuple.best_classifier

    Y_hat = best_class(X_test)
    Y_hat = Y_hat.to_frame()
    # .as_matrix() was removed in pandas 1.0; .to_numpy() is the
    # supported replacement with identical semantics here.
    Y_hat = np.float64(Y_hat.to_numpy().ravel())
    return Y_hat
def run_eps_single(eps, x, a, y):
    """Run the expgrad reduction once at fairness level *eps* and return
    binarized (0/1) predictions on the training features.

    The sensitive attributes are binarized and zeroed on positive
    examples before being handed to the marginal-EO constraint.
    """
    # Baseline: fit the regression oracle directly with uniform weights
    # so its unconstrained error is available for comparison.
    baseline = RegressionLearner()
    baseline.fit(x, y, np.ones(len(y)))
    baseline_pred = baseline.predict(x)
    baseline_err = sum(np.abs(y - baseline_pred)) / len(y)

    # Constrained run: mask the protected attributes on positives.
    oracle = RegressionLearner()
    masked_attrs = a.copy(deep=True)
    binarize_sens_attr(masked_attrs)
    masked_attrs[y == 1] = 0
    sens_attr = list(a.columns)

    res_tuple = red.expgrad(x, masked_attrs, y, oracle,
                            cons=marginal_EO(sens_attr),
                            eps=eps, debug=False)

    # Threshold the weighted randomized predictions at 0.5.
    return 1 * (weighted_predictions(res_tuple, x) > 0.5)
def fit(self, dataset):
    """Fit a demographic-parity-constrained classifier on *dataset*.

    Stores the score-valued best classifier on ``self.bc`` and a
    0.5-thresholded binary version on ``self.bc_binary``.
    """
    # Sanity checks: exactly one privileged group, defined over the same
    # attribute as the unprivileged group.
    assert len(self.privileged_group) == 1
    assert self.privileged_group[0].keys() == self.unprivileged_group[0].keys()

    class_attr = list(self.privileged_group[0].keys())[0]
    class_ind = dataset.feature_names.index(class_attr)

    reg = LogisticRegression(solver='liblinear', max_iter=1000000000)
    X = pd.DataFrame(dataset.features)
    A = pd.Series(dataset.features[:, class_ind])
    Y = pd.Series(dataset.labels.ravel())

    bc = expgrad(X, A, Y, reg, nu=1, cons=DP()).best_classifier

    def bc_binary(x):
        # Threshold the randomized classifier's scores at 0.5.
        return list(bc(x) > 0.5)

    self.bc = bc
    self.bc_binary = bc_binary
def reduce_and_predict(df1, df2, epsilon):
    """Fit a fairness-constrained classifier on the COMPAS rows df1 and
    predict binary recidivism labels for df2.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Raw training rows (COMPAS columns, pre-dummy-encoding).
    df2 : pandas.DataFrame
        Raw test rows with the same columns as df1.
    epsilon : float
        Fairness slack forwarded to ``red.expgrad`` as ``eps``.

    Returns
    -------
    numpy.ndarray
        int64 binary predictions for df2, in row order.
    """
    # Concatenate so dummy encoding sees every category once, then split
    # back into train/test by position.
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)

    df = pandas.get_dummies(df, prefix=['sex', 'race', 'c_charge_degree'],
                            drop_first=True)
    df = df.rename(
        columns={
            'race_Non-White': 'race',
            'sex_Male': 'sex',
            'c_charge_degree_M': 'charge_degree'
        })
    df = df.astype('int64')

    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    dataX = train_data.drop(columns=['two_year_recid', 'race'])
    dataA = train_data['race']
    dataY = train_data['two_year_recid']
    X_test = test_data.drop(columns=['two_year_recid', 'race'])

    learner = LogisticRegression()
    results_tuple = red.expgrad(dataX, dataA, dataY, learner, eps=epsilon)
    best_class = results_tuple.best_classifier

    # Threshold the randomized classifier's scores at 0.5.
    Y_hat = best_class(X_test) >= 0.5
    Y_hat = Y_hat.to_frame()
    # .as_matrix() was removed in pandas 1.0; .to_numpy() is the
    # supported replacement with identical semantics here.
    Y_hat = np.int64(Y_hat.to_numpy().ravel())
    return Y_hat
def run_eps_list_FP(eps_list, x, a, y):
    """Sweep expgrad over *eps_list* under a marginal-EO constraint that
    only binds on negative examples, reporting error vs. empirical FP
    disparity.

    Parameters
    ----------
    eps_list : iterable of float
        Fairness slack values to try.
    x, a, y : pandas data
        Features, sensitive attributes (DataFrame), and labels.

    Returns
    -------
    pandas.DataFrame
        Columns 'err', 'input eps', 'empirical eps', one row per eps.
    """
    learner = RegressionLearner()
    a_prime = a.copy(deep=True)
    binarize_sens_attr(a_prime)
    # hack: setting protected attrs to be 0 for negative examples, so the
    # marginal-EO constraint reduces to a false-positive constraint.
    a_prime[y == 1] = 0
    sens_attr = list(a_prime.columns)

    # (removed unused `gamma_values` dict — it was never written or read)
    err_values = {}
    eps_values = {}
    for eps in eps_list:
        res_tuple = red.expgrad(x, a_prime, y, learner,
                                cons=marginal_EO(sens_attr), eps=eps)
        weighted_pred = weighted_predictions(res_tuple, x)
        # Weighted L1 error of the randomized classifier.
        err_values[eps] = sum(np.abs(y - weighted_pred)) / len(y)
        # Empirical false-positive disparity at this eps.
        eps_values[eps] = compute_FP(a_prime, y, weighted_pred)
        print(eps_values[eps])

    d = {'err': list(err_values.values()),
         'input eps': eps_list,
         'empirical eps': list(eps_values.values())}
    return pd.DataFrame(data=d)
def reduce_and_predict(df1, df2, epsilon):
    """Fit a fairness-constrained classifier on the UCI Adult rows df1 and
    predict binary income labels for df2.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Raw training rows (Adult columns, pre-dummy-encoding).
    df2 : pandas.DataFrame
        Raw test rows with the same columns as df1.
    epsilon : float
        Fairness slack forwarded to ``red.expgrad`` as ``eps``.

    Returns
    -------
    numpy.ndarray
        int64 binary predictions for df2, in row order.
    """
    # Concatenate so dummy encoding sees every category once, then split
    # back into train/test by position.
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)

    df = pandas.get_dummies(df,
                            prefix=['income', 'sex', 'native_country',
                                    'marital_status', 'workclass',
                                    'occupation'],
                            drop_first=True)
    df = df.rename(columns={'income_>50K': 'income',
                            'sex_Female': 'sex',
                            'native_country_United-States': 'native_country',
                            'marital_status_Not-Married': 'marital_status'})
    df = df.astype('int64')

    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    dataX = train_data.drop(columns=['income', 'sex'])
    dataA = train_data['sex']
    dataY = train_data['income']
    X_test = test_data.drop(columns=['income', 'sex'])

    learner = LogisticRegression()
    results_tuple = red.expgrad(dataX, dataA, dataY, learner, eps=epsilon)
    best_class = results_tuple.best_classifier

    # Threshold the randomized classifier's scores at 0.5.
    Y_hat = best_class(X_test) >= 0.5
    Y_hat = Y_hat.to_frame()
    # .as_matrix() was removed in pandas 1.0; .to_numpy() is the
    # supported replacement with identical semantics here.
    Y_hat = np.int64(Y_hat.to_numpy().ravel())
    return Y_hat
# Audit the current predictions for disparate treatment.
score, _ = ad.audit_tree(test, feature_audit, 'predict', protected)
print(score)

# Reduction method: retrain under an equalized-odds constraint via expgrad.
epsilon = 0.01
constraint = moments.EO()
trainX = train[feature_list]
trainY = train[outcome]
trainA = train['attr']
# (removed unused `logreg = LogisticRegression()` — the reduction below
# uses the decision tree only)
dct = DecisionTreeClassifier()
res_tuple = red.expgrad(trainX, trainA, trainY, dct,
                        cons=constraint, eps=epsilon)
res = res_tuple._asdict()
best_classifier = res["best_classifier"]
test['predict'] = np.array(best_classifier(np.array(test[feature_list])))
# Binarize the scores. Using >= 0.5 for the positive class so that scores
# exactly equal to 0.5 are mapped to 1 (the previous strict < / > pair
# left 0.5 unbinarized).
test.loc[test.predict < 0.5, 'predict'] = 0
test.loc[test.predict >= 0.5, 'predict'] = 1

# Auditing the constrained learner on the same feature set.
feature_audit = ['ugpa', 'cluster', 'lsat', 'fam_inc']
score, _ = ad.audit_tree(test, feature_audit, 'predict', protected)
print(score)
# Tiny hand-written dataset: 20 examples in three sensitive-attribute
# groups (A: 7, B: 7, C: 6); one character per example in each string.
attrs = [str(x) for x in 'AAAAAAA' 'BBBBBBB' 'CCCCCC']
labls = [int(x) for x in '0110100' '0010111' '001111']
feat1 = [int(x) for x in '0110101' '0111101' '001011']
feat2 = [int(x) for x in '0000100' '0000011' '111111']
feat3 = [int(x) for x in '1111111' '1111111' '111111']
dataX = pd.DataFrame({"feat1": feat1, "feat2": feat2, "feat3": feat3})
dataY = pd.Series(labls)
dataA = pd.Series(attrs)
learner = LeastSquaresLearner()
# Run the reduction once per configured test case and record the error
# and constraint violation of the returned randomized classifier Q.
for test in tests:
    res_tuple = red.expgrad(dataX, dataA, dataY, learner,
                            cons=test["cons_class"](), eps=test["eps"])
    res = res_tuple._asdict()
    Q = res["best_classifier"]
    res["n_classifiers"] = len(res["classifiers"])
    # Fairness disparity of Q measured under the same constraint class.
    disp = test["cons_class"]()
    disp.init(dataX, dataA, dataY)
    # Misclassification error of Q on the training data.
    error = moments.MisclassError()
    error.init(dataX, dataA, dataY)
    res["disp"] = disp.gamma(Q).max()
    res["error"] = error.gamma(Q)[0]
    # NOTE: statement continues on the next source line (backslash
    # continuation preserved as-is).
    report_header = "testing (%s, eps=%.3f)" \
#gmm.fit(trainXA) #train['cluster2'] = gmm.predict(trainXA) mbk = MiniBatchKMeans(init='k-means++', n_clusters=10, batch_size=batch_size, n_init=10, max_no_improvement=20, verbose=0, random_state=1, reassignment_ratio=0.1) mbk.fit(trainXA[:, :-2]) mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0) train['cluster2'] = pairwise_distances_argmin(trainXA[:, :-2], mbk_means_cluster_centers) # bias detecting from fairlearn learner = LogisticRegression() res_tuple = red.expgrad(trainX, trainA, trainY, learner, cons=cons, eps=epsilon) res = res_tuple._asdict() best_classifier = res["best_classifier"] predicted = best_classifier(trainX) train['predicted'] = predicted bias = train.groupby('cluster').predicted.mean() bias = bias * ( 1 - bias) print(bias) bias = train.groupby('cluster')[outcome].mean() bias = bias * ( 1 - bias) print(bias) bias = train.groupby('cluster2').predicted.mean()