Example No. 1
def reduce_and_predict(df1, df2, epsilon):
    # concatenate the data and clean it
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)

    label_names = ['Y']
    protected_attribute_names = ['A']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    dataX = train_data.drop(columns=['Y', 'A'])
    dataA = train_data['A']
    dataY = train_data['Y']

    X_test = test_data.drop(columns=['Y', 'A'])
    learner = LogisticRegression()

    #moments = reload(moments)
    #red = reload(red)
    results_tuple = red.expgrad(dataX, dataA, dataY, learner, eps=epsilon)
    best_class = results_tuple.best_classifier

    Y_hat = best_class(X_test)
    Y_hat = Y_hat.to_frame()
    # flatten the prediction frame into a 1-D array of floats
    Y_hat = np.float64(Y_hat.to_numpy().ravel())

    return Y_hat
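For context, here is a minimal driver sketch for reduce_and_predict, assuming the snippet's module-level imports are in place (pandas, numpy as np, scikit-learn's LogisticRegression, and the legacy fairlearn reduction module imported as red); the column names 'Y' and 'A' follow the function body, while the synthetic data and epsilon value are purely illustrative.

# Illustrative driver; the synthetic data, column names and epsilon are assumptions.
import numpy as np
import pandas

rng = np.random.RandomState(0)
n = 200
toy = pandas.DataFrame({
    'x1': rng.normal(size=n),
    'x2': rng.normal(size=n),
    'A': rng.randint(0, 2, size=n),   # binary protected attribute
    'Y': rng.randint(0, 2, size=n),   # binary label
})

train_df = toy.head(150)
test_df = toy.tail(50)

# epsilon bounds the allowed fairness-constraint violation inside expgrad
y_hat = reduce_and_predict(train_df, test_df, epsilon=0.05)
print(y_hat[:10])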
Example No. 2
def run_eps_single(eps, x, a, y):
    learner0 = RegressionLearner()
    # directly running oracle for comparison
    learner0.fit(x, y, np.ones(len(y)))
    pred0 = learner0.predict(x)
    err0 = sum(np.abs(y - pred0)) / len(y)

    learner = RegressionLearner()
    a_prime = a.copy(deep=True)
    binarize_sens_attr(a_prime)
    # zero the protected attribute on positive examples so the constraint only binds on negatives
    a_prime[y == 1] = 0
    sens_attr = list(a.columns)
    res_tuple = red.expgrad(x, a_prime, y, learner,
                            cons=marginal_EO(sens_attr), eps=eps,
                            debug=False)
    predictions = 1 * (weighted_predictions(res_tuple, x) > 0.5)

    return predictions
Example No. 3
    def fit(self, dataset):
        # sanity checks
        assert (len(self.privileged_group) == 1)
        assert ((self.privileged_group[0].keys() ==
                 self.unprivileged_group[0].keys()))
        class_attr = list(self.privileged_group[0].keys())[0]

        reg = LogisticRegression(solver='liblinear', max_iter=1000000000)

        class_ind = dataset.feature_names.index(class_attr)
        X = pd.DataFrame(dataset.features)
        A = pd.Series(dataset.features[:, class_ind])
        Y = pd.Series(dataset.labels.ravel())

        # learn the randomized fair classifier under a demographic parity (DP) constraint
        bc = expgrad(X, A, Y, reg, nu=1, cons=DP()).best_classifier
        # threshold the soft predictions at 0.5 to obtain binary labels
        bc_binary = lambda x: list(bc(x) > 0.5)

        self.bc = bc
        self.bc_binary = bc_binary
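A rough usage sketch for this fit method follows, assuming it lives on a wrapper class (called ExpGradWrapper here purely for illustration) that exposes privileged_group and unprivileged_group attributes, and that the dataset argument is an AIF360-style object with features, feature_names, and labels; the toy dataset class below is an assumption, not part of the original code.

# Hypothetical usage; ToyDataset and ExpGradWrapper are illustrative names only.
import numpy as np
import pandas as pd

class ToyDataset:
    # mimics the attributes fit() reads: features, feature_names, labels
    def __init__(self, features, feature_names, labels):
        self.features = features
        self.feature_names = feature_names
        self.labels = labels

rng = np.random.RandomState(0)
features = rng.randint(0, 2, size=(100, 3)).astype(float)
ds = ToyDataset(features, ['x0', 'sex', 'x2'], rng.randint(0, 2, size=(100, 1)))

model = ExpGradWrapper()                    # hypothetical class holding the fit() above
model.privileged_group = [{'sex': 1}]
model.unprivileged_group = [{'sex': 0}]
model.fit(ds)
preds = model.bc_binary(pd.DataFrame(ds.features))   # thresholded boolean predictions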
Example No. 4
def reduce_and_predict(df1, df2, epsilon):
    # concatenate the data and clean it
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)
    '''
    df = pandas.read_csv("compas.csv")
    ntrain = 5000
    ntest = 2214
    epsilon = 0.01
    '''
    df = pandas.get_dummies(df,
                            prefix=['sex', 'race', 'c_charge_degree'],
                            drop_first=True)
    df = df.rename(
        columns={
            'race_Non-White': 'race',
            'sex_Male': 'sex',
            'c_charge_degree_M': 'charge_degree'
        })
    df = df.astype('int64')
    # set up the BinaryLabelDataset
    label_names = ['two_year_recid']
    protected_attribute_names = ['race']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    dataX = train_data.drop(columns=['two_year_recid', 'race'])
    dataA = train_data['race']
    dataY = train_data['two_year_recid']
    X_test = test_data.drop(columns=['two_year_recid', 'race'])
    learner = LogisticRegression()

    results_tuple = red.expgrad(dataX, dataA, dataY, learner, eps=epsilon)
    best_class = results_tuple.best_classifier

    Y_hat = best_class(X_test) >= 0.5
    Y_hat = Y_hat.to_frame()
    # flatten the boolean prediction frame into a 1-D array of ints
    Y_hat = np.int64(Y_hat.to_numpy().ravel())

    return Y_hat
Example No. 5
def run_eps_list_FP(eps_list, x, a, y):
    learner = RegressionLearner()
    a_prime = a.copy(deep=True)
    binarize_sens_attr(a_prime)
    # hack: zero the protected attribute on positive examples (y == 1) so the
    # constraint only binds on negative examples (i.e., on false positives)
    a_prime[y == 1] = 0
    sens_attr = list(a_prime.columns)

    gamma_values = {}
    err_values = {}
    eps_values = {}
    for eps in eps_list:
        res_tuple = red.expgrad(x, a_prime, y, learner,
                                cons=marginal_EO(sens_attr), eps=eps)
        weighted_pred = weighted_predictions(res_tuple, x)
        err_values[eps] = sum(np.abs(y - weighted_pred)) / len(y)  # mean absolute error of the weighted predictions
        eps_values[eps] = compute_FP(a_prime, y, weighted_pred)
        print(eps_values[eps])
    d = {'err': list(err_values.values()), 'input eps': eps_list,
         'empirical eps': list(eps_values.values())}
    return pd.DataFrame(data=d)
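A minimal sketch of how run_eps_list_FP might be swept over a grid of epsilon values, assuming the helpers it relies on (RegressionLearner, binarize_sens_attr, marginal_EO, weighted_predictions, and the reduction module red) are importable from the same project; the synthetic x, a, y below are purely illustrative.

# Illustrative sweep; the data and epsilon grid are assumptions.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 300
x = pd.DataFrame({'x1': rng.normal(size=n), 'x2': rng.normal(size=n)})
a = pd.DataFrame({'group': rng.randint(0, 2, size=n)})   # protected attribute as a DataFrame
y = pd.Series(rng.randint(0, 2, size=n))

eps_list = [0.001, 0.01, 0.05, 0.1]
summary = run_eps_list_FP(eps_list, x, a, y)
print(summary)   # columns: 'err', 'input eps', 'empirical eps'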
Example No. 6
def reduce_and_predict(df1, df2, epsilon):
    # concatenate the data and clean it
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)
    '''
    df = pandas.read_csv("UCIAdult.csv")
    ntrain = 18000
    ntest = 7839
    df.drop(columns = ['Unnamed: 0'])
    '''
    df = pandas.get_dummies(df,
                            prefix=['income', 'sex', 'native_country',
                                    'marital_status', 'workclass', 'occupation'],
                            drop_first=True)
    df = df.rename(
        columns={
            'income_>50K': 'income',
            'sex_Female': 'sex',
            'native_country_United-States': 'native_country',
            'marital_status_Not-Married': 'marital_status'
        })
    df = df.astype('int64')
    label_names = ['income']
    protected_attribute_names = ['sex']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    dataX = train_data.drop(columns=['income', 'sex'])
    dataA = train_data['sex']
    dataY = train_data['income']

    X_test = test_data.drop(columns=['income', 'sex'])
    learner = LogisticRegression()

    #moments = reload(moments)
    #red = reload(red)
    results_tuple = red.expgrad(dataX, dataA, dataY, learner, eps=epsilon)
    best_class = results_tuple.best_classifier

    Y_hat = best_class(X_test) >= 0.5
    Y_hat = Y_hat.to_frame()
    # flatten the boolean prediction frame into a 1-D array of ints
    Y_hat = np.int64(Y_hat.to_numpy().ravel())

    return Y_hat
Example No. 7
score, _ = ad.audit_tree(test, feature_audit, 'predict', protected)
#print(unfair_treatment[unfair_treatment.race == 'Caucasian'][feature_audit + ['predict', outcome]].describe())
#print(unfair_treatment[unfair_treatment.race == 'African-American'][feature_audit + ['predict', outcome]].describe())
print(score)

# reduction method
epsilon = 0.01
constraint = moments.EO()
trainX = train[feature_list]
trainY = train[outcome]
trainA = train['attr']
logreg = LogisticRegression()
dct = DecisionTreeClassifier()
res_tuple = red.expgrad(trainX,
                        trainA,
                        trainY,
                        dct,
                        cons=constraint,
                        eps=epsilon)
res = res_tuple._asdict()
best_classifier = res["best_classifier"]
test['predict'] = np.array(best_classifier(np.array(test[feature_list])))
test.loc[test.predict < 0.5, 'predict'] = 0
test.loc[test.predict >= 0.5, 'predict'] = 1

# auditing learner
feature_audit = ['ugpa', 'cluster', 'lsat', 'fam_inc']
score, _ = ad.audit_tree(test, feature_audit, 'predict', protected)
#print(unfair_treatment[unfair_treatment.race == 'Caucasian'][feature_audit + ['predict', outcome]].describe())
#print(unfair_treatment[unfair_treatment.race == 'African-American'][feature_audit + ['predict', outcome]].describe())
print(score)
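As a quick follow-up, one way the thresholded predictions could be summarized per group is sketched below; it assumes the test frame, the outcome variable, and the 'attr' column from the script above.

# Illustrative post-hoc summary; relies only on columns already used in the script above.
accuracy = (test['predict'] == test[outcome]).mean()             # overall accuracy
positive_rate_by_group = test.groupby('attr')['predict'].mean()  # selection rate per group
print(accuracy)
print(positive_rate_by_group)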
Example No. 8
    attrs = [str(x) for x in 'AAAAAAA' 'BBBBBBB' 'CCCCCC']
    labls = [int(x) for x in '0110100' '0010111' '001111']
    feat1 = [int(x) for x in '0110101' '0111101' '001011']
    feat2 = [int(x) for x in '0000100' '0000011' '111111']
    feat3 = [int(x) for x in '1111111' '1111111' '111111']

    dataX = pd.DataFrame({"feat1": feat1, "feat2": feat2, "feat3": feat3})
    dataY = pd.Series(labls)
    dataA = pd.Series(attrs)

    learner = LeastSquaresLearner()

    for test in tests:
        res_tuple = red.expgrad(dataX,
                                dataA,
                                dataY,
                                learner,
                                cons=test["cons_class"](),
                                eps=test["eps"])
        res = res_tuple._asdict()
        Q = res["best_classifier"]
        res["n_classifiers"] = len(res["classifiers"])

        disp = test["cons_class"]()
        disp.init(dataX, dataA, dataY)

        error = moments.MisclassError()
        error.init(dataX, dataA, dataY)

        res["disp"] = disp.gamma(Q).max()
        res["error"] = error.gamma(Q)[0]
        report_header = "testing (%s, eps=%.3f)" \
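The loop above iterates over a tests list of dicts with "cons_class" and "eps" keys; a hypothetical definition, reusing the legacy fairlearn moments constraints that appear elsewhere in these examples (and assuming the moments module is imported as in the snippet), could look like this.

# Hypothetical test matrix; the constraint classes and eps values are assumptions.
tests = [
    {"cons_class": moments.DP, "eps": 0.050},   # demographic parity
    {"cons_class": moments.DP, "eps": 0.020},
    {"cons_class": moments.EO, "eps": 0.050},   # equalized odds
]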
Example No. 9
#gmm.fit(trainXA)
#train['cluster2'] = gmm.predict(trainXA)

mbk = MiniBatchKMeans(init='k-means++', n_clusters=10, batch_size=batch_size,
                      n_init=10, max_no_improvement=20, verbose=0,
                      random_state=1, reassignment_ratio=0.1)

mbk.fit(trainXA[:, :-2])
mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0)
train['cluster2'] = pairwise_distances_argmin(trainXA[:, :-2], mbk_means_cluster_centers)

# bias detecting from fairlearn
learner = LogisticRegression()

res_tuple = red.expgrad(trainX, trainA, trainY, learner,
                        cons=cons, eps=epsilon)
res = res_tuple._asdict()
best_classifier = res["best_classifier"]

predicted = best_classifier(trainX)
train['predicted'] = predicted

# per-cluster mean of the reduction's predictions, converted to a Bernoulli variance p * (1 - p)
bias = train.groupby('cluster').predicted.mean()
bias = bias * (1 - bias)
print(bias)

# the same measure computed on the true outcome
bias = train.groupby('cluster')[outcome].mean()
bias = bias * (1 - bias)
print(bias)

bias = train.groupby('cluster2').predicted.mean()