def test_epsilon_all_groups():
    """Smoothed EDF over three protected attributes (one non-binary)
    matches the reference implementation's value."""

    def custom_preprocessing(df):
        # slight workaround for non-binary protected attribute
        # feature should be categorical but protected attribute should be numerical
        race_codes = {
            'Black': 0,
            'White': 1,
            'Asian-Pac-Islander': 2,
            'Amer-Indian-Eskimo': 3,
            'Other': 4,
        }
        df['race-num'] = df.race.map(race_codes)
        return df.fillna('Unknown')

    nonbinary_ad = AdultDataset(
        protected_attribute_names=['sex', 'native-country', 'race-num'],
        privileged_classes=[['Male'], ['United-States'], [1]],
        categorical_features=['workclass', 'education', 'marital-status',
                              'occupation', 'relationship', 'race'],
        custom_preprocessing=custom_preprocessing)

    # drop redundant race feature (not relevant to this test)
    drop_idx = nonbinary_ad.feature_names.index('race-num')
    nonbinary_ad.features = np.delete(nonbinary_ad.features, drop_idx, axis=1)
    nonbinary_ad.feature_names = np.delete(nonbinary_ad.feature_names, drop_idx)

    _, nonbinary_test = nonbinary_ad.split([32561], shuffle=False)
    metric = BinaryLabelDatasetMetric(nonbinary_test)
    eps_data = metric.smoothed_empirical_differential_fairness()
    assert eps_data == 2.063813731996515  # verified with reference implementation
def Adult_dataset(name_prot='sex'):
    """Load a two-feature Adult dataset and dump it to dataset/Adult.csv."""
    adult = AdultDataset(protected_attribute_names=['sex'],
                         privileged_classes=[['Male']],
                         features_to_keep=['age', 'education-num'])
    frame, _ = adult.convert_to_dataframe()
    # Standardize the label column name and use a fresh 0..n-1 index.
    frame = frame.rename(columns={'income-per-year': 'labels'})
    frame = frame.reset_index(drop=True)
    frame.to_csv("dataset/Adult.csv")
def test_adult():
    """End-to-end MetaFairClassifier check on Adult: the tau=0.9
    (debiased) model's FDR statistic must be at least the unconstrained
    tau=0 model's.
    """
    protected = 'sex'
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=[
                          'age', 'education-num', 'capital-gain',
                          'capital-loss', 'hours-per-week'
                      ])
    #scaler = MinMaxScaler(copy=False)
    # ad.features = scaler.fit_transform(ad.features)
    train, test = ad.split([32562])
    assert np.any(test.labels)
    #print(test.labels)
    # Unconstrained baseline: tau=0 imposes no fairness constraint.
    biased_model = MetaFairClassifier(tau=0, sensitive_attr=protected)
    biased_model.fit(train)
    dataset_bias_test = biased_model.predict(test)
    # Map labels to {+1, -1}. NOTE(review): each y here appears to be a
    # 1-element array, so `y == ...` relies on single-element-array
    # truthiness — confirm; .ravel() (as in the sibling test) is safer.
    predictions = [
        1 if y == train.favorable_label else -1
        for y in list(dataset_bias_test.labels)
    ]
    y_test = np.array(
        [1 if y == [train.favorable_label] else -1 for y in test.labels])
    x_control_test = pd.DataFrame(data=test.features,
                                  columns=test.feature_names)[protected]
    acc, sr, unconstrainedFDR = getStats(y_test, predictions, x_control_test)
    #print(unconstrainedFDR)
    # Fairness-constrained model.
    tau = 0.9
    debiased_model = MetaFairClassifier(tau=tau, sensitive_attr=protected)
    debiased_model.fit(train)
    #dataset_debiasing_train = debiased_model.predict(dataset_orig_train)
    dataset_debiasing_test = debiased_model.predict(test)
    predictions = list(dataset_debiasing_test.labels)
    predictions = [
        1 if y == train.favorable_label else -1 for y in predictions
    ]
    y_test = np.array(
        [1 if y == [train.favorable_label] else -1 for y in test.labels])
    x_control_test = pd.DataFrame(data=test.features,
                                  columns=test.feature_names)[protected]
    acc, sr, fdr = getStats(y_test, predictions, x_control_test)
    #print(fdr, unconstrainedFDR)
    # The constrained model must be at least as fair as the baseline.
    assert (fdr >= unconstrainedFDR)
    #test_adult()
def Adult_dataset(name_prot='sex'):
    """Load a two-feature Adult dataset and split it into pieces.

    Returns:
        (data, atribute, sensitive, output, privileged_groups,
        unprivileged_groups) where `atribute` is the feature frame without
        the label and sensitive columns.
    """
    source = AdultDataset(protected_attribute_names=['sex'],
                          privileged_classes=[['Male']],
                          features_to_keep=['age', 'education-num'])
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]

    data, _ = source.convert_to_dataframe()
    data = data.rename(columns={'income-per-year': 'labels'})
    data = data.reset_index(drop=True)

    sensitive = data[name_prot]
    output = source.labels
    # Features only: strip the label column and the sensitive attribute.
    atribute = data.drop(columns=['labels', name_prot])
    return data, atribute, sensitive, output, privileged_groups, unprivileged_groups
def test_adult():
    """Sanity-check the default AdultDataset load."""
    dataset = AdultDataset()
    # print(dataset.feature_names)
    # Base rate of the favorable label, verified against the known value.
    assert np.isclose(dataset.labels.mean(), 0.2478, atol=5e-5)
    metric = BinaryLabelDatasetMetric(dataset)
    assert metric.num_instances() == 45222
def test_adult_no_drop():
    """Keeping only two numeric features must retain all 48842 rows
    (no rows dropped for missing values)."""
    dataset = AdultDataset(protected_attribute_names=['sex'],
                           privileged_classes=[['Male']],
                           categorical_features=[],
                           features_to_keep=['age', 'education-num'])
    assert BinaryLabelDatasetMetric(dataset).num_instances() == 48842
def load_dataset(name):
    """Load one of the supported AIF360 datasets by name.

    Args:
        name: 'Adult', 'German', or 'Compas'.

    Returns:
        (dataset, name) tuple.

    Raises:
        ValueError: if `name` is not a supported dataset. (Previously an
            unknown name fell through and raised an opaque
            UnboundLocalError on `ds`.)
    """
    loaders = {
        'Adult': AdultDataset,
        'German': GermanDataset,
        'Compas': CompasDataset,
    }
    if name not in loaders:
        raise ValueError(f"{name!r} is not an available dataset.")
    return loaders[name](), name
def get_data(dataset_used, protected_attribute_used):
    """Load a dataset plus group definitions and split it 60/20/20.

    Args:
        dataset_used: "adult", "german", "compas" or "bank".
        protected_attribute_used: 1 selects the primary protected
            attribute (sex; age for bank); any other value selects the
            secondary one (race; age for german).

    Returns:
        (train, valid, test, privileged_groups, unprivileged_groups)

    Raises:
        ValueError: for an unknown dataset name, or for the bank dataset
            with a non-primary attribute.
    """
    if dataset_used == "adult":
        dataset_orig = AdultDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
    elif dataset_used == "german":
        dataset_orig = load_preproc_data_german()
        # Recode German labels {1, 2} -> {0, 1}.
        dataset_orig.labels -= 1
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
    elif dataset_used == "compas":
        dataset_orig = CompasDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
    elif dataset_used == "bank":
        dataset_orig = BankDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            # BUG FIX: the old code silently fell back to 'race' groups,
            # but the bank dataset only defines 'age' as a protected
            # attribute (see the sibling get_data helper); fail loudly.
            raise ValueError(
                "bank dataset only supports protected_attribute_used == 1 "
                "(age).")
    else:
        raise ValueError(f"{dataset_used} is not an available dataset.")

    # 60% train; split the remainder evenly into valid/test, fixed seed.
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6],
                                                             shuffle=True,
                                                             seed=101)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5],
                                                                  shuffle=True,
                                                                  seed=101)
    return (dataset_orig_train, dataset_orig_valid, dataset_orig_test,
            privileged_groups, unprivileged_groups)
def test_instance_weights():
    """Reweighing must preserve the total instance weight (fnlwgt)."""
    dataset = AdultDataset(instance_weights_name='fnlwgt', features_to_drop=[])
    rw = Reweighing(unprivileged_groups=[{'sex': 0}],
                    privileged_groups=[{'sex': 1}])
    transformed = rw.fit_transform(dataset)
    print(transformed.instance_weights.sum())
    assert np.isclose(dataset.instance_weights.sum(),
                      transformed.instance_weights.sum())
def test_repair0():
    """repair_level=0 must leave the dataset unchanged."""
    dataset = AdultDataset(protected_attribute_names=['sex'],
                           privileged_classes=[['Male']],
                           categorical_features=[],
                           features_to_keep=['age', 'education-num'])
    remover = DisparateImpactRemover(repair_level=0.)
    assert remover.fit_transform(dataset) == dataset
def getAdultDataset():
    """Build a (domain, samples) pair from the preprocessed Adult data.

    Returns:
        simpleDomain: Domain over race/sex/age-decade/education-years
            plus the label name.
        simpleSamples: feature matrix with the label appended as the
            last column.
    """
    # NOTE: the original code also instantiated a full AdultDataset()
    # here but never used it; that dead (and expensive) load is removed.
    dataset_orig = load_preproc_data_adult(['sex'])
    features = ['race', 'sex', 'age decade', 'education years']
    domainArray = getAdultDomain()
    features.append(dataset_orig.label_names[0])
    simpleDomain = Domain(features, domainArray)
    # Flatten the (n, 1) label array and append it as the last column.
    labels = [y[0] for y in dataset_orig.labels]
    simpleSamples = np.c_[dataset_orig.features, labels]
    return simpleDomain, simpleSamples
from sklearn.linear_model import LogisticRegression from aif360.datasets import AdultDataset from aif360.algorithms.postprocessing import EqOddsPostprocessing from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing from aif360.metrics import ClassificationMetric train, val, test = AdultDataset().split([0.4, 0.7]) lr = LogisticRegression(solver='lbfgs').fit(train.features, train.labels) val_pred = val.copy() val_pred.labels = lr.predict(val.features).reshape((-1, 1)) val_pred.scores = lr.predict_proba(val.features)[:, 1] pred = test.copy() pred.labels = lr.predict(test.features).reshape((-1, 1)) pred.scores = lr.predict_proba(test.features)[:, 1] cm_lr = ClassificationMetric(test, pred, unprivileged_groups=[{ 'sex': 0 }], privileged_groups=[{ 'sex': 1 }]) def test_eqodds(): eqo = EqOddsPostprocessing(unprivileged_groups=[{ 'sex': 0
def test_adult_test_set():
    """Splitting off the first 15060 rows as test must still leave
    favorable labels in that split."""
    dataset = AdultDataset()
    # test, train = dataset.split([16281])  # pre-cleaning split point
    test, train = dataset.split([15060])
    assert np.any(test.labels)
def LoadData(dataset_name, protected_attribute_name, raw=True):
    """Load a fairness benchmark dataset plus its group definitions.

    Args:
        dataset_name: 'adult', 'german' or 'compas'.
        protected_attribute_name: 'sex' or 'race' for adult/compas,
            'sex' or 'age' for german.
        raw: if True return the raw AIF360 dataset (optim_options stays
            None); if False return the preprocessed variant together with
            optimized-preprocessing options.

    Returns:
        (dataset, privileged_groups, unprivileged_groups, optim_options)

    Raises:
        ValueError: for an unsupported dataset/attribute combination.
            (BUG FIX: the original printed and called exit() whenever
            optim_options was None — which happened for EVERY raw=True
            request, since only the non-raw branches set optim_options.
            It also compared with `== None` instead of `is None`.)
    """

    def options(distortion_fun):
        # clist/dlist values follow the optimized-preprocessing demos.
        return {
            "distortion_fun": distortion_fun,
            "epsilon": 0.05,
            "clist": [0.99, 1.99, 2.99],
            "dlist": [.1, 0.05, 0],
        }

    optim_options = None
    dataset_original = None

    if dataset_name == "adult" and protected_attribute_name in ("sex", "race"):
        if raw:
            dataset_original = AdultDataset()
        else:
            dataset_original = load_preproc_data_adult(
                [protected_attribute_name])
            optim_options = options(get_distortion_adult)
    elif dataset_name == "german" and protected_attribute_name in ("sex", "age"):
        if raw:
            dataset_original = GermanDataset()
        else:
            dataset_original = load_preproc_data_german(
                [protected_attribute_name])
            optim_options = options(get_distortion_german)
        # Recode German labels {1, 2} -> {1, 0} and mark 0 as unfavorable.
        dataset_original.labels = 2 - dataset_original.labels
        dataset_original.unfavorable_label = 0.
    elif dataset_name == "compas" and protected_attribute_name in ("sex", "race"):
        if raw:
            dataset_original = CompasDataset()
        else:
            dataset_original = load_preproc_data_compas(
                [protected_attribute_name])
            optim_options = options(get_distortion_compas)

    if dataset_original is None:
        raise ValueError('No such dataset & group option: '
                         f'{dataset_name} {protected_attribute_name}')

    # Group definitions keyed by attribute name (privileged value is 1),
    # matching what the original function returned.
    protected_attribute_set = {
        'sex': [[{'sex': 1}], [{'sex': 0}]],
        'age': [[{'age': 1}], [{'age': 0}]],
        'race': [[{'race': 1}], [{'race': 0}]],
    }
    privileged_groups, unprivileged_groups = \
        protected_attribute_set[protected_attribute_name]
    return (dataset_original, privileged_groups, unprivileged_groups,
            optim_options)
def test_adult():
    """Full repair with DisparateImpactRemover should push disparate
    impact toward 1 on Adult while leaving protected attributes and
    labels untouched."""
    protected = 'sex'
    dataset = AdultDataset(protected_attribute_names=[protected],
                           privileged_classes=[['Male']],
                           categorical_features=[],
                           features_to_keep=['age', 'education-num',
                                             'capital-gain', 'capital-loss',
                                             'hours-per-week'])
    scaler = MinMaxScaler(copy=False)
    train, test = dataset.split([32561])
    assert np.any(test.labels)

    # Scale on train only, then apply the same transform to test.
    train.features = scaler.fit_transform(train.features)
    test.features = scaler.transform(test.features)

    sens_idx = train.feature_names.index(protected)
    X_train = np.delete(train.features, sens_idx, axis=1)
    X_test = np.delete(test.features, sens_idx, axis=1)
    y_train = train.labels.ravel()

    remover = DisparateImpactRemover(repair_level=1.0)
    train_repd = remover.fit_transform(train)
    test_repd = remover.fit_transform(test)
    # Repair must not modify the protected attributes.
    assert np.all(
        train_repd.protected_attributes == train.protected_attributes)

    model = LogisticRegression(class_weight='balanced')
    model.fit(X_train, y_train)
    test_pred = test.copy()
    test_pred.labels = model.predict(X_test)

    X_train_repd = np.delete(train_repd.features, sens_idx, axis=1)
    X_test_repd = np.delete(test_repd.features, sens_idx, axis=1)
    y_train_repd = train_repd.labels.ravel()
    # Repair must not modify the labels either.
    assert (y_train == y_train_repd).all()

    model.fit(X_train_repd, y_train_repd)
    test_repd_pred = test_repd.copy()
    test_repd_pred.labels = model.predict(X_test_repd)

    priv = [{protected: 1}]
    unpriv = [{protected: 0}]
    before = ClassificationMetric(test, test_pred,
                                  privileged_groups=priv,
                                  unprivileged_groups=unpriv
                                  ).disparate_impact()
    after = ClassificationMetric(test_repd, test_repd_pred,
                                 privileged_groups=priv,
                                 unprivileged_groups=unpriv
                                 ).disparate_impact()

    # Repair should improve disparate impact and land near parity.
    assert after > before
    assert abs(1 - after) <= 0.2
def test_adult():
    """MetaFairClassifier FDR checks: getStats must agree with
    ClassificationMetric's false_discovery_rate_ratio, and the tau=0.9
    model must be at least as fair as the unconstrained tau=0 model."""
    np.random.seed(1)
    # np.random.seed(9876)
    protected = 'sex'
    dataset = AdultDataset(protected_attribute_names=[protected],
                           privileged_classes=[['Male']],
                           categorical_features=[],
                           features_to_keep=['age', 'education-num',
                                             'capital-gain', 'capital-loss',
                                             'hours-per-week'])
    #scaler = MinMaxScaler(copy=False)
    # ad.features = scaler.fit_transform(ad.features)
    train, test = dataset.split([32561])

    def fdr_ratio(pred_dataset):
        # Symmetrized FDR ratio from the AIF360 metric.
        cm = ClassificationMetric(test, pred_dataset,
                                  unprivileged_groups=[{protected: 0}],
                                  privileged_groups=[{protected: 1}])
        ratio = cm.false_discovery_rate_ratio()
        return min(ratio, 1 / ratio)

    def signed(labels):
        # Map favorable/unfavorable labels to +1/-1.
        return [1 if y == train.favorable_label else -1
                for y in labels.ravel()]

    # Unconstrained baseline (tau=0).
    biased_model = MetaFairClassifier(tau=0, sensitive_attr=protected)
    biased_model.fit(train)
    bias_pred = biased_model.predict(test)
    unconstrainedFDR2 = fdr_ratio(bias_pred)

    y_test = np.array(signed(test.labels))
    x_control_test = pd.DataFrame(data=test.features,
                                  columns=test.feature_names)[protected]
    acc, sr, unconstrainedFDR = getStats(y_test, signed(bias_pred.labels),
                                         x_control_test)
    # getStats and ClassificationMetric must agree on the baseline FDR.
    assert np.isclose(unconstrainedFDR, unconstrainedFDR2)

    # Fairness-constrained model (tau=0.9).
    debiased_model = MetaFairClassifier(tau=0.9, sensitive_attr=protected)
    debiased_model.fit(train)
    #dataset_debiasing_train = debiased_model.predict(dataset_orig_train)
    debias_pred = debiased_model.predict(test)
    acc, sr, fdr = getStats(y_test, signed(debias_pred.labels),
                            x_control_test)
    fdr2 = fdr_ratio(debias_pred)
    assert np.isclose(fdr, fdr2)
    #print(fdr, unconstrainedFDR)
    assert fdr2 >= unconstrainedFDR2
def load_preproc_data_adult(protected_attributes=None):
    """Load the Adult dataset with NIPS-2017 style preprocessing.

    Args:
        protected_attributes: protected attribute names to keep;
            defaults to ['sex', 'race'].

    Returns:
        AdultDataset with age bucketed by decade, education clipped to
        '<6'/'>12' bins, and sex/race recoded to {0.0, 1.0}.
    """

    def custom_preprocessing(df):
        """The custom pre-processing function is adapted from
        https://github.com/fair-preprocessing/nips2017/blob/master/Adult/code/Generate_Adult_Data.ipynb
        """
        np.random.seed(1)

        # Bucket age by decade.
        df['Age (decade)'] = df['age'].apply(lambda x: x // 10 * 10)

        def bin_education(x):
            # Collapse the tails of the education-num range.
            if x == -1:
                return 'missing_edu'
            if x <= 5:
                return '<6'
            if x >= 13:
                return '>12'
            return x

        def cap_age(x):
            # Merge everyone 70 and over into a single bucket.
            return '>=70' if x >= 70 else x

        # Cluster education and age attributes; limit education range.
        df['Education Years'] = df['education-num'].apply(bin_education)
        df['Education Years'] = df['Education Years'].astype('category')

        # Limit age range.
        df['Age (decade)'] = df['Age (decade)'].apply(cap_age)

        # Rename income variable.
        df['Income Binary'] = df['income-per-year']

        # Recode sex and race to numeric codes.
        df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
        df['race'] = df['race'].apply(lambda r: 1.0 if r == "White" else 0.0)
        return df

    XD_features = ['Age (decade)', 'Education Years', 'sex', 'race']
    D_features = (['sex', 'race'] if protected_attributes is None
                  else protected_attributes)
    Y_features = ['Income Binary']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = ['Age (decade)', 'Education Years']

    # Privileged classes and protected-attribute value maps.
    all_privileged_classes = {"sex": [1.0], "race": [1.0]}
    all_protected_attribute_maps = {
        "sex": {1.0: 'Male', 0.0: 'Female'},
        "race": {1.0: 'White', 0.0: 'Non-white'},
    }

    return AdultDataset(
        label_name=Y_features[0],
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=['?'],
        metadata={
            'label_maps': [{1.0: '>50K', 0.0: '<=50K'}],
            'protected_attribute_maps': [all_protected_attribute_maps[x]
                                         for x in D_features]
        },
        custom_preprocessing=custom_preprocessing)
def get_data(dataset, protected_attribute, seed=101):
    """Load a dataset with group definitions and split it 60/20/20.

    Args:
        dataset: 'adult', 'german', 'compas' or 'bank'.
        protected_attribute: attribute name; 'sex_or_race' synthesizes a
            combined attribute for adult/compas.
        seed: seed used for both shuffled split calls.

    Returns:
        (train, valid, test, privileged_groups, unprivileged_groups)

    Raises:
        ValueError: for an unknown dataset, or an attribute the chosen
            dataset does not support.
    """
    def protected_attribute_error():
        raise ValueError(
            f'protected attribute {protected_attribute} is not available for dataset {dataset}'
        )

    if dataset == 'adult':
        from aif360.datasets import AdultDataset
        dataset_orig = AdultDataset()
        if protected_attribute == 'sex':
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        elif protected_attribute == 'sex_or_race':
            # Append a synthetic column: the logical OR of two feature
            # columns, registered as a new protected attribute.
            # NOTE(review): columns [2, 3] are assumed to be the race and
            # sex features in the default AdultDataset ordering — confirm.
            dataset_orig.feature_names += ['sex_or_race']
            dataset_orig.features = np.hstack([
                dataset_orig.features,
                np.expand_dims(
                    np.logical_or(*dataset_orig.features[:, [2, 3]].T).astype(
                        np.float64), -1)
            ])
            dataset_orig.protected_attributes = np.hstack([
                dataset_orig.protected_attributes,
                dataset_orig.features[:, [-1]]
            ])
            dataset_orig.protected_attribute_names += ['sex_or_race']
            dataset_orig.privileged_protected_attributes += [np.array([1.])]
            dataset_orig.unprivileged_protected_attributes += [np.array([0.])]
            privileged_groups = [{'sex_or_race': 1}]
            unprivileged_groups = [{'sex_or_race': 0}]
        elif protected_attribute == 'race':
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
        else:
            protected_attribute_error()
    elif dataset == 'german':
        from aif360.datasets import GermanDataset
        dataset_orig = GermanDataset()
        if protected_attribute == 'sex':
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        elif protected_attribute == 'age':
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            protected_attribute_error()
    elif dataset == 'compas':
        from aif360.datasets import CompasDataset
        dataset_orig = CompasDataset()
        if protected_attribute == 'sex':
            # NOTE(review): groups are inverted relative to the other
            # datasets (privileged is sex=0) — presumably intentional for
            # COMPAS; confirm against the dataset's attribute coding.
            privileged_groups = [{'sex': 0}]
            unprivileged_groups = [{'sex': 1}]
        elif protected_attribute == 'sex_or_race':
            # Same synthetic-attribute construction as for adult.
            # NOTE(review): columns [0, 2] assumed to be sex and race in
            # the default CompasDataset ordering — confirm.
            dataset_orig.feature_names += ['sex_or_race']
            dataset_orig.features = np.hstack([
                dataset_orig.features,
                np.expand_dims(
                    np.logical_or(*dataset_orig.features[:, [0, 2]].T).astype(
                        np.float64), -1)
            ])
            dataset_orig.protected_attributes = np.hstack([
                dataset_orig.protected_attributes,
                dataset_orig.features[:, [-1]]
            ])
            dataset_orig.protected_attribute_names += ['sex_or_race']
            dataset_orig.privileged_protected_attributes += [np.array([1.])]
            dataset_orig.unprivileged_protected_attributes += [np.array([0.])]
            privileged_groups = [{'sex_or_race': 1}]
            unprivileged_groups = [{'sex_or_race': 0}]
        elif protected_attribute == 'race':
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
        else:
            protected_attribute_error()
    elif dataset == 'bank':
        from aif360.datasets import BankDataset
        dataset_orig = BankDataset()
        if protected_attribute == 'age':
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            protected_attribute_error()
    else:
        raise ValueError(f'{dataset} is not an available dataset.')

    # 60% train; split the remainder evenly into valid and test.
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6],
                                                             shuffle=True,
                                                             seed=seed)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5],
                                                                  shuffle=True,
                                                                  seed=seed)

    return dataset_orig_train, dataset_orig_valid, dataset_orig_test, privileged_groups, unprivileged_groups
# Module-level fixtures shared by the epsilon (smoothed EDF) tests.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from aif360.datasets import AdultDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric

# Adult data with three protected attributes; missing values become
# the literal category 'Unknown' instead of dropping rows.
ad = AdultDataset(protected_attribute_names=['race', 'sex', 'native-country'],
                  privileged_classes=[['White'], ['Male'], ['United-States']],
                  categorical_features=[
                      'workclass', 'education', 'marital-status', 'occupation',
                      'relationship'
                  ],
                  custom_preprocessing=lambda df: df.fillna('Unknown'))
# Deterministic split at row 32561 (no shuffling).
adult_train, adult_test = ad.split([32561], shuffle=False)

# Fit the scaler on train only, then transform test with it.
scaler = StandardScaler()
X = scaler.fit_transform(adult_train.features)
test_X = scaler.transform(adult_test.features)

clf = LogisticRegression(C=1.0, random_state=0, solver='liblinear')
# Copy of the test set carrying the classifier's predicted labels.
adult_pred = adult_test.copy()
adult_pred.labels = clf.fit(X, adult_train.labels.ravel()).predict(test_X)

dataset_metric = BinaryLabelDatasetMetric(adult_test)
classifier_metric = BinaryLabelDatasetMetric(adult_pred)


def test_epsilon_dataset_binary_groups():
    """Smoothed EDF of the raw test data matches the reference value."""
    eps_data = dataset_metric.smoothed_empirical_differential_fairness()
    assert eps_data == 1.53679014653623  # verified with reference implementation
import numpy as np from aif360.datasets import AdultDataset from aif360.metrics import ClassificationMetric from aif360.algorithms.inprocessing import MetaFairClassifier protected = 'sex' ad = AdultDataset(protected_attribute_names=[protected], privileged_classes=[['Male']], categorical_features=[], features_to_keep=[ 'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week' ]) test, train = ad.split([16281], shuffle=False) def test_adult_sr(): biased_model = MetaFairClassifier(tau=0, sensitive_attr=protected, type='sr', seed=123).fit(train) dataset_bias_test = biased_model.predict(test) biased_cm = ClassificationMetric(test, dataset_bias_test, unprivileged_groups=[{ protected: 0 }], privileged_groups=[{ protected: 1
def test_adult_test_set():
    """Splitting train/test at row 30162 must leave favorable labels in
    the test portion."""
    dataset = AdultDataset()
    # train, test = dataset.split([32561])  # pre-cleaning split point
    train, test = dataset.split([30162])
    assert np.any(test.labels)
from IPython.display import Markdown, display import matplotlib.pyplot as plt from variable_cep import CalibratedEqOddsPostprocessing #modified for varying weight from variable_cep import normed_rates from tqdm import tqdm from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import StandardScaler from sklearn.metrics import roc_curve ## import dataset dataset_used = "compas" # "adult", "german", "compas" protected_attribute_used = 2 # 1, 2 # code to identify the protected attributes from all of the dataset features if dataset_used == "adult": dataset_orig = AdultDataset() # dataset_orig = load_preproc_data_adult() if protected_attribute_used == 1: privileged_groups = [{'sex': 1}] unprivileged_groups = [{'sex': 0}] else: privileged_groups = [{'race': 1}] unprivileged_groups = [{'race': 0}] elif dataset_used == "german": dataset_orig = GermanDataset() if protected_attribute_used == 1: privileged_groups = [{'sex': 1}] unprivileged_groups = [{'sex': 0}] else: privileged_groups = [{'age': 1}]
import numpy as np from sklearn.model_selection import GridSearchCV from sklearn.metrics import accuracy_score import tensorflow as tf from aif360.datasets import AdultDataset from aif360.sklearn.datasets import fetch_adult from aif360.algorithms.inprocessing import AdversarialDebiasing as OldAdversarialDebiasing from aif360.sklearn.inprocessing import AdversarialDebiasing X, y, sample_weight = fetch_adult(numeric_only=True) adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], features_to_keep=[ 'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week' ], features_to_drop=[]) def test_adv_debias_old_reproduce(): """Test that the old AdversarialDebiasing is reproducible.""" sess = tf.Session() old_adv_deb = OldAdversarialDebiasing(unprivileged_groups=[{ 'sex': 0 }], privileged_groups=[{ 'sex': 1 }], scope_name='old_classifier', sess=sess,
def load_preproc_data_adult(protected_attributes=None):
    """Load the preprocessed Adult dataset with synthetic missing values
    injected into the 'Education Years' feature.

    Args:
        protected_attributes: protected attribute names to keep;
            defaults to ['sex'].

    Returns:
        AdultDataset with bucketed age/education and group-dependent
        missingness (coded as -1) in 'Education Years'.
    """
    def custom_preprocessing(df):
        """The custom pre-processing function is adapted from
            https://github.com/fair-preprocessing/nips2017/blob/master/Adult/code/Generate_Adult_Data.ipynb
        """
        # Fixed seed so the injected missingness is reproducible.
        np.random.seed(1)

        # Group age by decade
        df['Age (decade)'] = df['age'].apply(lambda x: x // 10 * 10)
        # df['Age (decade)'] = df['age'].apply(lambda x: np.floor(x/10.0)*10.0)

        def group_edu(x):
            # -1 is the marker used below for injected missing values.
            if x == -1:
                return 'missing_edu'
            elif x <= 5:
                return '<6'
            elif x >= 13:
                return '>12'
            else:
                return x

        def age_cut(x):
            # Merge everyone 70 and over into one bucket.
            if x >= 70:
                return '>=70'
            else:
                return x

        def group_race(x):
            if x == "White":
                return 1.0
            else:
                return 0.0

        # Cluster education and age attributes.
        # Limit education range
        df['Education Years'] = df['education-num'].apply(
            lambda x: group_edu(x))
        df['Education Years'] = df['Education Years'].astype('category')

        # Limit age range
        df['Age (decade)'] = df['Age (decade)'].apply(lambda x: age_cut(x))

        # Rename income variable
        df['Income Binary'] = df['income-per-year']

        # Recode sex and race
        df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
        df['race'] = df['race'].apply(lambda x: group_race(x))

        # Build one concatenated string per row (sex code first, income
        # label last) so each distinct combination can receive its own
        # missingness probability.
        df1 = df[['sex', 'Education Years', 'Age (decade)', 'Income Binary']]
        tot = []
        for index, row in df1.iterrows():
            result = ''
            for j in df1.columns:
                result = result + str(row[j])
            tot.append(result)
        df1['tmp_feature'] = tot
        df1['mis_prob'] = 0
        # Group-dependent missingness: low-income rows whose string starts
        # with sex code '0' get p=0.8; other sex-code-'1' rows p=0.08;
        # everything else p=0.04.
        for i in df1['tmp_feature'].unique():
            if '<=50K' in i and i[0] == '0':
                df1.loc[df1['tmp_feature'] == i, 'mis_prob'] = 0.8
            elif i[0] == '1':
                df1.loc[df1['tmp_feature'] == i, 'mis_prob'] = 0.08
            else:
                df1.loc[df1['tmp_feature'] == i, 'mis_prob'] = 0.04
        # Per-row Bernoulli draw; hits become the missing marker -1.
        # NOTE: the exact RNG draw order here determines the output, so
        # this loop must not be reordered or vectorized casually.
        new_label = []
        for i, j in zip(df1['mis_prob'], df1['Education Years']):
            if np.random.binomial(1, i, 1)[0] == 1:
                new_label.append(-1)
            else:
                new_label.append(j)
        df['Education Years'] = new_label
        print('Total number of missing values')
        print(len(df.loc[df['Education Years'] == -1, :].index))
        print('Total number of observations')
        print(len(df.index))
        return df

    XD_features = ['Age (decade)', 'Education Years', 'sex']
    D_features = ['sex'
                  ] if protected_attributes is None else protected_attributes
    Y_features = ['Income Binary']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = ['Age (decade)', 'Education Years']

    # privileged classes
    all_privileged_classes = {"sex": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {"sex": {1.0: 'Male', 0.0: 'Female'}}

    return AdultDataset(
        label_name=Y_features[0],
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=['?'],
        metadata={
            'label_maps': [{
                1.0: '>50K',
                0.0: '<=50K'
            }],
            'protected_attribute_maps':
            [all_protected_attribute_maps[x] for x in D_features]
        },
        custom_preprocessing=custom_preprocessing)