def german_dataset_age(name_prot=None):
    """Export the German Credit data to dataset/German_age.csv.

    'age' is the protected attribute (age >= 25 privileged); sex-related
    columns are dropped and the 'credit' column is renamed to 'labels'.

    :param name_prot: list of protected attribute names (defaults to ['age'])
    """
    # None sentinel instead of a mutable list default (shared across calls).
    if name_prot is None:
        name_prot = ['age']
    dataset_orig = GermanDataset(
        protected_attribute_names=name_prot,
        privileged_classes=[lambda x: x >= 25],
        features_to_drop=['personal_status', 'sex'])
    data, _ = dataset_orig.convert_to_dataframe()
    data.rename(columns={'credit': 'labels'}, inplace=True)
    data.to_csv("dataset/German_age.csv")
def load_german_dataset():
    """Load the aif360 preprocessed German Credit Data Set.

    'age' is the protected attribute, with age >= 25 treated as privileged.
    Sex-related attributes (the other protected-attribute option) are dropped.

    :return: (train, test) 70/30 shuffled split of the German Credit Data Set
    """
    source = GermanDataset(
        protected_attribute_names=['age'],
        privileged_classes=[lambda x: x >= 25],
        features_to_drop=['personal_status', 'sex'])
    train_split, test_split = source.split([0.7], shuffle=True)
    return train_split, test_split
def german_dataset(name_prot=None):
    """Load the German Credit data with 'sex' as the protected attribute.

    :param name_prot: list of protected attribute names (defaults to ['sex'])
    :return: tuple (data, atribute, sensitive, output, privileged_groups,
        unprivileged_groups) where `atribute` is the feature frame without the
        label/protected columns and `output` holds labels remapped 1->0, 2->1.
    """
    # None sentinel instead of a mutable list default (shared across calls).
    if name_prot is None:
        name_prot = ['sex']
    dataset_orig = GermanDataset(
        protected_attribute_names=name_prot,
        features_to_drop=['personal_status', 'age'])
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    data, _ = dataset_orig.convert_to_dataframe()
    data.rename(columns={'credit': 'labels'}, inplace=True)
    sensitive = data[name_prot]
    output = data['labels']
    # NOTE(review): in-place replace on a Series taken from `data` also
    # rewrites data['labels'] (and may warn under newer pandas). Kept as-is
    # to preserve the existing behavior of the returned `data` frame.
    output.replace((1, 2), (0, 1), inplace=True)
    atribute = data.drop('labels', axis=1, inplace=False)
    atribute.drop(name_prot, axis=1, inplace=True)
    return data, atribute, sensitive, output, privileged_groups, unprivileged_groups
def load_dataset(name):
    """Instantiate one of the supported aif360 datasets by name.

    :param name: one of 'Adult', 'German', 'Compas'
    :return: tuple (dataset instance, name)
    :raises ValueError: if `name` is not a supported dataset
    """
    if name == 'Adult':
        ds = AdultDataset()
    elif name == 'German':
        ds = GermanDataset()
    elif name == 'Compas':
        ds = CompasDataset()
    else:
        # Previously an unknown name fell through and raised NameError on
        # the unbound local `ds`; fail fast with a clear message instead.
        raise ValueError("Unknown dataset name: {}".format(name))
    return ds, name
def test_german():
    """The stock German Credit dataset must contain exactly 1000 instances."""
    dataset = GermanDataset()
    metric = BinaryLabelDatasetMetric(dataset)
    assert metric.num_instances() == 1000
def LoadData(dataset_name, protected_attribute_name, raw=True):
    """Load a dataset plus group definitions and optimized-preprocessing
    options for the requested protected attribute.

    :param dataset_name: 'adult', 'german' or 'compas'
    :param protected_attribute_name: 'sex', 'race' or 'age'
        (availability depends on the dataset)
    :param raw: True loads the raw aif360 dataset; False loads the
        load_preproc_data_* variant and builds `optim_options`
    :return: (dataset, privileged_groups, unprivileged_groups, optim_options)
    """
    optim_options = None
    if dataset_name == "adult":
        if raw:
            dataset_original = AdultDataset()
        if protected_attribute_name == "sex":
            # NOTE(review): these local group assignments are shadowed by
            # `protected_attribute_set` at the return below — dead stores.
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
            if not raw:
                dataset_original = load_preproc_data_adult(['sex'])
                optim_options = {
                    "distortion_fun": get_distortion_adult,
                    "epsilon": 0.05,
                    "clist": [0.99, 1.99, 2.99],
                    "dlist": [.1, 0.05, 0]
                }
        elif protected_attribute_name == "race":
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
            if not raw:
                dataset_original = load_preproc_data_adult(['race'])
                optim_options = {
                    "distortion_fun": get_distortion_adult,
                    "epsilon": 0.05,
                    "clist": [0.99, 1.99, 2.99],
                    "dlist": [.1, 0.05, 0]
                }
    elif dataset_name == "german":
        if raw:
            dataset_original = GermanDataset()
        if protected_attribute_name == "sex":
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
            if not raw:
                dataset_original = load_preproc_data_german(['sex'])
                optim_options = {
                    "distortion_fun": get_distortion_german,
                    "epsilon": 0.05,
                    "clist": [0.99, 1.99, 2.99],
                    "dlist": [.1, 0.05, 0]
                }
        elif protected_attribute_name == "age":
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
            if not raw:
                dataset_original = load_preproc_data_german(['age'])
                optim_options = {
                    "distortion_fun": get_distortion_german,
                    "epsilon": 0.05,
                    "clist": [0.99, 1.99, 2.99],
                    "dlist": [.1, 0.05, 0]
                }
        # German raw labels are coded 1 (good) / 2 (bad); remap to 1/0 so
        # 0 is the unfavorable label.
        dataset_original.labels = 2 - dataset_original.labels
        dataset_original.unfavorable_label = 0.
    elif dataset_name == "compas":
        if raw:
            dataset_original = CompasDataset()
        if protected_attribute_name == "sex":
            # NOTE(review): sex groups are inverted here relative to the other
            # datasets (0 privileged) — presumably COMPAS coding; confirm.
            # Dead stores regardless (see return below).
            privileged_groups = [{'sex': 0}]
            unprivileged_groups = [{'sex': 1}]
            if not raw:
                dataset_original = load_preproc_data_compas(['sex'])
                optim_options = {
                    "distortion_fun": get_distortion_compas,
                    "epsilon": 0.05,
                    "clist": [0.99, 1.99, 2.99],
                    "dlist": [.1, 0.05, 0]
                }
        elif protected_attribute_name == "race":
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
            if not raw:
                dataset_original = load_preproc_data_compas(['race'])
                optim_options = {
                    "distortion_fun": get_distortion_compas,
                    "epsilon": 0.05,
                    "clist": [0.99, 1.99, 2.99],
                    "dlist": [.1, 0.05, 0]
                }
    # Groups actually returned come from this table, keyed only on the
    # attribute name (always 1 = privileged, including for COMPAS sex).
    protected_attribute_set = {
        'sex': [[{'sex': 1}], [{'sex': 0}]],
        'age': [[{'age': 1}], [{'age': 0}]],
        'race': [[{'race': 1}], [{'race': 0}]]
    }
    # NOTE(review): optim_options is only populated on `not raw` paths, so
    # raw=True always reaches this exit — confirm raw mode is ever intended
    # to return. (`== None` and exit() kept as-is; `is None` / an exception
    # would be more idiomatic.)
    if optim_options == None:
        print('No such dataset & group option:', dataset_name, protected_attribute_name)
        exit()
    return dataset_original, protected_attribute_set[protected_attribute_name][0], protected_attribute_set[protected_attribute_name][1], optim_options
def __init__(self, *args, **kwargs):
    """Construct the dataset, routing SimMixin-specific keyword arguments
    to SimMixin and everything else to GermanDataset."""
    # Pull the SimMixin-only keywords out before they reach GermanDataset.
    sim_keys = ['mutable_features', 'domains', 'cost_fns', 'discrete']
    sim_kwargs = {key: kwargs.pop(key, None) for key in sim_keys}

    # Fixed preprocessing/metadata configuration for the German data.
    kwargs['custom_preprocessing'] = custom_preprocessing
    kwargs['metadata'] = default_mappings
    kwargs['categorical_features'] = [
        'credit_history', 'purpose', 'employment', 'other_debtors',
        'property', 'installment_plans', 'housing', 'skill_level',
        'telephone'
    ]

    # Human-readable descriptions for the raw German Credit category codes.
    self.human_readable_labels = {
        "A40": "car (new)",
        "A41": "car (used)",
        "A42": "furniture/equipment",
        "A43": "radio/television",
        "A44": "domestic appliances",
        "A45": "repairs",
        "A46": "education",
        "A47": "vacation",
        "A48": "retraining",
        "A49": "business",
        "A410": "others",
        "A30": "no credits taken",
        "A31": "all credits at this bank paid back duly",
        "A32": "existing credits paid back duly till now",
        "A33": "delay in paying off in the past",
        "A34": "critical account",
        "A71": "unemployed",
        "A72": "< 1 year",
        "A73": "1 <= ... < 4 years",
        "A74": "4 <= ... < 7 years",
        "A75": ">= 7 years",
        "A101": "none",
        "A102": "co-applicant",
        "A103": "guarantor",
        "A121": "real estate",
        "A122": "building society savings agreement/life insurance",
        "A123": "car or other",
        "A124": "unknown / no property",
        "A141": "bank",
        "A142": "stores",
        "A143": "none",
        "A151": "rent",
        "A152": "own",
        "A153": "for free",
        "A171": "unemployed/ unskilled - non-resident",
        "A172": "unskilled - resident",
        "A173": "skilled employee / official",
        "A174": "management/ self-employed/ Highly qualified employee/ officer",
        "A191": "none",
        "A192": "yes, registered under the customers name"
    }

    # Explicit two-base initialization (equivalent to the original
    # GermanDataset.__init__(*(tuple([self]) + args), **kwargs) call).
    GermanDataset.__init__(self, *args, **kwargs)
    SimMixin.__init__(self, **sim_kwargs)
def german_dataset_sex(name_prot=None):
    """Export the German Credit data to dataset/German_sex.csv.

    'sex' is the protected attribute; age-related columns are dropped and
    the 'credit' column is renamed to 'labels'.

    :param name_prot: list of protected attribute names (defaults to ['sex'])
    """
    # None sentinel instead of a mutable list default (shared across calls).
    if name_prot is None:
        name_prot = ['sex']
    dataset_orig = GermanDataset(
        protected_attribute_names=name_prot,
        features_to_drop=['personal_status', 'age'])
    data, _ = dataset_orig.convert_to_dataframe()
    data.rename(columns={'credit': 'labels'}, inplace=True)
    data.to_csv("dataset/German_sex.csv")
def load_preproc_data_german(protected_attributes=None):
    """
    Load and pre-process german credit dataset.
    Args:
        protected_attributes(list or None): If None use all possible protected
            attributes, else subset the protected attributes to the list.
    Returns:
        GermanDataset: An instance of GermanDataset with required pre-processing.
    """

    def custom_preprocessing(df):
        """ Custom pre-processing for German Credit Data """

        def group_credit_hist(x):
            # Collapse raw credit-history codes into three coarse buckets.
            if x in ['A30', 'A31', 'A32']:
                return 'None/Paid'
            elif x == 'A33':
                return 'Delay'
            elif x == 'A34':
                return 'Other'
            else:
                return 'NA'

        def group_employ(x):
            if x == 'A71':
                return 'Unemployed'
            elif x in ['A72', 'A73']:
                return '1-4 years'
            elif x in ['A74', 'A75']:
                return '4+ years'
            else:
                return 'NA'

        def group_savings(x):
            if x in ['A61', 'A62']:
                return '<500'
            elif x in ['A63', 'A64']:
                return '500+'
            elif x == 'A65':
                return 'Unknown/None'
            else:
                return 'NA'

        def group_status(x):
            if x in ['A11', 'A12']:
                return '<200'
            elif x in ['A13']:
                return '200+'
            elif x == 'A14':
                return 'None'
            else:
                return 'NA'

        # Derive a binary 'sex' column from the combined personal_status code
        # (A91/A93/A94 -> male=1, A92/A95 -> female=0).
        status_map = {
            'A91': 1.0,
            'A93': 1.0,
            'A94': 1.0,
            'A92': 0.0,
            'A95': 0.0
        }
        df['sex'] = df['personal_status'].replace(status_map)

        # group credit history, savings, and employment
        df['credit_history'] = df['credit_history'].apply(
            lambda x: group_credit_hist(x))
        df['savings'] = df['savings'].apply(lambda x: group_savings(x))
        df['employment'] = df['employment'].apply(lambda x: group_employ(x))
        # Fixed: np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin float is the documented, equivalent replacement.
        # Binarize age: 1.0 if >= 25 (privileged), else 0.0.
        df['age'] = df['age'].apply(lambda x: float(x >= 25))
        df['status'] = df['status'].apply(lambda x: group_status(x))
        return df

    # Feature partitions
    XD_features = ['credit_history', 'savings', 'employment', 'sex', 'age']
    D_features = ['sex', 'age'] if protected_attributes is None else protected_attributes
    Y_features = ['credit']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = ['credit_history', 'savings', 'employment']

    # privileged classes
    all_privileged_classes = {"sex": [1.0], "age": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {
        "sex": {
            1.0: 'Male',
            0.0: 'Female'
        },
        "age": {
            1.0: 'Old',
            0.0: 'Young'
        }
    }

    return GermanDataset(
        label_name=Y_features[0],
        favorable_classes=[1],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        metadata={
            'label_maps': [{
                1.0: 'Good Credit',
                2.0: 'Bad Credit'
            }],
            'protected_attribute_maps': [all_protected_attribute_maps[x]
                                         for x in D_features]
        },
        custom_preprocessing=custom_preprocessing)
def main():
    """End-to-end aif360 demo on the German Credit dataset: measure bias on
    the raw training data, fit a RandomForest baseline, mitigate bias with
    Reweighing, then re-measure fairness and classification metrics."""
    import sys
    sys.path.insert(1, "../")
    import numpy as np
    np.random.seed(0)
    #pip install numba==0.43.0
    #pip install --ignore-installed llvmlite==0.32.1
    from aif360.datasets import GermanDataset
    from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric as CM
    from aif360.algorithms.preprocessing import Reweighing
    from IPython.display import Markdown, display
    from sklearn.ensemble import RandomForestClassifier as RF
    from sklearn.datasets import make_classification as mc
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    def _report_metrics(cm_obj):
        # Compute and print precision/recall/accuracy for a
        # ClassificationMetric instance.
        # Fixed: metric names were previously dispatched via eval() on a
        # built string; getattr gives the same dynamic lookup without
        # executing arbitrary code.
        metrics = {name: getattr(cm_obj, name)()
                   for name in ["precision", "recall", "accuracy"]}
        print("AIF360 metrics")
        for key in ["recall", "accuracy", "precision"]:
            print("{} score is: {}".format(key, metrics[key]))

    # Step 2: load dataset, specifying the protected attribute, and split
    # into train and test.
    dataset_orig = GermanDataset(
        protected_attribute_names=[
            'age'
        ],  # this dataset also contains protected attribute for "sex"
            # which we do not consider in this evaluation
        privileged_classes=[lambda x: x >= 25
                            ],  # age >=25 is considered privileged
        features_to_drop=['personal_status', 'sex']  # ignore sex-related attributes
    )
    dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    privileged_groups = [{'age': 1}]
    unprivileged_groups = [{'age': 0}]

    # Step 3: compute fairness metric on the original training dataset.
    metric_orig_train = BinaryLabelDatasetMetric(
        dataset_orig_train,  # mean difference
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    display(Markdown("#### Original training dataset"))
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f. AKA the privileged group is getting .17 more positive outcomes in the training dataset."
        % metric_orig_train.mean_difference())

    # Baseline classifier on the unmitigated data.
    clf = RF()
    # .ravel() flattens the (n, 1) label column sklearn would otherwise
    # warn about; fitted model is unchanged.
    clf.fit(dataset_orig_train.features, dataset_orig_train.labels.ravel())
    predictions = clf.predict(dataset_orig_test.features)
    proba_predictions = clf.predict_proba(dataset_orig_test.features)
    # NOTE(review): column 0 of predict_proba is used as the score — confirm
    # it corresponds to the favorable label for this dataset's coding.
    dataset_orig_test_pred.scores = proba_predictions[:, 0].reshape(-1, 1)
    dataset_orig_test_pred.labels = predictions.reshape(-1, 1)

    cm_pred_valid = CM(dataset_orig_test,
                       dataset_orig_test_pred,
                       unprivileged_groups=unprivileged_groups,
                       privileged_groups=privileged_groups)
    _report_metrics(cm_pred_valid)

    # Step 4: mitigate bias by transforming the original dataset
    # (pre-processing mitigation algorithm).
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    dataset_transf_train = RW.fit_transform(dataset_orig_train)

    # Step 5: compute fairness metric on the transformed dataset.
    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_transf_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    display(Markdown("#### Transformed training dataset"))
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f"
        % metric_transf_train.mean_difference())

    # Re-split, re-train, and re-score on the transformed data.
    dataset_transf_train, dataset_transf_test = dataset_transf_train.split(
        [0.7], shuffle=True)
    dataset_transf_test_pred = dataset_transf_test.copy(deepcopy=True)
    clf = RF()
    clf.fit(dataset_transf_train.features, dataset_transf_train.labels.ravel())
    predictions = clf.predict(dataset_transf_test.features)
    proba_predictions = clf.predict_proba(dataset_transf_test.features)
    dataset_transf_test_pred.scores = proba_predictions[:, 0].reshape(-1, 1)
    dataset_transf_test_pred.labels = predictions.reshape(-1, 1)

    cm_pred_valid = CM(dataset_transf_test,
                       dataset_transf_test_pred,
                       unprivileged_groups=unprivileged_groups,
                       privileged_groups=privileged_groups)
    _report_metrics(cm_pred_valid)
dataset_used = "compas"  # "adult", "german", "compas"
protected_attribute_used = 2  # 1, 2

# code to identify the protected attributes from all of the dataset features
if dataset_used == "adult":
    dataset_orig = AdultDataset()
    # dataset_orig = load_preproc_data_adult()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'race': 1}]
        unprivileged_groups = [{'race': 0}]
elif dataset_used == "german":
    dataset_orig = GermanDataset()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'age': 1}]
        unprivileged_groups = [{'age': 0}]
elif dataset_used == "compas":
    # dataset_orig = CompasDataset()
    dataset_orig = load_preproc_data_compas()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'race': 1}]
        # Fixed: unprivileged_groups was never assigned on this branch
        # (the default compas/race configuration), which would raise
        # NameError at first later use; mirror the other branches.
        unprivileged_groups = [{'race': 0}]
def get_data(dataset, protected_attribute, seed=101):
    """Load an aif360 dataset, attach privileged/unprivileged group
    definitions for `protected_attribute`, and return a 60/20/20
    train/validation/test split.

    :param dataset: 'adult', 'german', 'compas' or 'bank'
    :param protected_attribute: attribute name; availability depends on the
        dataset ('sex_or_race' is synthesized for adult and compas)
    :param seed: RNG seed forwarded to both dataset splits
    :return: (train, valid, test, privileged_groups, unprivileged_groups)
    :raises ValueError: for unknown dataset/attribute combinations
    """

    def protected_attribute_error():
        # Shared failure path for unsupported attribute/dataset pairs.
        raise ValueError(
            f'protected attribute {protected_attribute} is not available for dataset {dataset}'
        )

    if dataset == 'adult':
        from aif360.datasets import AdultDataset
        dataset_orig = AdultDataset()
        if protected_attribute == 'sex':
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        elif protected_attribute == 'sex_or_race':
            # Append a synthetic 'sex_or_race' column: the logical OR of two
            # existing feature columns, cast to float and registered as a
            # protected attribute.
            # NOTE(review): assumes columns 2 and 3 are the race/sex features
            # of AdultDataset — confirm against the dataset's feature order.
            dataset_orig.feature_names += ['sex_or_race']
            dataset_orig.features = np.hstack([
                dataset_orig.features,
                np.expand_dims(
                    np.logical_or(*dataset_orig.features[:, [2, 3]].T).astype(
                        np.float64), -1)
            ])
            dataset_orig.protected_attributes = np.hstack([
                dataset_orig.protected_attributes,
                dataset_orig.features[:, [-1]]
            ])
            dataset_orig.protected_attribute_names += ['sex_or_race']
            dataset_orig.privileged_protected_attributes += [np.array([1.])]
            dataset_orig.unprivileged_protected_attributes += [np.array([0.])]
            privileged_groups = [{'sex_or_race': 1}]
            unprivileged_groups = [{'sex_or_race': 0}]
        elif protected_attribute == 'race':
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
        else:
            protected_attribute_error()
    elif dataset == 'german':
        from aif360.datasets import GermanDataset
        dataset_orig = GermanDataset()
        if protected_attribute == 'sex':
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        elif protected_attribute == 'age':
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            protected_attribute_error()
    elif dataset == 'compas':
        from aif360.datasets import CompasDataset
        dataset_orig = CompasDataset()
        if protected_attribute == 'sex':
            # NOTE(review): COMPAS treats sex == 0 as privileged here
            # (reversed relative to the other datasets) — presumably the
            # dataset's coding; confirm.
            privileged_groups = [{'sex': 0}]
            unprivileged_groups = [{'sex': 1}]
        elif protected_attribute == 'sex_or_race':
            # Same synthetic-column construction as for adult.
            # NOTE(review): assumes columns 0 and 2 are the sex/race features
            # of CompasDataset — confirm against the dataset's feature order.
            dataset_orig.feature_names += ['sex_or_race']
            dataset_orig.features = np.hstack([
                dataset_orig.features,
                np.expand_dims(
                    np.logical_or(*dataset_orig.features[:, [0, 2]].T).astype(
                        np.float64), -1)
            ])
            dataset_orig.protected_attributes = np.hstack([
                dataset_orig.protected_attributes,
                dataset_orig.features[:, [-1]]
            ])
            dataset_orig.protected_attribute_names += ['sex_or_race']
            dataset_orig.privileged_protected_attributes += [np.array([1.])]
            dataset_orig.unprivileged_protected_attributes += [np.array([0.])]
            privileged_groups = [{'sex_or_race': 1}]
            unprivileged_groups = [{'sex_or_race': 0}]
        elif protected_attribute == 'race':
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]
        else:
            protected_attribute_error()
    elif dataset == 'bank':
        from aif360.datasets import BankDataset
        dataset_orig = BankDataset()
        if protected_attribute == 'age':
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            protected_attribute_error()
    else:
        raise ValueError(f'{dataset} is not an available dataset.')

    # 60/20/20 split: 60% train, then the remainder halved into valid/test.
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6],
                                                             shuffle=True,
                                                             seed=seed)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5],
                                                                  shuffle=True,
                                                                  seed=seed)

    return dataset_orig_train, dataset_orig_valid, dataset_orig_test, privileged_groups, unprivileged_groups