def _preprocess_data(data, protected_attribute_name, protected_attribute_index, label_name, required_fairness):
    """Train a debiased pipeline on *data* and fail if it is too unfair.

    Builds an aif360 ``BinaryLabelDataset`` from *data*, fits an adversarial
    debiaser plus a RandomForest (with the protected column dropped from the
    features), then scores accuracy and disparate impact on a held-out split.

    NOTE(review): relies on module-level ``tf`` (TensorFlow) and ``np``
    (NumPy) being imported elsewhere in this file — confirm.

    :param data: tabular data convertible via ``pandas.DataFrame(data)``.
    :param protected_attribute_name: column holding the protected attribute
        (coded 0 = unprivileged, 1 = privileged).
    :param protected_attribute_index: positional index of that column in the
        feature matrix, used to drop it before model training.
    :param label_name: name of the label column.
    :param required_fairness: disparate-impact threshold (string or float).
    :raises ValueError: when disparate impact exceeds *required_fairness*.
    """
    from pandas import DataFrame
    from aif360.datasets import BinaryLabelDataset

    # aif360 expects *lists* of names here, not sets.
    # NOTE(review): favorable_label=2 / unfavorable_label=1 assumes the label
    # column is coded {1, 2} — confirm against the incoming data.
    dataset = BinaryLabelDataset(
        df=DataFrame(data),
        protected_attribute_names=[protected_attribute_name],
        label_names=[label_name],
        favorable_label=2,
        unfavorable_label=1,
    )
    train, test = dataset.split([0.8])

    from aif360.algorithms.inprocessing import AdversarialDebiasing

    # aif360 documents these group arguments as list-of-dict.
    unprivileged_groups = [{protected_attribute_name: 0}]
    privileged_groups = [{protected_attribute_name: 1}]

    sess = tf.compat.v1.Session()
    try:
        debiaser = AdversarialDebiasing(
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups,
            scope_name="debiaser",
            debias=True,
            sess=sess,
        )
        debiaser.fit(train)

        from sklearn.ensemble import RandomForestClassifier

        model = RandomForestClassifier(class_weight="balanced")
        # Drop the protected column so the downstream model never sees it.
        X_tr = np.delete(train.features, protected_attribute_index, axis=1)
        y_tr = train.labels.ravel()
        model.fit(X_tr, y_tr)

        test_pred = test.copy(deepcopy=True)
        # Debias the test set first, then predict on the debiased features
        # (again with the protected column removed).
        test_pred.scores = model.predict(
            np.delete(debiaser.predict(test).features, protected_attribute_index, axis=1)
        )

        # Fraction of correct predictions against the true labels.
        # (The original summed raw matches against test.scores, yielding a
        # count rather than an accuracy.)
        accuracy = float(np.mean(np.equal(test.labels.ravel(), test_pred.scores.ravel())))

        from aif360.metrics import ClassificationMetric

        disparate_impact = ClassificationMetric(
            test,
            test_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups,
        ).disparate_impact()
    finally:
        # TF v1 sessions hold native resources; always release them.
        sess.close()

    print(f"Accuracy: {accuracy}")
    print(f"Disparate impact: {disparate_impact}")
    # NOTE(review): this treats a *larger* disparate impact as unfair, per the
    # original logic and message — confirm that direction is intended.
    if disparate_impact > float(required_fairness):
        raise ValueError(
            f"Too unfair! Disparate impact was {disparate_impact} but must be less than {required_fairness}"
        )
def create_data():
    """Reweigh a training CSV for fairness and return weighted splits.

    Reads ``tmp/housing_train_proc.csv``, applies aif360 ``Reweighing`` over
    the protected groups, optionally splits into train/validation/test, and
    returns a dict mapping output file names to weighted DataFrames (each
    with an added ``weights`` column).

    NOTE(review): relies on module-level ``os``, ``np``, ``dt`` (presumably
    datatable), and ``config`` (H2O Driverless AI recipe globals) — confirm.
    """
    import pandas as pd
    from h2oaicore.models_utils import import_tensorflow
    tf = import_tensorflow()
    # above is because aif360 requires tensorflow
    from aif360.datasets import BinaryLabelDataset
    from aif360.algorithms.preprocessing.reweighing import Reweighing

    """ Update the below as needed """
    #########
    #########
    #########
    # Path to the data
    folder_path = 'tmp/'

    # Data file
    data_file = 'housing_train_proc.csv'
    full_data_file = folder_path + data_file

    if not os.path.isfile(full_data_file):
        # for testing, just return something
        if config.hard_asserts:
            return dt.Frame(np.array([[1, 2, 3], [4, 5, 6]]))
        else:
            return []

    train = pd.read_csv(full_data_file)

    # External test files to reweigh with the full-data weights
    validation_test_files = ['housing_test_proc.csv']

    # Fractions for train/validation(/test) splits of the training data
    validation_split = [0.6, 0.8]

    # Target column
    target = 'high_priced'
    favorable_label = 0
    unfavorable_label = 1

    # protected_group_info = [[protected group name 1, privileged level, unprivileged level],
    #                         [protected group name 2, privileged level, unprivileged level]]
    # The protected group columns need to be binary
    protected_group_info = [['hispanic', 0, 1], ['black', 0, 1]]
    #########
    #########
    #########

    # Set up protected group info
    protected_groups = [group_info[0] for group_info in protected_group_info]

    dataset_orig = BinaryLabelDataset(df=train, label_names=[target],
                                      favorable_label=favorable_label,
                                      unfavorable_label=unfavorable_label,
                                      protected_attribute_names=protected_groups)

    # Build the aif360 group specs: one {column: level} dict per protected group
    privileged_groups = []
    unprivileged_groups = []
    for protected_group in protected_group_info:
        privileged_groups_dict = {}
        unprivileged_groups_dict = {}
        privileged_groups_dict[protected_group[0]] = protected_group[1]
        unprivileged_groups_dict[protected_group[0]] = protected_group[2]
        privileged_groups.append(privileged_groups_dict)
        unprivileged_groups.append(unprivileged_groups_dict)

    # Fit weights on the full dataset to be used on the external test set, if given
    RW_full = Reweighing(unprivileged_groups=unprivileged_groups,
                         privileged_groups=privileged_groups)
    RW_full.fit(dataset_orig)

    # Split the original data into train, validation, and test if applicable
    if len(validation_split) == 1:
        dataset_orig_train, dataset_orig_valid = dataset_orig.split(validation_split, shuffle=True)
    elif len(validation_split) == 2:
        dataset_orig_train_valid, dataset_orig_test = dataset_orig.split([validation_split[1]], shuffle=True)
        # Fit the weights on both the validation and test set for the test set split
        RW_train_valid = Reweighing(unprivileged_groups=unprivileged_groups,
                                    privileged_groups=privileged_groups)
        RW_train_valid.fit(dataset_orig_train_valid)
        # Renormalize the first fraction relative to the train+valid subset
        dataset_orig_train, dataset_orig_valid = dataset_orig_train_valid.split(
            [validation_split[0] / (validation_split[1])], shuffle=True)
    else:
        dataset_orig_train = dataset_orig

    # Fit weights on the training set only
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(dataset_orig_train)
    dataset_transf_train = RW.transform(dataset_orig_train)

    # Add the weights to the training set
    train_df = pd.DataFrame(dataset_transf_train.features, columns=dataset_transf_train.feature_names)
    train_df[target] = dataset_transf_train.labels.ravel()
    train_df['weights'] = dataset_transf_train.instance_weights.ravel()

    # Collect every output frame keyed by its destination file name
    dataset_dict = {}
    dataset_dict[data_file.split('.')[0] + "_rw_train.csv"] = train_df

    # Add weights to the validation split (if a validation split was specified)
    if len(validation_split) >= 1:
        dataset_transf_valid = RW.transform(dataset_orig_valid)
        valid_df = pd.DataFrame(dataset_transf_valid.features, columns=dataset_transf_valid.feature_names)
        valid_df[target] = dataset_transf_valid.labels.ravel()
        valid_df['weights'] = dataset_transf_valid.instance_weights.ravel()
        dataset_dict[data_file.split('.')[0] + "_rw_validation.csv"] = valid_df

    # Add weights to the test split (if a test split was specified)
    if len(validation_split) >= 2:
        dataset_transf_test = RW_train_valid.transform(dataset_orig_test)
        test_df = pd.DataFrame(dataset_transf_test.features, columns=dataset_transf_test.feature_names)
        test_df[target] = dataset_transf_test.labels.ravel()
        test_df['weights'] = dataset_transf_test.instance_weights.ravel()
        dataset_dict[data_file.split('.')[0] + "_rw_test.csv"] = test_df

    # Add weights to the test files (if provided), using the full-data weights
    for valid_file in validation_test_files:
        valid = pd.read_csv(folder_path + valid_file)
        dataset_valid_orig = BinaryLabelDataset(df=valid, label_names=[target],
                                                favorable_label=favorable_label,
                                                unfavorable_label=unfavorable_label,
                                                protected_attribute_names=protected_groups)
        dataset_transf_valid = RW_full.transform(dataset_valid_orig)
        valid_df = pd.DataFrame(dataset_transf_valid.features, columns=dataset_transf_valid.feature_names)
        valid_df[target] = dataset_transf_valid.labels.ravel()
        valid_df['weights'] = dataset_transf_valid.instance_weights.ravel()
        dataset_dict[valid_file.split('.')[0] + "_rw_transformed.csv"] = valid_df

    return dataset_dict