def test_adult_no_drop():
    ad = AdultDataset(protected_attribute_names=['sex'],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num'])
    bldm = BinaryLabelDatasetMetric(ad)
    assert bldm.num_instances() == 48842
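# A minimal companion sketch (an assumption, not one of the original tests):
# the same dataset can also be scored group-wise by passing group definitions
# to BinaryLabelDatasetMetric. On the raw Adult labels the disparate impact
# for 'sex' is well below 1.
def test_adult_no_drop_group_metrics():
    ad = AdultDataset(protected_attribute_names=['sex'],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num'])
    bldm = BinaryLabelDatasetMetric(ad,
                                    unprivileged_groups=[{'sex': 0}],
                                    privileged_groups=[{'sex': 1}])
    assert 0.0 < bldm.disparate_impact() < 1.0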
def test_adult():
    ad = AdultDataset()
    # print(ad.feature_names)
    assert np.isclose(ad.labels.mean(), 0.2478, atol=5e-5)
    bldm = BinaryLabelDatasetMetric(ad)
    assert bldm.num_instances() == 45222
def test_repair0():
    ad = AdultDataset(protected_attribute_names=['sex'],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num'])
    di = DisparateImpactRemover(repair_level=0.)
    ad_repd = di.fit_transform(ad)
    assert ad_repd == ad
def test_instance_weights():
    ad = AdultDataset(instance_weights_name='fnlwgt', features_to_drop=[])
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    rw = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    transf = rw.fit_transform(ad)
    print(transf.instance_weights.sum())
    assert np.isclose(ad.instance_weights.sum(), transf.instance_weights.sum())
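# Hedged follow-up sketch (not part of the original file): Reweighing is
# expected to equalise the *weighted* favorable-outcome rates of the two
# groups, so the weighted mean difference on the transformed dataset should
# be numerically zero. The test name and tolerance are assumptions.
def test_reweighing_balances_groups():
    from aif360.metrics import BinaryLabelDatasetMetric  # may already be imported at module level
    ad = AdultDataset(instance_weights_name='fnlwgt', features_to_drop=[])
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    rw = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    transf = rw.fit_transform(ad)
    bldm = BinaryLabelDatasetMetric(transf,
                                    unprivileged_groups=unprivileged_groups,
                                    privileged_groups=privileged_groups)
    assert np.isclose(bldm.mean_difference(), 0.0, atol=1e-6)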
def test_adult():
    np.random.seed(1)
    # np.random.seed(9876)
    protected = 'sex'
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num', 'capital-gain',
                                        'capital-loss', 'hours-per-week'])
    # scaler = MinMaxScaler(copy=False)
    # ad.features = scaler.fit_transform(ad.features)

    train, test = ad.split([32561])

    # Unconstrained baseline: tau=0 imposes no fairness constraint.
    biased_model = MetaFairClassifier(tau=0, sensitive_attr=protected)
    biased_model.fit(train)

    dataset_bias_test = biased_model.predict(test)

    biased_cm = ClassificationMetric(test, dataset_bias_test,
                                     unprivileged_groups=[{protected: 0}],
                                     privileged_groups=[{protected: 1}])
    unconstrainedFDR2 = biased_cm.false_discovery_rate_ratio()
    unconstrainedFDR2 = min(unconstrainedFDR2, 1 / unconstrainedFDR2)

    # Cross-check the metric against getStats from the reference implementation.
    predictions = [1 if y == train.favorable_label else -1
                   for y in dataset_bias_test.labels.ravel()]
    y_test = np.array([1 if y == train.favorable_label else -1
                       for y in test.labels.ravel()])
    x_control_test = pd.DataFrame(data=test.features,
                                  columns=test.feature_names)[protected]

    acc, sr, unconstrainedFDR = getStats(y_test, predictions, x_control_test)
    assert np.isclose(unconstrainedFDR, unconstrainedFDR2)

    # Debiased model: constrain the false discovery rate ratio with tau=0.9.
    tau = 0.9
    debiased_model = MetaFairClassifier(tau=tau, sensitive_attr=protected)
    debiased_model.fit(train)

    # dataset_debiasing_train = debiased_model.predict(dataset_orig_train)
    dataset_debiasing_test = debiased_model.predict(test)

    predictions = [1 if y == train.favorable_label else -1
                   for y in dataset_debiasing_test.labels.ravel()]
    y_test = np.array([1 if y == train.favorable_label else -1
                       for y in test.labels.ravel()])
    x_control_test = pd.DataFrame(data=test.features,
                                  columns=test.feature_names)[protected]

    acc, sr, fdr = getStats(y_test, predictions, x_control_test)

    debiased_cm = ClassificationMetric(test, dataset_debiasing_test,
                                       unprivileged_groups=[{protected: 0}],
                                       privileged_groups=[{protected: 1}])
    fdr2 = debiased_cm.false_discovery_rate_ratio()
    fdr2 = min(fdr2, 1 / fdr2)
    assert np.isclose(fdr, fdr2)
    # print(fdr, unconstrainedFDR)
    # Debiasing should not make the FDR ratio worse than the baseline.
    assert fdr2 >= unconstrainedFDR2
def test_adult():
    protected = 'sex'
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num', 'capital-gain',
                                        'capital-loss', 'hours-per-week'])

    scaler = MinMaxScaler(copy=False)
    # ad.features = scaler.fit_transform(ad.features)

    train, test = ad.split([32561])
    assert np.any(test.labels)

    train.features = scaler.fit_transform(train.features)
    test.features = scaler.transform(test.features)

    index = train.feature_names.index(protected)
    X_tr = np.delete(train.features, index, axis=1)
    X_te = np.delete(test.features, index, axis=1)
    y_tr = train.labels.ravel()

    # Repair the features at full strength; labels and protected attributes
    # must be left untouched by the repair.
    di = DisparateImpactRemover(repair_level=1.0)
    train_repd = di.fit_transform(train)
    # train_repd2 = di.fit_transform(train)
    # assert train_repd == train_repd2
    test_repd = di.fit_transform(test)
    assert np.all(train_repd.protected_attributes == train.protected_attributes)

    # Baseline classifier on the unrepaired features.
    lmod = LogisticRegression(class_weight='balanced', solver='lbfgs')
    # lmod = SVM(class_weight='balanced')
    lmod.fit(X_tr, y_tr)

    test_pred = test.copy()
    test_pred.labels = lmod.predict(X_te)

    # Same classifier retrained on the repaired features.
    X_tr_repd = np.delete(train_repd.features, index, axis=1)
    X_te_repd = np.delete(test_repd.features, index, axis=1)
    y_tr_repd = train_repd.labels.ravel()
    assert (y_tr == y_tr_repd).all()

    lmod.fit(X_tr_repd, y_tr_repd)
    test_repd_pred = test_repd.copy()
    test_repd_pred.labels = lmod.predict(X_te_repd)

    p = [{protected: 1}]
    u = [{protected: 0}]

    cm = ClassificationMetric(test, test_pred,
                              privileged_groups=p, unprivileged_groups=u)
    before = cm.disparate_impact()
    # print('Disparate impact: {:.4}'.format(before))
    # print('Acc overall: {:.4}'.format(cm.accuracy()))

    repaired_cm = ClassificationMetric(test_repd, test_repd_pred,
                                       privileged_groups=p, unprivileged_groups=u)
    after = repaired_cm.disparate_impact()
    # print('Disparate impact: {:.4}'.format(after))
    # print('Acc overall: {:.4}'.format(repaired_cm.accuracy()))

    # Repair should move disparate impact toward 1.
    assert after > before
    assert abs(1 - after) <= 0.2
def load_preproc_data_adult(protected_attributes=None):
    def custom_preprocessing(df):
        """The custom pre-processing function is adapted from
        https://github.com/fair-preprocessing/nips2017/blob/master/Adult/code/Generate_Adult_Data.ipynb
        """

        # Group age by decade
        df['Age (decade)'] = df['age'].apply(lambda x: x // 10 * 10)
        # df['Age (decade)'] = df['age'].apply(lambda x: np.floor(x/10.0)*10.0)

        def group_edu(x):
            if x <= 5:
                return '<6'
            elif x >= 13:
                return '>12'
            else:
                return x

        def age_cut(x):
            if x >= 70:
                return '>=70'
            else:
                return x

        def group_race(x):
            if x == "White":
                return 1.0
            else:
                return 0.0

        # Cluster education and age attributes.
        # Limit education range
        df['Education Years'] = df['education-num'].apply(lambda x: group_edu(x))
        df['Education Years'] = df['Education Years'].astype('category')

        # Limit age range
        df['Age (decade)'] = df['Age (decade)'].apply(lambda x: age_cut(x))

        # Rename income variable
        df['Income Binary'] = df['income-per-year']

        # Recode sex and race
        df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
        df['race'] = df['race'].apply(lambda x: group_race(x))

        return df

    XD_features = ['Age (decade)', 'Education Years', 'sex', 'race']
    D_features = ['sex', 'race'] if protected_attributes is None else protected_attributes
    Y_features = ['Income Binary']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = ['Age (decade)', 'Education Years']

    # privileged classes
    all_privileged_classes = {"sex": [1.0], "race": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {
        "sex": {1.0: 'Male', 0.0: 'Female'},
        "race": {1.0: 'White', 0.0: 'Non-white'}
    }

    return AdultDataset(
        label_name=Y_features[0],
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=['?'],
        metadata={'label_maps': [{1.0: '>50K', 0.0: '<=50K'}],
                  'protected_attribute_maps': [all_protected_attribute_maps[x]
                                               for x in D_features]},
        custom_preprocessing=custom_preprocessing)
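# Illustrative usage sketch (an assumption, not part of the original module):
# the loader can be combined with BinaryLabelDatasetMetric to inspect the
# base-rate gap between the sexes in the pre-processed data.
def example_preproc_adult_metrics():
    from aif360.metrics import BinaryLabelDatasetMetric
    dataset = load_preproc_data_adult(protected_attributes=['sex'])
    metric = BinaryLabelDatasetMetric(dataset,
                                      unprivileged_groups=[{'sex': 0}],
                                      privileged_groups=[{'sex': 1}])
    print('Disparate impact:', metric.disparate_impact())
    print('Mean difference:', metric.mean_difference())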
def test_adult_test_set():
    ad = AdultDataset()
    # train, test = ad.split([32561])
    train, test = ad.split([30162])
    assert np.any(test.labels)