def test_adult_no_drop():
    ad = AdultDataset(protected_attribute_names=['sex'],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num'])
    bldm = BinaryLabelDatasetMetric(ad)
    assert bldm.num_instances() == 48842
def test_adult():
    ad = AdultDataset()
    # print(ad.feature_names)
    assert np.isclose(ad.labels.mean(), 0.2478, atol=5e-5)

    bldm = BinaryLabelDatasetMetric(ad)
    assert bldm.num_instances() == 45222
Exemple #3
0
def test_repair0():
    ad = AdultDataset(protected_attribute_names=['sex'],
        privileged_classes=[['Male']], categorical_features=[],
        features_to_keep=['age', 'education-num'])

    di = DisparateImpactRemover(repair_level=0.)
    ad_repd = di.fit_transform(ad)

    assert ad_repd == ad
def test_instance_weights():
    ad = AdultDataset(instance_weights_name='fnlwgt', features_to_drop=[])
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    rw = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    transf = rw.fit_transform(ad)
    print(transf.instance_weights.sum())
    assert np.isclose(ad.instance_weights.sum(), transf.instance_weights.sum())
Exemple #5
0
def test_adult():
    np.random.seed(1)
    # np.random.seed(9876)

    protected = 'sex'
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=[
                          'age', 'education-num', 'capital-gain',
                          'capital-loss', 'hours-per-week'
                      ])

    #scaler = MinMaxScaler(copy=False)
    # ad.features = scaler.fit_transform(ad.features)

    train, test = ad.split([32561])

    biased_model = MetaFairClassifier(tau=0, sensitive_attr=protected)
    biased_model.fit(train)

    dataset_bias_test = biased_model.predict(test)

    biased_cm = ClassificationMetric(test,
                                     dataset_bias_test,
                                     unprivileged_groups=[{
                                         protected: 0
                                     }],
                                     privileged_groups=[{
                                         protected: 1
                                     }])
    unconstrainedFDR2 = biased_cm.false_discovery_rate_ratio()
    unconstrainedFDR2 = min(unconstrainedFDR2, 1 / unconstrainedFDR2)

    predictions = [
        1 if y == train.favorable_label else -1
        for y in dataset_bias_test.labels.ravel()
    ]
    y_test = np.array(
        [1 if y == train.favorable_label else -1 for y in test.labels.ravel()])
    x_control_test = pd.DataFrame(data=test.features,
                                  columns=test.feature_names)[protected]

    acc, sr, unconstrainedFDR = getStats(y_test, predictions, x_control_test)
    assert np.isclose(unconstrainedFDR, unconstrainedFDR2)

    tau = 0.9
    debiased_model = MetaFairClassifier(tau=tau, sensitive_attr=protected)
    debiased_model.fit(train)

    #dataset_debiasing_train = debiased_model.predict(dataset_orig_train)
    dataset_debiasing_test = debiased_model.predict(test)

    predictions = list(dataset_debiasing_test.labels)
    predictions = [
        1 if y == train.favorable_label else -1
        for y in dataset_debiasing_test.labels.ravel()
    ]
    y_test = np.array(
        [1 if y == train.favorable_label else -1 for y in test.labels.ravel()])
    x_control_test = pd.DataFrame(data=test.features,
                                  columns=test.feature_names)[protected]

    acc, sr, fdr = getStats(y_test, predictions, x_control_test)

    debiased_cm = ClassificationMetric(test,
                                       dataset_debiasing_test,
                                       unprivileged_groups=[{
                                           protected: 0
                                       }],
                                       privileged_groups=[{
                                           protected: 1
                                       }])
    fdr2 = debiased_cm.false_discovery_rate_ratio()
    fdr2 = min(fdr2, 1 / fdr2)
    assert np.isclose(fdr, fdr2)
    #print(fdr, unconstrainedFDR)
    assert (fdr2 >= unconstrainedFDR2)
Exemple #6
0
def test_adult():
    protected = 'sex'
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']], categorical_features=[],
                      features_to_keep=['age', 'education-num', 'capital-gain',
                                        'capital-loss', 'hours-per-week'])

    scaler = MinMaxScaler(copy=False)
    # ad.features = scaler.fit_transform(ad.features)

    train, test = ad.split([32561])
    assert np.any(test.labels)

    train.features = scaler.fit_transform(train.features)
    test.features = scaler.transform(test.features)

    index = train.feature_names.index(protected)
    X_tr = np.delete(train.features, index, axis=1)
    X_te = np.delete(test.features, index, axis=1)
    y_tr = train.labels.ravel()

    di = DisparateImpactRemover(repair_level=1.0)
    train_repd = di.fit_transform(train)
    # train_repd2 = di.fit_transform(train)
    # assert train_repd == train_repd2
    test_repd = di.fit_transform(test)

    assert np.all(train_repd.protected_attributes ==
                  train.protected_attributes)

    lmod = LogisticRegression(class_weight='balanced', solver='lbfgs')
    # lmod = SVM(class_weight='balanced')
    lmod.fit(X_tr, y_tr)

    test_pred = test.copy()
    test_pred.labels = lmod.predict(X_te)

    X_tr_repd = np.delete(train_repd.features, index, axis=1)
    X_te_repd = np.delete(test_repd.features, index, axis=1)
    y_tr_repd = train_repd.labels.ravel()
    assert (y_tr == y_tr_repd).all()

    lmod.fit(X_tr_repd, y_tr_repd)
    test_repd_pred = test_repd.copy()
    test_repd_pred.labels = lmod.predict(X_te_repd)

    p = [{protected: 1}]
    u = [{protected: 0}]

    cm = ClassificationMetric(test, test_pred, privileged_groups=p,
                              unprivileged_groups=u)
    before = cm.disparate_impact()
    # print('Disparate impact: {:.4}'.format(before))
    # print('Acc overall: {:.4}'.format(cm.accuracy()))

    repaired_cm = ClassificationMetric(test_repd, test_repd_pred,
                                       privileged_groups=p,
                                       unprivileged_groups=u)
    after = repaired_cm.disparate_impact()
    # print('Disparate impact: {:.4}'.format(after))
    # print('Acc overall: {:.4}'.format(repaired_cm.accuracy()))

    assert after > before
    assert abs(1 - after) <= 0.2
Exemple #7
0
def load_preproc_data_adult(protected_attributes=None):
    def custom_preprocessing(df):
        """The custom pre-processing function is adapted from
            https://github.com/fair-preprocessing/nips2017/blob/master/Adult/code/Generate_Adult_Data.ipynb
        """

        # Group age by decade
        df['Age (decade)'] = df['age'].apply(lambda x: x // 10 * 10)

        # df['Age (decade)'] = df['age'].apply(lambda x: np.floor(x/10.0)*10.0)

        def group_edu(x):
            if x <= 5:
                return '<6'
            elif x >= 13:
                return '>12'
            else:
                return x

        def age_cut(x):
            if x >= 70:
                return '>=70'
            else:
                return x

        def group_race(x):
            if x == "White":
                return 1.0
            else:
                return 0.0

        # Cluster education and age attributes.
        # Limit education range
        df['Education Years'] = df['education-num'].apply(
            lambda x: group_edu(x))
        df['Education Years'] = df['Education Years'].astype('category')

        # Limit age range
        df['Age (decade)'] = df['Age (decade)'].apply(lambda x: age_cut(x))

        # Rename income variable
        df['Income Binary'] = df['income-per-year']

        # Recode sex and race
        df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
        df['race'] = df['race'].apply(lambda x: group_race(x))

        return df

    XD_features = ['Age (decade)', 'Education Years', 'sex', 'race']
    D_features = ['sex', 'race'
                  ] if protected_attributes is None else protected_attributes
    Y_features = ['Income Binary']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = ['Age (decade)', 'Education Years']

    # privileged classes
    all_privileged_classes = {"sex": [1.0], "race": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {
        "sex": {
            1.0: 'Male',
            0.0: 'Female'
        },
        "race": {
            1.0: 'White',
            0.0: 'Non-white'
        }
    }

    return AdultDataset(
        label_name=Y_features[0],
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=['?'],
        metadata={
            'label_maps': [{
                1.0: '>50K',
                0.0: '<=50K'
            }],
            'protected_attribute_maps':
            [all_protected_attribute_maps[x] for x in D_features]
        },
        custom_preprocessing=custom_preprocessing)
def test_adult_test_set():
    ad = AdultDataset()
    # train, test = ad.split([32561])
    train, test = ad.split([30162])
    assert np.any(test.labels)