Example #1
import numpy as np
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.datasets import BinaryLabelDataset
from sklearn.ensemble import RandomForestClassifier


class Fair_DI_RF:

    def __init__(self, sensitive, repair_level, n_est=100, min_sam_leaf=25):
        # Clamp the repair level to the valid [0, 1] range.
        if repair_level < 0:
            repair_level = 0
        elif repair_level > 1:
            repair_level = 1
        self.model_reweight = DisparateImpactRemover(
            sensitive_attribute=sensitive, repair_level=repair_level)
        self.model = RandomForestClassifier(
            n_estimators=n_est, min_samples_leaf=min_sam_leaf)

    def fit(self, data, labels, prot):
        # Wrap the pandas DataFrame as an aif360 BinaryLabelDataset.
        ds = BinaryLabelDataset(df=data, label_names=labels,
                                protected_attribute_names=prot)
        self.prot = prot
        # Repair the features, then drop the protected attribute column
        # before training the random forest.
        x = self.model_reweight.fit_transform(ds)
        index = x.feature_names.index(prot[0])
        x_train = np.delete(x.features, index, 1)
        y_train = x.labels.ravel()
        self.model.fit(x_train, y_train)

    def predict_proba(self, data_test):
        # data_test is expected to already be a BinaryLabelDataset.
        x = self.model_reweight.fit_transform(data_test)
        index = x.feature_names.index(self.prot[0])
        x_test = np.delete(x.features, index, 1)
        y = self.model.predict_proba(x_test)[:, 1]
        return y
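A minimal usage sketch for this wrapper is shown below; the toy DataFrame and the column names ('age', 'sex', 'income') are assumptions for illustration only.

import pandas as pd

# Toy data, assumed for illustration: one numeric feature, binary protected
# attribute 'sex', binary label 'income'.
df_train = pd.DataFrame({'age': [25, 40, 33, 51, 29, 46],
                         'sex': [0, 1, 0, 1, 1, 0],
                         'income': [0, 1, 0, 1, 1, 0]})

clf = Fair_DI_RF(sensitive='sex', repair_level=1.0)
clf.fit(df_train, labels=['income'], prot=['sex'])

# predict_proba expects an aif360 BinaryLabelDataset built with the same schema.
ds_test = BinaryLabelDataset(df=df_train, label_names=['income'],
                             protected_attribute_names=['sex'])
scores = clf.predict_proba(ds_test)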
Example #2
import numpy as np
import torch
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.datasets import BinaryLabelDataset


# FairClass is the source project's own torch classifier; it is used here but
# not defined in this example.
class Fair_DI_NN:
    def __init__(self, sensitive, repair_level, inp_size, num_layers_y,
                 step_y):

        if repair_level < 0:
            repair_level = 0
        elif repair_level > 1:
            repair_level = 1

        self.model_reweight = DisparateImpactRemover(
            sensitive_attribute=sensitive, repair_level=repair_level)
        self.model = FairClass(inp_size, num_layers_y, step_y)

    def fit(self, data, labels, prot):
        ds = BinaryLabelDataset(df=data,
                                label_names=labels,
                                protected_attribute_names=prot)
        self.prot = prot
        x = self.model_reweight.fit_transform(ds)
        index = x.feature_names.index(prot[0])
        x_train = np.delete(x.features, index, 1)
        y_train = x.labels
        x_train = torch.tensor(x_train).type('torch.FloatTensor')
        y_train = torch.tensor(y_train).type('torch.FloatTensor')
        self.model.fit(x_train, y_train)

    def predict_proba(self, data_test):
        x = self.model_reweight.fit_transform(data_test)
        index = x.feature_names.index(self.prot[0])
        x_test = np.delete(x.features, index, 1)
        x_test = torch.tensor(x_test).type('torch.FloatTensor')
        y = self.model.predict_proba(x_test)
        return y
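The call pattern mirrors Example #1; a brief sketch follows. FairClass is the source project's own torch classifier, so df_train, the column names, and the constructor hyperparameters below are assumptions for illustration.

# Hypothetical usage; df_train, the column names, and the FairClass
# hyperparameters are illustrative assumptions.
net = Fair_DI_NN(sensitive='sex', repair_level=1.0,
                 inp_size=1, num_layers_y=2, step_y=0.01)
net.fit(df_train, labels=['income'], prot=['sex'])

# As in Example #1, predict_proba expects an aif360 BinaryLabelDataset.
ds_test = BinaryLabelDataset(df=df_test, label_names=['income'],
                             protected_attribute_names=['sex'])
probs = net.predict_proba(ds_test)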
Example #3
def test_repair0():
    ad = AdultDataset(protected_attribute_names=['sex'],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num'])

    di = DisparateImpactRemover(repair_level=0.)
    ad_repd = di.fit_transform(ad)

    assert ad_repd == ad
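A complementary sanity check (not part of the original test) might assert that a full repair actually changes the feature values while leaving labels untouched; a sketch under the same AdultDataset setup:

import numpy as np
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.datasets import AdultDataset


def test_repair1_changes_features():
    # Hypothetical companion test: repair_level=1 should alter at least some features.
    ad = AdultDataset(protected_attribute_names=['sex'],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=['age', 'education-num'])

    di = DisparateImpactRemover(repair_level=1.)
    ad_repd = di.fit_transform(ad)

    assert not np.allclose(ad_repd.features, ad.features)
    # The repairer only modifies features; labels stay as they were.
    assert np.array_equal(ad_repd.labels, ad.labels)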
Example #4
def disparate_impact_remover(structured_data):
    """
    Perform disparate impact removal from dataset and convert to pandas dataframe.
    
    Parameters:
    aif_standard_data (aif360.datasets.standard_dataset.StandardDataset): Structured dataset.
    
    Returns:
    data_transf_df (pandas dataframe): Pandas dataframe.
    """

    DIR = DisparateImpactRemover()
    data_transf = DIR.fit_transform(structured_data)
    data_transf_df = convert_to_pd_dataframe(data_transf)

    return data_transf_df
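The helper convert_to_pd_dataframe is not shown in this example; a minimal sketch of what it might look like, assuming it simply unwraps the aif360 structured dataset:

def convert_to_pd_dataframe(structured_data):
    # Hypothetical helper: aif360 datasets provide convert_to_dataframe(),
    # which returns (DataFrame, metadata dict); keep only the DataFrame.
    df, _ = structured_data.convert_to_dataframe()
    return df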
Example #5
def train_svm_dir(training_data, C, gamma, keep_features, sensitive_attribute,
                  max_iter, svm_seed):
    """
    Train the SVM classifier with Disparate Impact Remover preprocessing on specified data set,
    with provided parameters, and calculate fitness scores.

    :param training_data: The training data set to run the classifier on
    :param C: The C parameter for SVC
    :param gamma: The gamma parameter for SVC
    :param keep_features: The features to keep for SVC
    :param sensitive_attribute: The sensitive attribute in the dataset
    :param max_iter: Max iterations for SVM
    :param svm_seed: Seed used for RNG in SVM
    :return: The trained classifier and the scaler
    """
    dataset_orig_train = training_data

    # Run Disparate Impact Remover
    di = DisparateImpactRemover(repair_level=0.8,
                                sensitive_attribute=sensitive_attribute)
    dataset_transf_train = di.fit_transform(dataset_orig_train)

    # Prepare data
    scale = StandardScaler()
    X_train = scale.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()
    w_train = dataset_transf_train.instance_weights
    if len(keep_features) > 0:  # If keep_features empty, use all features
        X_train = X_train[:, keep_features]

    # Train
    clf = SVC(C=C,
              gamma=gamma,
              kernel='rbf',
              probability=True,
              max_iter=max_iter,
              random_state=svm_seed)
    clf.fit(X_train, y_train, sample_weight=w_train)

    return clf, scale
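A usage sketch, reusing the AdultDataset configuration from the tests in this collection; the dataset choice and hyperparameters are illustrative only.

from aif360.datasets import AdultDataset

# Hypothetical call; any aif360 dataset with a 'sex' protected attribute works here.
dataset_train = AdultDataset(protected_attribute_names=['sex'],
                             privileged_classes=[['Male']],
                             categorical_features=[],
                             features_to_keep=['age', 'education-num'])
clf, scaler = train_svm_dir(dataset_train, C=1.0, gamma='scale',
                            keep_features=[], sensitive_attribute='sex',
                            max_iter=1000, svm_seed=42)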
Example #6
def test_adult():
    protected = 'sex'
    ad = AdultDataset(protected_attribute_names=[protected],
                      privileged_classes=[['Male']],
                      categorical_features=[],
                      features_to_keep=[
                          'age', 'education-num', 'capital-gain',
                          'capital-loss', 'hours-per-week'
                      ])

    scaler = MinMaxScaler(copy=False)
    # ad.features = scaler.fit_transform(ad.features)

    train, test = ad.split([32561])
    assert np.any(test.labels)

    train.features = scaler.fit_transform(train.features)
    test.features = scaler.transform(test.features)

    index = train.feature_names.index(protected)
    X_tr = np.delete(train.features, index, axis=1)
    X_te = np.delete(test.features, index, axis=1)
    y_tr = train.labels.ravel()

    di = DisparateImpactRemover(repair_level=1.0)
    train_repd = di.fit_transform(train)
    # train_repd2 = di.fit_transform(train)
    # assert train_repd == train_repd2
    test_repd = di.fit_transform(test)

    assert np.all(
        train_repd.protected_attributes == train.protected_attributes)

    lmod = LogisticRegression(class_weight='balanced')
    # lmod = SVM(class_weight='balanced')
    lmod.fit(X_tr, y_tr)

    test_pred = test.copy()
    test_pred.labels = lmod.predict(X_te)

    X_tr_repd = np.delete(train_repd.features, index, axis=1)
    X_te_repd = np.delete(test_repd.features, index, axis=1)
    y_tr_repd = train_repd.labels.ravel()
    assert (y_tr == y_tr_repd).all()

    lmod.fit(X_tr_repd, y_tr_repd)
    test_repd_pred = test_repd.copy()
    test_repd_pred.labels = lmod.predict(X_te_repd)

    p = [{protected: 1}]
    u = [{protected: 0}]

    cm = ClassificationMetric(test,
                              test_pred,
                              privileged_groups=p,
                              unprivileged_groups=u)
    before = cm.disparate_impact()
    # print('Disparate impact: {:.4}'.format(before))
    # print('Acc overall: {:.4}'.format(cm.accuracy()))

    repaired_cm = ClassificationMetric(test_repd,
                                       test_repd_pred,
                                       privileged_groups=p,
                                       unprivileged_groups=u)
    after = repaired_cm.disparate_impact()
    # print('Disparate impact: {:.4}'.format(after))
    # print('Acc overall: {:.4}'.format(repaired_cm.accuracy()))

    assert after > before
    assert abs(1 - after) <= 0.2
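For reference, the disparate impact checked above is the ratio of favorable-outcome rates, Pr(Y_hat = 1 | unprivileged) / Pr(Y_hat = 1 | privileged); a hand-rolled equivalent on raw prediction arrays might look like the sketch below (the function and its arguments are illustrative, not part of aif360).

import numpy as np


def disparate_impact(y_pred, prot, favorable=1, privileged=1):
    # Ratio of favorable-outcome rates: unprivileged group over privileged group.
    unpriv_rate = np.mean(y_pred[prot != privileged] == favorable)
    priv_rate = np.mean(y_pred[prot == privileged] == favorable)
    return unpriv_rate / priv_rate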
Example #7
def checkClassifierFairnessAndRemoveDI(frame,
                                       dpoints,
                                       mname,
                                       x_columns,
                                       verbose=True,
                                       pre=True):
    '''Measure fairness (disparate impact) from the protected attribute A and the classification outcome.
    Results are added to the dpoints dictionary, which is later used to generate graphs of the results.
    If the intervention has not been performed yet (pre=True), apply it and return the repaired data.'''

    xay_columns = copy.deepcopy(x_columns)
    xay_columns.extend(["A", "Y"])

    ycols = copy.deepcopy(frame["Y"])
    tempframe = copy.deepcopy(frame)
    tempframe.drop(["Y"], axis=1, inplace=True)

    aifdf = BinaryLabelDataset(favorable_label=1.0,
                               unfavorable_label=0.0,
                               df=tempframe,
                               label_names=['Ya'],
                               protected_attribute_names=['A'])

    privileged_groups = [{'A': 1}]
    unprivileged_groups = [{'A': 0}]

    DIR = DisparateImpactRemover(sensitive_attribute='A')

    metric_aifdf_train = BinaryLabelDatasetMetric(
        aifdf,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    if pre:
        if verbose:
            print("\n\tINTERVENTION: {}\n".format(type(DIR).__name__))
            print("\t######### PRE {} ###########".format(type(DIR).__name__))
            print(
                "\tDisparate impact between unprivileged and privileged groups = {}\n"
                .format(metric_aifdf_train.disparate_impact()))
        dpoints[mname]['PRE'][type(
            DIR).__name__]['FAIR'] = metric_aifdf_train.disparate_impact()

        print("PRE CLASSIFICATION MATRIX")
        print("----------------")
        print("   |Y'=0  | Y'=1 |")
        print("----------------")
        print("A=0| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(False),
            metric_aifdf_train.num_positives(False)))
        print("A=1| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(True),
            metric_aifdf_train.num_positives(True)))
        print("----------------")

        dataset_transf_train = DIR.fit_transform(aifdf)
        fairdf = dataset_transf_train.convert_to_dataframe()[0]
        fairdf.drop(['Ya'], axis=1, inplace=True)

        ycols.reset_index(drop=True, inplace=True)
        fairdf.reset_index(drop=True, inplace=True)
        fairdf.insert(0, "Y", ycols)

        fairdf[xay_columns] = fairdf[xay_columns]
        return fairdf
    else:
        if verbose:
            print(
                "\tDisparate impact between unprivileged and privileged groups = {}\n"
                .format(metric_aifdf_train.disparate_impact()))
        dpoints[mname]['POST'][type(
            DIR).__name__]['FAIR'] = metric_aifdf_train.disparate_impact()
        print("POST CLASSIFICATION MATRIX")
        print("----------------")
        print("   |Y'=0 | Y'=1|")
        print("----------------")
        print("A=0| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(False),
            metric_aifdf_train.num_positives(False)))
        print("A=1| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(True),
            metric_aifdf_train.num_positives(True)))
        print("----------------")

        return frame
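The dpoints argument must already contain the nested keys the function writes into, i.e. dpoints[mname]['PRE'|'POST']['DisparateImpactRemover']['FAIR']; below is a minimal sketch of a compatible structure and the pre-intervention call (the model name 'logreg' and the frame contents are assumptions).

# Hypothetical initialisation matching the keys the function writes into.
dpoints = {'logreg': {phase: {'DisparateImpactRemover': {}}
                      for phase in ('PRE', 'POST')}}

# frame is assumed to hold the feature columns in x_columns plus 'A', 'Y', and
# the classifier outcome column 'Ya'; pre=True repairs it and returns the new frame.
fair_frame = checkClassifierFairnessAndRemoveDI(frame, dpoints, 'logreg',
                                                x_columns, pre=True)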
Example #8
    elif m == "lfr":
        TR = LFR(unprivileged_groups=unprivileged_groups,
                 privileged_groups=privileged_groups)
        TR = TR.fit(dataset_orig_train)
        dataset_transf_train = TR.transform(dataset_orig_train, threshold=0.8)
        out = dataset_transf_train.convert_to_dataframe(de_dummy_code=True,
                                                        sep='=',
                                                        set_category=True)[0]

    elif m == "disp_impact_remover":
        # Test if scaling changes something --> but then also export a scaled test set
        # scaler = MinMaxScaler(copy=False)

        di = DisparateImpactRemover(repair_level=1, sensitive_attribute='AGE')
        dataset_transf_train = di.fit_transform(dataset_orig_train)
        out_train = dataset_transf_train.convert_to_dataframe(
            de_dummy_code=True, sep='=', set_category=True)[0]

        # valid classification
        dataset_transf_valid = di.fit_transform(dataset_orig_valid)
        out_valid = dataset_transf_valid.convert_to_dataframe(
            de_dummy_code=True, sep='=', set_category=True)[0]

        # test classification
        dataset_transf_test = di.fit_transform(dataset_orig_test)
        out_test = dataset_transf_test.convert_to_dataframe(
            de_dummy_code=True, sep='=', set_category=True)[0]

    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_transf_train,
Example #9
def svm_dir(training_data, test_data, fairness_metric, accuracy_metric, C,
            gamma, keep_features, privileged_groups, unprivileged_groups,
            max_iter, svm_seed):
    """
    Run SVM classifier with Disparate Impact Remover preprocessing on specified data set,
    with provided parameters, and calculate fitness scores.

    :param training_data: The training data set to run the classifier on
    :param test_data: The test data set to test the classifier on
    :param fairness_metric: The fairness metric to calculate
    :param accuracy_metric: The accuracy metric to calculate
    :param C: The C parameter for SVC
    :param gamma: The gamma parameter for SVC
    :param keep_features: The features to keep for SVC
    :param privileged_groups: The privileged group in the data set
    :param unprivileged_groups: The unprivileged group in the data set
    :param max_iter: Max iterations for SVM
    :param svm_seed: Seed used for RNG in SVM
    :return: Return the accuracy and fairness score for the classifier
    """
    dataset_orig_train, dataset_orig_test = training_data, test_data

    # Run Disparate Impact Remover
    sensitive_attribute = list(privileged_groups[0].keys())[0]
    di = DisparateImpactRemover(repair_level=0.8,
                                sensitive_attribute=sensitive_attribute)
    dataset_transf_train = di.fit_transform(dataset_orig_train)

    # Prepare data
    scale = StandardScaler()
    X_train = scale.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()
    w_train = dataset_transf_train.instance_weights
    dataset_transf_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = scale.transform(dataset_transf_test_pred.features)  # reuse the scaler fitted on the training features
    if len(keep_features) > 0:  # If keep_features empty, use all features
        X_train = X_train[:, keep_features]
        X_test = X_test[:, keep_features]

    # Train
    clf = SVC(C=C,
              gamma=gamma,
              kernel='rbf',
              probability=True,
              max_iter=max_iter,
              random_state=svm_seed)
    clf.fit(X_train, y_train, sample_weight=w_train)

    # Test
    pos_ind = np.where(clf.classes_ == dataset_orig_train.favorable_label)[0][
        0]  # positive class index
    dataset_transf_test_pred.scores = clf.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)
    # Assign labels
    fav_inds = dataset_transf_test_pred.scores > 0.5
    dataset_transf_test_pred.labels[
        fav_inds] = dataset_transf_test_pred.favorable_label
    dataset_transf_test_pred.labels[
        ~fav_inds] = dataset_transf_test_pred.unfavorable_label

    # Calculate metrics
    cm = ClassificationMetric(dataset_orig_test,
                              dataset_transf_test_pred,
                              unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)

    accuracy_score = accuracy_metric(cm)
    fairness_score = fairness_metric(cm)
    return accuracy_score, fairness_score
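Since fairness_metric and accuracy_metric are callables applied to the resulting ClassificationMetric, a call could look like the sketch below; the dataset setup and hyperparameters are illustrative only.

from aif360.datasets import AdultDataset

# Hypothetical setup; any ClassificationMetric method can be plugged in as a metric.
ad = AdultDataset(protected_attribute_names=['sex'],
                  privileged_classes=[['Male']],
                  categorical_features=[],
                  features_to_keep=['age', 'education-num'])
train, test = ad.split([0.7], shuffle=True)

acc, fair = svm_dir(train, test,
                    fairness_metric=lambda cm: cm.disparate_impact(),
                    accuracy_metric=lambda cm: cm.accuracy(),
                    C=1.0, gamma='scale', keep_features=[],
                    privileged_groups=[{'sex': 1}],
                    unprivileged_groups=[{'sex': 0}],
                    max_iter=1000, svm_seed=42)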
Example #10
    def pre_process(self, annotated_data, privileged_groups,
                    unprivileged_groups):
        disparate_impact_remover = DisparateImpactRemover(
            repair_level=self.repair_level)
        return disparate_impact_remover.fit_transform(annotated_data)
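The method above is clearly a fragment of a larger pre-processing wrapper class; a minimal sketch of the host class it might sit in follows (the class name and constructor are assumptions; the unused group arguments suggest a shared pre-processor interface).

class DisparateImpactRemoverPreProcessor:
    # Hypothetical host class; only repair_level is needed by pre_process.
    def __init__(self, repair_level=1.0):
        self.repair_level = repair_level

    def pre_process(self, annotated_data, privileged_groups,
                    unprivileged_groups):
        disparate_impact_remover = DisparateImpactRemover(
            repair_level=self.repair_level)
        return disparate_impact_remover.fit_transform(annotated_data)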