예제 #1
0
 def __init__(self, sensitive, repair_level, n_est = 100, min_sam_leaf = 25):
     """Build the Disparate Impact Remover preprocessor and the forest.

     :param sensitive: name of the sensitive attribute to repair.
     :param repair_level: repair strength; values outside [0, 1] are clamped.
     :param n_est: number of trees for the random forest.
     :param min_sam_leaf: minimum samples per leaf for the random forest.
     """
     # Clamp the repair level into the valid [0, 1] range.
     repair_level = min(max(repair_level, 0), 1)
     self.model_reweight = DisparateImpactRemover(
         sensitive_attribute=sensitive, repair_level=repair_level)
     self.model = RandomForestClassifier(n_estimators=n_est,
                                         min_samples_leaf=min_sam_leaf)
예제 #2
0
class Fair_DI_NN():
    """Fairness-aware classifier: Disparate Impact Remover preprocessing
    followed by a FairClass neural network."""

    def __init__(self, sensitive, repair_level, inp_size, num_layers_y,
                 step_y):
        """
        :param sensitive: name of the sensitive attribute to repair.
        :param repair_level: repair strength; values outside [0, 1] are clamped.
        :param inp_size: input feature dimension for the network.
        :param num_layers_y: number of layers for the network.
        :param step_y: step/learning-rate parameter for the network.
        """
        # Clamp out-of-range repair levels instead of failing.
        repair_level = min(max(repair_level, 0), 1)

        self.model_reweight = DisparateImpactRemover(
            sensitive_attribute=sensitive, repair_level=repair_level)
        self.model = FairClass(inp_size, num_layers_y, step_y)

    def fit(self, data, labels, prot):
        """Repair the training data and fit the network.

        :param data: pandas DataFrame holding features and labels.
        :param labels: list of label column names.
        :param prot: list of protected attribute names; prot[0] is removed
            from the feature matrix before training.
        """
        ds = BinaryLabelDataset(df=data,
                                label_names=labels,
                                protected_attribute_names=prot)
        self.prot = prot
        repaired = self.model_reweight.fit_transform(ds)
        # Drop the protected attribute column itself from the features.
        index = repaired.feature_names.index(prot[0])
        x_train = np.delete(repaired.features, index, 1)
        y_train = repaired.labels
        # .type('torch.FloatTensor') is the legacy string-based API; pass
        # the dtype directly instead.
        x_train = torch.tensor(x_train, dtype=torch.float32)
        y_train = torch.tensor(y_train, dtype=torch.float32)
        self.model.fit(x_train, y_train)

    def predict_proba(self, data_test):
        """Repair the test data and return the model's predicted probabilities.

        NOTE(review): the remover is re-applied to the test dataset via
        fit_transform — confirm this per-dataset repair matches the intended
        evaluation protocol.
        """
        repaired = self.model_reweight.fit_transform(data_test)
        index = repaired.feature_names.index(self.prot[0])
        x_test = np.delete(repaired.features, index, 1)
        x_test = torch.tensor(x_test, dtype=torch.float32)
        return self.model.predict_proba(x_test)
예제 #3
0
class Fair_DI_RF():
    """Fairness-aware classifier: Disparate Impact Remover preprocessing
    followed by a scikit-learn random forest."""

    def __init__(self, sensitive, repair_level, n_est=100, min_sam_leaf=25):
        """
        :param sensitive: name of the sensitive attribute to repair.
        :param repair_level: repair strength; values outside [0, 1] are clamped.
        :param n_est: number of trees for the random forest.
        :param min_sam_leaf: minimum samples per leaf for the random forest.
        """
        # Clamp out-of-range repair levels instead of failing.
        repair_level = min(max(repair_level, 0), 1)
        self.model_reweight = DisparateImpactRemover(
            sensitive_attribute=sensitive, repair_level=repair_level)
        self.model = RandomForestClassifier(n_estimators=n_est,
                                            min_samples_leaf=min_sam_leaf)

    def fit(self, data, labels, prot):
        """Repair the training data, drop the protected column, fit the forest.

        :param data: pandas DataFrame holding features and labels.
        :param labels: list of label column names.
        :param prot: list of protected attribute names; prot[0] is removed
            from the feature matrix before training.
        """
        ds = BinaryLabelDataset(df=data, label_names=labels,
                                protected_attribute_names=prot)
        self.prot = prot
        repaired = self.model_reweight.fit_transform(ds)
        index = repaired.feature_names.index(prot[0])
        x_train = np.delete(repaired.features, index, 1)
        y_train = repaired.labels.ravel()
        self.model.fit(x_train, y_train)

    def predict_proba(self, data_test):
        """Repair the test data and return P(y=1) for each sample.

        NOTE(review): the remover is re-applied to the test dataset via
        fit_transform — confirm this per-dataset repair matches the intended
        evaluation protocol.
        """
        repaired = self.model_reweight.fit_transform(data_test)
        index = repaired.feature_names.index(self.prot[0])
        x_test = np.delete(repaired.features, index, 1)
        # Column 1 of predict_proba is the probability of the positive class.
        return self.model.predict_proba(x_test)[:, 1]
def test_repair0():
    """A repair level of zero must leave the dataset unchanged."""
    dataset = AdultDataset(protected_attribute_names=['sex'],
                           privileged_classes=[['Male']],
                           categorical_features=[],
                           features_to_keep=['age', 'education-num'])

    remover = DisparateImpactRemover(repair_level=0.)
    repaired = remover.fit_transform(dataset)

    assert repaired == dataset
예제 #5
0
    def __init__(self, sensitive, repair_level, inp_size, num_layers_y,
                 step_y):
        """Set up the Disparate Impact Remover and the FairClass network.

        :param sensitive: name of the sensitive attribute to repair.
        :param repair_level: repair strength; values outside [0, 1] are clamped.
        :param inp_size: input feature dimension for the network.
        :param num_layers_y: number of layers for the network.
        :param step_y: step/learning-rate parameter for the network.
        """
        # Keep the repair level inside the valid [0, 1] interval.
        repair_level = min(max(repair_level, 0), 1)

        self.model_reweight = DisparateImpactRemover(
            sensitive_attribute=sensitive, repair_level=repair_level)
        self.model = FairClass(inp_size, num_layers_y, step_y)
    def __init__(self, df, target_col, sensitive_att, repair_level):
        """
        :param df: pandas dataframe, stores the data to fit the scaler.
        :param target_col: str, the name of the target variable in above data.
        :param sensitive_att: str, the name of a sensitive attribute in above
            data. Value 0 represents the protected group.
        :param repair_level: float within [0, 1], repair strength for the
            Disparate Impact Remover.
        :raises ValueError: if repair_level is not a float within [0, 1].
        """
        # Validate up front; the message travels with the exception instead
        # of being printed before a bare raise. isinstance(None, float) is
        # False, so the explicit None check is unnecessary.
        if not isinstance(repair_level, float) or not 0 <= repair_level <= 1:
            raise ValueError(
                "Input repair_level is not valid! Should be float within [0,1]!"
            )
        self.repair_level = repair_level
        cur_step = DisparateImpactRemover(repair_level=repair_level,
                                          sensitive_attribute=sensitive_att)

        super().__init__("@".join(["AIF_DIRemover", sensitive_att]),
                         df,
                         step=cur_step,
                         fit_flag=False,
                         sensitive_att=sensitive_att,
                         target_col=target_col,
                         fair_aware=True)
예제 #7
0
def disparate_impact_remover(structured_data):
    """
    Perform disparate impact removal on a dataset and convert the result to
    a pandas dataframe.

    Parameters:
    structured_data (aif360.datasets.standard_dataset.StandardDataset): Structured dataset.

    Returns:
    pandas dataframe: The repaired data.
    """
    remover = DisparateImpactRemover()
    repaired = remover.fit_transform(structured_data)
    return convert_to_pd_dataframe(repaired)
예제 #8
0
def train_svm_dir(training_data, C, gamma, keep_features, sensitive_attribute,
                  max_iter, svm_seed):
    """
    Train the SVM classifier with Disparate Impact Remover preprocessing on
    the specified data set, with the provided parameters.

    :param training_data: The training data set to run the classifier on
    :param C: The C parameter for SVC
    :param gamma: The gamma parameter for SVC
    :param keep_features: The features to keep for SVC
    :param sensitive_attribute: The sensitive attribute in the dataset
    :param max_iter: Max iterations for SVM
    :param svm_seed: Seed used for RNG in SVM
    :return: The trained classifier and the scaler
    """
    # Repair the training data with the Disparate Impact Remover.
    remover = DisparateImpactRemover(repair_level=0.8,
                                     sensitive_attribute=sensitive_attribute)
    repaired_train = remover.fit_transform(training_data)

    # Scale features and pull out labels / per-instance weights.
    scale = StandardScaler()
    X_train = scale.fit_transform(repaired_train.features)
    y_train = repaired_train.labels.ravel()
    w_train = repaired_train.instance_weights
    # An empty keep_features list means "use all features".
    if len(keep_features) > 0:
        X_train = X_train[:, keep_features]

    # Fit the SVM with probability outputs enabled.
    clf = SVC(C=C,
              gamma=gamma,
              kernel='rbf',
              probability=True,
              max_iter=max_iter,
              random_state=svm_seed)
    clf.fit(X_train, y_train, sample_weight=w_train)

    return clf, scale
def test_adult():
    """End-to-end check: DI repair at level 1.0 should improve the disparate
    impact of a logistic-regression classifier and land near parity."""
    protected = 'sex'
    dataset = AdultDataset(protected_attribute_names=[protected],
                           privileged_classes=[['Male']],
                           categorical_features=[],
                           features_to_keep=[
                               'age', 'education-num', 'capital-gain',
                               'capital-loss', 'hours-per-week'
                           ])

    scaler = MinMaxScaler(copy=False)

    train, test = dataset.split([32561])
    assert np.any(test.labels)

    # Fit the scaler on the training split only.
    train.features = scaler.fit_transform(train.features)
    test.features = scaler.transform(test.features)

    # Drop the protected attribute column from the classifier inputs.
    prot_idx = train.feature_names.index(protected)
    X_tr = np.delete(train.features, prot_idx, axis=1)
    X_te = np.delete(test.features, prot_idx, axis=1)
    y_tr = train.labels.ravel()

    remover = DisparateImpactRemover(repair_level=1.0)
    train_repd = remover.fit_transform(train)
    test_repd = remover.fit_transform(test)

    # Repair must not touch the protected attribute values themselves.
    assert np.all(
        train_repd.protected_attributes == train.protected_attributes)

    lmod = LogisticRegression(class_weight='balanced')
    lmod.fit(X_tr, y_tr)

    test_pred = test.copy()
    test_pred.labels = lmod.predict(X_te)

    X_tr_repd = np.delete(train_repd.features, prot_idx, axis=1)
    X_te_repd = np.delete(test_repd.features, prot_idx, axis=1)
    y_tr_repd = train_repd.labels.ravel()
    # Repair changes features only, never labels.
    assert (y_tr == y_tr_repd).all()

    lmod.fit(X_tr_repd, y_tr_repd)
    test_repd_pred = test_repd.copy()
    test_repd_pred.labels = lmod.predict(X_te_repd)

    privileged = [{protected: 1}]
    unprivileged = [{protected: 0}]

    before = ClassificationMetric(
        test,
        test_pred,
        privileged_groups=privileged,
        unprivileged_groups=unprivileged).disparate_impact()

    after = ClassificationMetric(
        test_repd,
        test_repd_pred,
        privileged_groups=privileged,
        unprivileged_groups=unprivileged).disparate_impact()

    assert after > before
    assert abs(1 - after) <= 0.2
def runWysiwygDIAnalysis(fname="wysiwygdata.csv",
                         trainwithA=True,
                         xcolumns=None):
    '''Run the full experiment, based on parameters like train classifier
    with A (sensitive attribute).

    :param fname: CSV file with the wysiwyg data.
    :param trainwithA: whether the classifiers see the sensitive attribute A.
    :param xcolumns: feature column names; defaults to X1..X6. (A mutable
        list default was replaced by a None sentinel.)
    :return: nested dict of ACC/ACCA0/ACCA1/FAIR scores per model,
        pre- and post-intervention.
    '''
    c_columns = ["C1", "C2", "C3", "C4", "C5", "C6"]
    x_columns = ["X1", "X2", "X3", "X4", "X5", "X6"] if xcolumns is None \
        else xcolumns

    wysiwygframe = prepareWysiwygData(fname=fname, c_columns=c_columns)

    LR = LogisticRegression(solver='liblinear', random_state=1)
    SVM = SVC(random_state=1, gamma='scale')
    RF = RandomForestClassifier(n_estimators=100, random_state=1)
    DIRmodels = [LR, SVM, RF]

    DIR = DisparateImpactRemover(sensitive_attribute='A')
    dirname = type(DIR).__name__

    # Initialise every score slot to 0.0; replaces the long repeated
    # assignment block with a comprehension producing the same structure.
    metrics = ('ACC', 'ACCA0', 'ACCA1', 'FAIR')
    wysiwygpoints = {
        type(model).__name__: {
            phase: {dirname: {metric: 0.0 for metric in metrics}}
            for phase in ('PRE', 'POST')
        }
        for model in DIRmodels
    }

    for model in DIRmodels:
        modelname = type(model).__name__
        print("####### {} #########\n".format(modelname))
        # Train on the original data and record pre-intervention accuracies.
        preaccs = trainWysiwygDataFromFrame(wysiwygframe,
                                            model,
                                            x_columns=x_columns,
                                            trainwithA=trainwithA)
        wysiwygpoints[modelname]['PRE'][dirname]['ACC'] = preaccs[1]
        wysiwygpoints[modelname]['PRE'][dirname]['ACCA0'] = preaccs[2]
        wysiwygpoints[modelname]['PRE'][dirname]['ACCA1'] = preaccs[3]

        # Measure fairness and obtain the DI-repaired data.
        dirdata = checkClassifierFairnessAndRemoveDI(preaccs[4],
                                                     x_columns=x_columns,
                                                     dpoints=wysiwygpoints,
                                                     mname=modelname,
                                                     verbose=True,
                                                     pre=True)
        print("\t######### POST {} ###########".format(dirname))

        # Re-train on the repaired data and record post-intervention scores.
        postaccs = trainWysiwygDataFromFrame(dirdata,
                                             model,
                                             x_columns=x_columns,
                                             trainwithA=trainwithA)
        wysiwygpoints[modelname]['POST'][dirname]['ACC'] = postaccs[1]
        wysiwygpoints[modelname]['POST'][dirname]['ACCA0'] = postaccs[2]
        wysiwygpoints[modelname]['POST'][dirname]['ACCA1'] = postaccs[3]

        checkClassifierFairnessAndRemoveDI(postaccs[4],
                                           x_columns=x_columns,
                                           dpoints=wysiwygpoints,
                                           mname=modelname,
                                           verbose=True,
                                           pre=False)

    return wysiwygpoints
def checkClassifierFairnessAndRemoveDI(frame,
                                       dpoints,
                                       mname,
                                       x_columns,
                                       verbose=True,
                                       pre=True):
    ''' Measure fairness according to the metric using the value of A and the
    classification outcome.  Results get added to a dictionary used to pass
    them to a function to generate graphs of the results.  If we have not
    performed intervention (pre=True), perform intervention and return the
    post-intervention data; otherwise the input frame is returned unchanged.

    :param frame: dataframe with features, sensitive attribute A, predicted
        label Ya, and true label Y.
    :param dpoints: nested results dict (see runWysiwygDIAnalysis) updated
        in place with the FAIR score.
    :param mname: model name key into dpoints.
    :param x_columns: feature column names (kept for interface
        compatibility; no longer used internally).
    :param verbose: print diagnostic output.
    :param pre: True before intervention, False after.
    '''
    # Set the true label Y aside; fairness is measured on the prediction Ya.
    ycols = copy.deepcopy(frame["Y"])
    tempframe = copy.deepcopy(frame)
    tempframe.drop(["Y"], axis=1, inplace=True)

    aifdf = BinaryLabelDataset(favorable_label=1.0,
                               unfavorable_label=0.0,
                               df=tempframe,
                               label_names=['Ya'],
                               protected_attribute_names=['A'])

    privileged_groups = [{'A': 1}]
    unprivileged_groups = [{'A': 0}]

    DIR = DisparateImpactRemover(sensitive_attribute='A')
    dirname = type(DIR).__name__

    metric_aifdf_train = BinaryLabelDatasetMetric(
        aifdf,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    def _print_matrix(title, header):
        # Shared printer for the PRE/POST classification matrices; the
        # header strings differ slightly between the two and are passed in.
        print(title)
        print("----------------")
        print(header)
        print("----------------")
        print("A=0| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(False),
            metric_aifdf_train.num_positives(False)))
        print("A=1| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(True),
            metric_aifdf_train.num_positives(True)))
        print("----------------")

    if pre:
        if verbose:
            print("\n\tINTERVENTION: {}\n".format(dirname))
            print("\t######### PRE {} ###########".format(dirname))
            print(
                "\tDisparate impact between unprivileged and privileged groups = {}\n"
                .format(metric_aifdf_train.disparate_impact()))
        dpoints[mname]['PRE'][dirname][
            'FAIR'] = metric_aifdf_train.disparate_impact()

        _print_matrix("PRE CLASSIFICATION MATRIX", "   |Y'=0  | Y'=1 |")

        # Run the intervention and rebuild a dataframe with the true label
        # Y re-attached as the first column.
        dataset_transf_train = DIR.fit_transform(aifdf)
        fairdf = dataset_transf_train.convert_to_dataframe()[0]
        fairdf.drop(['Ya'], axis=1, inplace=True)

        ycols.reset_index(drop=True, inplace=True)
        fairdf.reset_index(drop=True, inplace=True)
        fairdf.insert(0, "Y", ycols)

        # NOTE(review): the original self-assignment
        # `fairdf[xay_columns] = fairdf[xay_columns]` was a no-op and has
        # been removed, along with the now-unused xay_columns list.
        return fairdf
    else:
        if verbose:
            print(
                "\tDisparate impact between unprivileged and privileged groups = {}\n"
                .format(metric_aifdf_train.disparate_impact()))
        dpoints[mname]['POST'][dirname][
            'FAIR'] = metric_aifdf_train.disparate_impact()

        _print_matrix("POST CLASSIFICATION MATRIX", "   |Y'=0 | Y'=1|")

        return frame
예제 #12
0
                      dataset_orig_train.instance_weights.sum()) < 1e-6

    elif m == "lfr":
        TR = LFR(unprivileged_groups=unprivileged_groups,
                 privileged_groups=privileged_groups)
        TR = TR.fit(dataset_orig_train)
        dataset_transf_train = TR.transform(dataset_orig_train, threshold=0.8)
        out = dataset_transf_train.convert_to_dataframe(de_dummy_code=True,
                                                        sep='=',
                                                        set_category=True)[0]

    elif m == "disp_impact_remover":
        # Test if scaling changes something --> but then also export a scaled test set
        # scaler = MinMaxScaler(copy=False)

        di = DisparateImpactRemover(repair_level=1, sensitive_attribute='AGE')
        dataset_transf_train = di.fit_transform(dataset_orig_train)
        out_train = dataset_transf_train.convert_to_dataframe(
            de_dummy_code=True, sep='=', set_category=True)[0]

        # valid classification
        dataset_transf_valid = di.fit_transform(dataset_orig_valid)
        out_valid = dataset_transf_valid.convert_to_dataframe(
            de_dummy_code=True, sep='=', set_category=True)[0]

        # test classification
        dataset_transf_test = di.fit_transform(dataset_orig_test)
        out_test = dataset_transf_test.convert_to_dataframe(
            de_dummy_code=True, sep='=', set_category=True)[0]

    metric_transf_train = BinaryLabelDatasetMetric(
예제 #13
0
def svm_dir(training_data, test_data, fairness_metric, accuracy_metric, C,
            gamma, keep_features, privileged_groups, unprivileged_groups,
            max_iter, svm_seed):
    """
    Run SVM classifier with Disparate Impact Remover preprocessing on specified data set,
    with provided parameters, and calculate fitness scores.

    :param training_data: The training data set to run the classifier on
    :param test_data: The test data set to test the classifier on
    :param fairness_metric: The fairness metric to calculate
    :param accuracy_metric: The accuracy metric to calculate
    :param C: The C parameter for SVC
    :param gamma: The gamma parameter for SVC
    :param keep_features: The features to keep for SVC
    :param privileged_groups: The privileged group in the data set
    :param unprivileged_groups: The unprivileged group in the data set
    :param max_iter: Max iterations for SVM
    :param svm_seed: Seed used for RNG in SVM
    :return: Return the accuracy and fairness score for the classifier
    """
    dataset_orig_train, dataset_orig_test = training_data, test_data

    # Run Disparate Impact Remover on the training set. The sensitive
    # attribute name is taken from the privileged-group specification.
    sensitive_attribute = list(privileged_groups[0].keys())[0]
    di = DisparateImpactRemover(repair_level=0.8,
                                sensitive_attribute=sensitive_attribute)
    dataset_transf_train = di.fit_transform(dataset_orig_train)

    # Prepare data.
    scale = StandardScaler()
    X_train = scale.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()
    w_train = dataset_transf_train.instance_weights
    # NOTE(review): the test set is not DI-repaired here — confirm this is
    # the intended evaluation protocol.
    dataset_transf_test_pred = dataset_orig_test.copy(deepcopy=True)
    # BUG FIX: apply the scaler fitted on the training data. The original
    # called fit_transform here, re-fitting the scaler on the test set
    # (data leakage and an inconsistent feature scale).
    X_test = scale.transform(dataset_transf_test_pred.features)
    if len(keep_features) > 0:  # If keep_features empty, use all features
        X_train = X_train[:, keep_features]
        X_test = X_test[:, keep_features]

    # Train
    clf = SVC(C=C,
              gamma=gamma,
              kernel='rbf',
              probability=True,
              max_iter=max_iter,
              random_state=svm_seed)
    clf.fit(X_train, y_train, sample_weight=w_train)

    # Test: score with the probability of the favorable class, then
    # threshold at 0.5 to assign hard labels.
    pos_ind = np.where(clf.classes_ == dataset_orig_train.favorable_label)[0][
        0]  # positive class index
    dataset_transf_test_pred.scores = clf.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)
    fav_inds = dataset_transf_test_pred.scores > 0.5
    dataset_transf_test_pred.labels[
        fav_inds] = dataset_transf_test_pred.favorable_label
    dataset_transf_test_pred.labels[
        ~fav_inds] = dataset_transf_test_pred.unfavorable_label

    # Calculate metrics against the original (unrepaired) test labels.
    cm = ClassificationMetric(dataset_orig_test,
                              dataset_transf_test_pred,
                              unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)

    accuracy_score = accuracy_metric(cm)
    fairness_score = fairness_metric(cm)
    return accuracy_score, fairness_score
예제 #14
0
 def pre_process(self, annotated_data, privileged_groups,
                 unprivileged_groups):
     """Return annotated_data repaired by a Disparate Impact Remover at
     self.repair_level.  The group arguments are accepted for interface
     compatibility but are not used by this remover."""
     remover = DisparateImpactRemover(repair_level=self.repair_level)
     return remover.fit_transform(annotated_data)