Example #1
import numpy as np
import torch
from aif360.algorithms.preprocessing import Reweighing
from aif360.datasets import BinaryLabelDataset


class Fair_rew_NN():
    """Neural-net classifier trained on Reweighing-preprocessed data.

    FairClass is a project-local network wrapper; un_gr and pr_gr are
    AIF360 group dicts such as [{'sex': 0}] and [{'sex': 1}].
    """

    def __init__(self, un_gr, pr_gr, inp_size, num_layers_y, step_y):
        self.model_reweight = Reweighing(un_gr, pr_gr)
        self.model = FairClass(inp_size, num_layers_y, step_y)

    def fit(self, data, labels, prot):
        ds = BinaryLabelDataset(df=data,
                                label_names=labels,
                                protected_attribute_names=prot)
        self.prot = prot
        x = self.model_reweight.fit_transform(ds)
        # Drop the protected attribute column before training.
        index = x.feature_names.index(prot[0])
        x_train = np.delete(x.features, index, 1)
        y_train = x.labels
        x_train = torch.tensor(x_train, dtype=torch.float32)
        y_train = torch.tensor(y_train, dtype=torch.float32)
        # Note: the instance weights produced by Reweighing are not forwarded
        # here, so they only matter if FairClass reads them itself.
        self.model.fit(x_train, y_train)

    def predict_proba(self, data_test):
        x = self.model_reweight.transform(data_test)
        index = x.feature_names.index(self.prot[0])
        x_test = np.delete(x.features, index, 1)
        x_test = torch.tensor(x_test, dtype=torch.float32)
        y = self.model.predict_proba(x_test)
        return y
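A minimal usage sketch for the class above (hypothetical names: df is a
pandas DataFrame containing a binary 'label' column and a binary 'sex'
column, and test_ds is a BinaryLabelDataset built the same way):

# Hypothetical usage of Fair_rew_NN; the group dicts follow the usual
# AIF360 convention (0 = unprivileged, 1 = privileged).
un_gr = [{'sex': 0}]
pr_gr = [{'sex': 1}]
clf = Fair_rew_NN(un_gr, pr_gr, inp_size=df.shape[1] - 2,
                  num_layers_y=2, step_y=0.01)
clf.fit(df, labels=['label'], prot=['sex'])
scores = clf.predict_proba(test_ds)  # P(favorable) per test instance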
Example #2
def Preprocessing(dataset, label, unprivileged_groups, privileged_groups,
                  protected_attribute, favorable_label, unfavorable_label):
    binary_dataset = generate_binary_label_dataset(dataset, label,
                                                   protected_attribute,
                                                   favorable_label,
                                                   unfavorable_label)
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    dataset_transformed = RW.fit_transform(binary_dataset)
    return dataset_transformed.convert_to_dataframe()[0]
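A hypothetical call, assuming a DataFrame df with a binary 'income' label
and a binary 'sex' column (1 = privileged); note the returned dataframe
contains features and labels but not the learned instance weights:

df_reweighed = Preprocessing(df, 'income',
                             unprivileged_groups=[{'sex': 0}],
                             privileged_groups=[{'sex': 1}],
                             protected_attribute='sex',
                             favorable_label=1.0,
                             unfavorable_label=0.0)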
Example #3
def test_reweighing_sex():
    """Test that the old and new Reweighing produce the same sample_weights."""
    orig_rew = OrigReweighing(unprivileged_groups=[{'sex': 0}],
                              privileged_groups=[{'sex': 1}])
    adult_fair = orig_rew.fit_transform(adult)
    rew = Reweighing('sex')
    _, new_sample_weight = rew.fit_transform(X, y, sample_weight=sample_weight)

    assert np.allclose([[orig_rew.w_up_unfav, orig_rew.w_up_fav],
                        [orig_rew.w_p_unfav, orig_rew.w_p_fav]],
                       rew.reweigh_factors_)
    assert np.allclose(adult_fair.instance_weights, new_sample_weight)
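The four factors compared above follow the Kamiran-Calders reweighing
formula w(g, c) = P(g) * P(c) / P(g, c). A rough sketch of that computation
(not the library implementation; assumes group and label are binary arrays
with 0 meaning unprivileged/unfavorable):

import numpy as np

def reweigh_factors(group, label):
    # w[g, c] = n_g * n_c / (n * n_gc): ratio of expected to observed
    # probability of each (group, label) cell.
    group, label = np.asarray(group), np.asarray(label)
    n = len(label)
    w = np.empty((2, 2))
    for g in (0, 1):
        for c in (0, 1):
            n_g = np.sum(group == g)
            n_c = np.sum(label == c)
            n_gc = np.sum((group == g) & (label == c))
            w[g, c] = n_g * n_c / (n * n_gc)
    return w  # rows: unpriv/priv, cols: unfav/fav, matching reweigh_factors_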
Example #4
def main():
    print('Calculate bias')
    np.random.seed(1)
    protected_attribute = 'ethnicity'
    dataset = load_preproc_data_heart([protected_attribute])

    privileged_groups = [{protected_attribute: 1}]
    unprivileged_groups = [{
        protected_attribute: 2
    }, {
        protected_attribute: 3
    }, {
        protected_attribute: 4
    }, {
        protected_attribute: 5
    }, {
        protected_attribute: 6
    }]

    data_orig_train, data_orig_vt = dataset.split([0.7], shuffle=True)
    data_orig_valid, data_orig_test = data_orig_vt.split([0.5], shuffle=True)

    metric_orig_train = BinaryLabelDatasetMetric(
        data_orig_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    print("Mean {}".format(metric_orig_train.mean_difference()))

    rw = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    data_transf_train = rw.fit_transform(data_orig_train)
    metric_transf_train = BinaryLabelDatasetMetric(
        data_transf_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    print("Mean difference after transformation =%f " %
          metric_transf_train.mean_difference())

    calculate_bias_measures(data_orig_train, data_orig_vt, unprivileged_groups,
                            privileged_groups)
    calculate_bias_measures(data_orig_valid, data_orig_test,
                            unprivileged_groups, privileged_groups)
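mean_difference here is the statistical parity difference,
P(Y = favorable | unprivileged) - P(Y = favorable | privileged). A hand
computation sketch under simplifying assumptions (single protected
attribute in column idx; everything outside the privileged value is pooled
as unprivileged):

def mean_difference(ds, idx=0, priv_value=1):
    # ds: an AIF360 BinaryLabelDataset
    priv = ds.protected_attributes[:, idx] == priv_value
    fav = ds.labels.ravel() == ds.favorable_label
    return fav[~priv].mean() - fav[priv].mean()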
Example #5
def train_svm_reweighing(training_data, C, gamma, keep_features,
                         privileged_groups, unprivileged_groups, max_iter,
                         svm_seed):
    """
    Train the SVM classifier with Reweighing preprocessing on specified data set,
    with provided parameters, and calculate fitness scores.

    :param training_data: The training data set to run the classifier on
    :param C: The C parameter for SVC
    :param gamma: The gamma parameter for SVC
    :param keep_features: The features to keep for SVC
    :param privileged_groups: The privileged group in the data set
    :param unprivileged_groups: The unprivileged group in the data set
    :param max_iter: Max iterations for SVM
    :param svm_seed: Seed used for RNG in SVM
    :return: The trained classifier and the scaler
    """
    dataset_orig_train = training_data

    # Run Reweighing
    rw = Reweighing(privileged_groups=privileged_groups,
                    unprivileged_groups=unprivileged_groups)
    dataset_transf_train = rw.fit_transform(dataset_orig_train)

    # Prepare data
    scale = StandardScaler()
    X_train = scale.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()
    w_train = dataset_transf_train.instance_weights
    if len(keep_features) > 0:  # If keep_features empty, use all features
        X_train = X_train[:, keep_features]

    # Train
    clf = SVC(C=C,
              gamma=gamma,
              kernel='rbf',
              probability=True,
              max_iter=max_iter,
              random_state=svm_seed)
    clf.fit(X_train, y_train, sample_weight=w_train)

    return clf, scale
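A hypothetical call (train_ds and test_ds are AIF360 BinaryLabelDatasets;
the returned scaler must be reused on the test features, not refitted):

clf, scaler = train_svm_reweighing(train_ds, C=1.0, gamma='scale',
                                   keep_features=[],
                                   privileged_groups=[{'sex': 1}],
                                   unprivileged_groups=[{'sex': 0}],
                                   max_iter=10000, svm_seed=42)
probs = clf.predict_proba(scaler.transform(test_ds.features))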
Example #6
class Fair_rew_RF():
    """Random-forest counterpart of the Fair_rew_NN class in Example #1."""

    def __init__(self, un_gr, pr_gr, n_est=100, min_sam_leaf=25):
        self.model_reweight = Reweighing(un_gr, pr_gr)
        self.model = RandomForestClassifier(n_estimators=n_est,
                                            min_samples_leaf=min_sam_leaf)

    def fit(self, data, labels, prot):
        ds = BinaryLabelDataset(df=data,
                                label_names=labels,
                                protected_attribute_names=prot)
        self.prot = prot
        x = self.model_reweight.fit_transform(ds)
        # Drop the protected attribute column before training.
        index = x.feature_names.index(prot[0])
        x_train = np.delete(x.features, index, 1)
        y_train = x.labels.ravel()
        # Pass the Reweighing instance weights through; without them the
        # preprocessing has no effect on the forest.
        self.model.fit(x_train, y_train, sample_weight=x.instance_weights)

    def predict_proba(self, data_test):
        x = self.model_reweight.transform(data_test)
        index = x.feature_names.index(self.prot[0])
        x_test = np.delete(x.features, index, 1)
        y = self.model.predict_proba(x_test)[:, 1]
        return y
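Usage mirrors the Fair_rew_NN sketch above (hypothetical df and test_ds):

clf = Fair_rew_RF([{'sex': 0}], [{'sex': 1}], n_est=200, min_sam_leaf=10)
clf.fit(df, labels=['label'], prot=['sex'])
scores = clf.predict_proba(test_ds)  # P(favorable) for each test instance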
Example #7
def reweight(structured_data, priv_category):
    """
    Remove bias from dataset using Reweighing.
    Parameters:
    structured_data (aif360.datasets.standard_dataset.StandardDataset): Structured dataset.
    priv_category (string): Column with privileged class.
    Returns:
    data_transf_df (pandas dataframe): Pandas dataframe.
    """

    # Get privileged and unprivileged groups
    privileged_groups, unprivileged_groups = get_attributes(
        structured_data, selected_attr=[priv_category])
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)

    # Remove bias
    data_transf = RW.fit_transform(structured_data)

    # Convert to pandas dataframe
    data_transf_df = convert_to_pd_dataframe(data_transf)

    return data_transf_df
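A hypothetical call, where adult is an AIF360 StandardDataset (e.g.
AdultDataset) and get_attributes/convert_to_pd_dataframe are the
project-local helpers used above:

df_fair = reweight(adult, 'sex')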
Example #8
    def fit(self, dataset):
        RW = Reweighing(unprivileged_groups=self.unprivileged_group,
                        privileged_groups=self.privileged_group)

        mean_diff_metric = lambda dataset: BinaryLabelDatasetMetric(
            dataset,
            unprivileged_groups=self.unprivileged_group,
            privileged_groups=self.privileged_group).mean_difference()
        dataset_ = RW.fit_transform(dataset)

        print("before reweighing (meandiff):", mean_diff_metric(dataset),
              "after:", mean_diff_metric(dataset_))

        reg = LogisticRegression(solver='liblinear', max_iter=1000000000).fit(
            dataset_.features,
            dataset_.labels.ravel(),
            sample_weight=dataset_.instance_weights)
        #print("reweighted",sorted(list(zip(dataset.feature_names,reg.coef_[0])),key=lambda x: abs(x[1])))

        #print(sorted(list(zip(dataset.feature_names,reg.coef_[0])),key=lambda x: abs(x[1])))

        self.h = reg
Example #9
def checkClassifierFairnessAndReweighData(frame,
                                          dpoints,
                                          mname,
                                          x_columns,
                                          verbose=True,
                                          pre=True):
    ''' Measure fairness according to the metric using the value of A and the classification outcome.
    Results get added to a dictionary used to pass them to a function to generate graphs of the results.
    If we have not performed intervention, perform intervention and return post intervention data.'''
    xay_columns = copy.deepcopy(x_columns)
    xay_columns.extend(["A", "Y"])

    ycols = copy.deepcopy(frame["Y"])
    tempframe = copy.deepcopy(frame)
    tempframe.drop(["Y"], axis=1, inplace=True)
    aifdf = BinaryLabelDataset(favorable_label=1.0,
                               unfavorable_label=0.0,
                               df=tempframe,
                               label_names=['Ya'],
                               protected_attribute_names=['A'])

    privileged_groups = [{'A': 1}]
    unprivileged_groups = [{'A': 0}]

    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)

    metric_aifdf_train = BinaryLabelDatasetMetric(
        aifdf,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    if pre:
        if verbose:
            print("\n\tINTERVENTION: {}\n".format(type(RW).__name__))
            print("\t######### PRE {} ###########".format(type(RW).__name__))
            print(
                "\tDifference in mean outcomes between unprivileged and privileged groups = {}\n"
                .format(metric_aifdf_train.mean_difference()))
        dpoints[mname]['PRE'][type(
            RW).__name__]['FAIR'] = metric_aifdf_train.mean_difference()

        print("PRE CLASSIFICATION MATRIX")
        print("----------------")
        print("   |Y'=0  | Y'=1 |")
        print("----------------")
        print("A=0| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(False),
            metric_aifdf_train.num_positives(False)))
        print("A=1| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(True),
            metric_aifdf_train.num_positives(True)))
        print("----------------")

        dataset_transf_train = RW.fit_transform(aifdf)
        fairdf = dataset_transf_train.convert_to_dataframe()[0]
        fairdf.drop(['Ya'], axis=1, inplace=True)

        ycols.reset_index(drop=True, inplace=True)
        fairdf.reset_index(drop=True, inplace=True)
        fairdf.insert(0, "Y", ycols)

        fairdf[xay_columns] = fairdf[xay_columns].astype(int)

        fairdf.insert(loc=len(fairdf.columns),
                      column="weights",
                      value=dataset_transf_train.instance_weights)
        return fairdf
    else:
        if verbose:
            print(
                "\tDifference in mean outcomes between unprivileged and privileged groups = {}\n"
                .format(metric_aifdf_train.mean_difference()))
        dpoints[mname]['POST'][type(
            RW).__name__]['FAIR'] = metric_aifdf_train.mean_difference()

        print("POST CLASSIFICATION MATRIX")
        print("----------------")
        print("   |Y'=0  | Y'=1 |")
        print("----------------")
        print("A=0| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(False),
            metric_aifdf_train.num_positives(False)))
        print("A=1| {0} | {1} |".format(
            metric_aifdf_train.num_negatives(True),
            metric_aifdf_train.num_positives(True)))
        print("----------------")

        return frame
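A minimal invocation sketch for the function above (hypothetical names:
frame is a pandas DataFrame with feature columns x_cols plus 'A', 'Y' and
'Ya'; the nested dpoints dict must pre-exist for the result assignment):

dpoints = {'logreg': {'PRE': {'Reweighing': {}}, 'POST': {'Reweighing': {}}}}
x_cols = ['x0', 'x1']  # assumed feature columns
fair_frame = checkClassifierFairnessAndReweighData(frame, dpoints, 'logreg',
                                                   x_cols, verbose=True,
                                                   pre=True)
# fair_frame carries the original Y column plus a 'weights' column.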
print("Sex biased - Difference in mean outcomes between unprivileged and privileged sex groups = %f" % metric_train_biased.mean_difference())

# Create the metric object for testing set
metric_test_biased = BinaryLabelDatasetMetric(test_biased,
                                              unprivileged_groups=sex_unprivileged_groups,
                                              privileged_groups=sex_privileged_groups)

display(Markdown("#### Original training dataset"))
print("Sex biased - Difference in mean outcomes between unprivileged and privileged sex groups = %f" % metric_test_biased.mean_difference())


#debias with the reweighing method
RW = Reweighing(unprivileged_groups=sex_unprivileged_groups,
                privileged_groups=sex_privileged_groups)
dataset_transf_train_f = RW.fit_transform(train_biased)  # fit and transform in one step

# Metric for the reweighted dataset
metric_reweigh = BinaryLabelDatasetMetric(dataset_transf_train_f, 
                                              unprivileged_groups=sex_unprivileged_groups,
                                              privileged_groups=sex_privileged_groups)
display(Markdown("#### Original training dataset"))
print("Sex debiased - Difference in mean outcomes between unprivileged and privileged sex groups = %f" % metric_reweigh.mean_difference())

debiasedReg = LogisticRegression(max_iter=700)
debiasedReg.fit(x_train, y_train,
                sample_weight=dataset_transf_train_f.instance_weights)

#evaluate debiased model

#accuracy assessment
print('Sex debiased - Score (train set):', debiasedReg.score(x_train,y_train))
Example #11
def main():
    import sys
    sys.path.insert(1, "../")

    import numpy as np
    np.random.seed(0)

    #pip install numba==0.43.0
    #pip install --ignore-installed llvmlite==0.32.1

    from aif360.datasets import GermanDataset
    from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric as CM
    from aif360.algorithms.preprocessing import Reweighing

    from IPython.display import Markdown, display

    from sklearn.ensemble import RandomForestClassifier as RF
    from sklearn.datasets import make_classification as mc
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    #Step 2 Load dataset, specifying protected attribute, and split dataset into train and test
    dataset_orig = GermanDataset(
        # This dataset also contains a protected attribute for "sex",
        # which we do not consider in this evaluation.
        protected_attribute_names=['age'],
        privileged_classes=[lambda x: x >= 25],  # age >= 25 is privileged
        features_to_drop=['personal_status', 'sex'],  # ignore sex attributes
    )
    dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7],
                                                               shuffle=True)
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)

    privileged_groups = [{'age': 1}]
    unprivileged_groups = [{'age': 0}]

    #Step 3 Compute fairness metric on original training dataset
    metric_orig_train = BinaryLabelDatasetMetric(
        dataset_orig_train,  #mean difference
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

    display(Markdown("#### Original training dataset"))
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f "
        "(a negative value means the privileged group receives more favorable outcomes)"
        % metric_orig_train.mean_difference())
    print()

    #metrics
    clf = RF()
    clf.fit(dataset_orig_train.features, dataset_orig_train.labels.ravel())

    predictions = clf.predict(dataset_orig_test.features)
    proba_predictions = clf.predict_proba(dataset_orig_test.features)

    # Column 0 of predict_proba corresponds to the favorable label (1.0),
    # since GermanDataset codes labels as 1.0/2.0 and classes_ is sorted.
    dataset_orig_test_pred.scores = proba_predictions[:, 0].reshape(-1, 1)
    dataset_orig_test_pred.labels = predictions.reshape(-1, 1)

    cm_pred_valid = CM(dataset_orig_test,
                       dataset_orig_test_pred,
                       unprivileged_groups=unprivileged_groups,
                       privileged_groups=privileged_groups)

    cm = ["precision", "recall", "accuracy"]

    metrics = {}
    for c in cm:
        # getattr avoids eval when calling a metric by name
        metrics[c] = getattr(cm_pred_valid, c)()

    print("AIF360 metrics")
    for key in ["recall", "accuracy", "precision"]:
        print("{} score is: {}".format(key, metrics[key]))

    #Step 4 Mitigate bias by transforming the original dataset
    RW = Reweighing(
        unprivileged_groups=
        unprivileged_groups,  #pre-processing mitigation algorithm
        privileged_groups=privileged_groups)
    dataset_transf_train = RW.fit_transform(dataset_orig_train)

    #Step 5 Compute fairness metric on transformed dataset
    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_transf_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    display(Markdown("#### Transformed training dataset"))
    print(
        "Difference in mean outcomes between unprivileged and privileged groups = %f"
        % metric_transf_train.mean_difference())  #

    #metrics
    #split
    dataset_transf_train, dataset_transf_test = dataset_transf_train.split(
        [0.7], shuffle=True)
    dataset_transf_test_pred = dataset_transf_test.copy(deepcopy=True)

    clf = RF()
    # Use the Reweighing instance weights; without them the reweighed
    # dataset trains exactly like the original.
    clf.fit(dataset_transf_train.features,
            dataset_transf_train.labels.ravel(),
            sample_weight=dataset_transf_train.instance_weights)

    predictions = clf.predict(dataset_transf_test.features)
    proba_predictions = clf.predict_proba(dataset_transf_test.features)

    dataset_transf_test_pred.scores = proba_predictions[:, 0].reshape(-1, 1)
    dataset_transf_test_pred.labels = predictions.reshape(-1, 1)

    cm_pred_valid = CM(dataset_transf_test,
                       dataset_transf_test_pred,
                       unprivileged_groups=unprivileged_groups,
                       privileged_groups=privileged_groups)

    cm = ["precision", "recall", "accuracy"]

    metrics = {}
    for c in cm:
        metrics[c] = getattr(cm_pred_valid, c)()

    print("AIF360 metrics")
    for key in ["recall", "accuracy", "precision"]:
        print("{} score is: {}".format(key, metrics[key]))
Example #12
def train(request):
    df = pd.read_csv('./training/resume_data_5000.csv')
    df = df.drop(df.columns[0], axis=1)
    dataset_orig = StandardDataset(df,
                                   label_name='Accepted',
                                   favorable_classes=[1],
                                   protected_attribute_names=['Gender'],
                                   privileged_classes=[[1]],
                                   categorical_features=['School'],
                                   features_to_drop=['Name'])
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.7],
                                                             shuffle=True)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5],
                                                                  shuffle=True)

    privileged_groups = [{'Gender': 1}]
    unprivileged_groups = [{'Gender': 0}]

    metric_orig_train = BinaryLabelDatasetMetric(
        dataset_orig_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    orig_mean_difference = metric_orig_train.mean_difference()

    with open('./training/orig_mean_difference.pkl', 'wb') as f:
        pickle.dump(orig_mean_difference, f)

    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    dataset_transf_train = RW.fit_transform(dataset_orig_train)
    metric_transf_train = BinaryLabelDatasetMetric(
        dataset_transf_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    transf_mean_difference = metric_transf_train.mean_difference()

    with open('./training/transf_mean_difference.pkl', 'wb') as f:
        pickle.dump(transf_mean_difference, f)

    # Logistic regression classifier and predictions
    scale_orig = StandardScaler()
    X_train = scale_orig.fit_transform(dataset_orig_train.features)
    y_train = dataset_orig_train.labels.ravel()
    w_train = dataset_orig_train.instance_weights.ravel()

    with open('./training/scaler.pkl', 'wb') as f:
        pickle.dump(scale_orig, f)

    lmod_orig = LogisticRegression(solver='lbfgs')
    lmod_orig.fit(X_train,
                  y_train,
                  sample_weight=dataset_orig_train.instance_weights)
    y_train_pred = lmod_orig.predict(X_train)

    pos_ind = np.where(
        lmod_orig.classes_ == dataset_orig_train.favorable_label)[0][0]

    dataset_orig_train_pred = dataset_orig_train.copy()
    dataset_orig_train_pred.labels = y_train_pred

    dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)
    X_valid = scale_orig.transform(dataset_orig_valid_pred.features)
    y_valid = dataset_orig_valid_pred.labels
    dataset_orig_valid_pred.scores = lmod_orig.predict_proba(
        X_valid)[:, pos_ind].reshape(-1, 1)

    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = scale_orig.transform(dataset_orig_test_pred.features)
    y_test = dataset_orig_test_pred.labels
    dataset_orig_test_pred.scores = lmod_orig.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)

    num_thresh = 100
    ba_arr = np.zeros(num_thresh)
    class_thresh_arr = np.linspace(0.01, 0.99, num_thresh)
    for idx, class_thresh in enumerate(class_thresh_arr):

        fav_inds = dataset_orig_valid_pred.scores > class_thresh
        dataset_orig_valid_pred.labels[
            fav_inds] = dataset_orig_valid_pred.favorable_label
        dataset_orig_valid_pred.labels[
            ~fav_inds] = dataset_orig_valid_pred.unfavorable_label

        classified_metric_orig_valid = ClassificationMetric(
            dataset_orig_valid,
            dataset_orig_valid_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

        ba_arr[idx] = 0.5 * (classified_metric_orig_valid.true_positive_rate()
                             + classified_metric_orig_valid.true_negative_rate())

    best_ind = np.argmax(ba_arr)
    best_class_thresh = class_thresh_arr[best_ind]

    bal_acc_arr_orig = []
    disp_imp_arr_orig = []
    avg_odds_diff_arr_orig = []

    for thresh in tqdm(class_thresh_arr):
        fav_inds = dataset_orig_test_pred.scores > thresh
        dataset_orig_test_pred.labels[
            fav_inds] = dataset_orig_test_pred.favorable_label
        dataset_orig_test_pred.labels[
            ~fav_inds] = dataset_orig_test_pred.unfavorable_label

        metric_test_bef = compute_metrics(dataset_orig_test,
                                          dataset_orig_test_pred,
                                          unprivileged_groups,
                                          privileged_groups,
                                          disp=False)

        if thresh == best_class_thresh:
            with open('./training/metrics_orig.pkl', 'wb') as f:
                pickle.dump(metric_test_bef,
                            f,
                            protocol=pickle.HIGHEST_PROTOCOL)

        bal_acc_arr_orig.append(metric_test_bef["Balanced accuracy"])
        avg_odds_diff_arr_orig.append(
            metric_test_bef["Average odds difference"])
        disp_imp_arr_orig.append(metric_test_bef["Disparate impact"])

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    lmod_transf = LogisticRegression(solver='lbfgs')
    lmod_transf.fit(X_train,
                    y_train,
                    sample_weight=dataset_transf_train.instance_weights)
    y_train_pred = lmod_transf.predict(X_train)

    dataset_transf_test_pred = dataset_orig_test.copy(deepcopy=True)
    # Use transform, not fit_transform: the scaler must stay fitted on the
    # training features.
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    y_test = dataset_transf_test_pred.labels
    dataset_transf_test_pred.scores = lmod_transf.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)

    bal_acc_arr_transf = []
    disp_imp_arr_transf = []
    avg_odds_diff_arr_transf = []

    for thresh in tqdm(class_thresh_arr):
        fav_inds = dataset_transf_test_pred.scores > thresh
        dataset_transf_test_pred.labels[
            fav_inds] = dataset_transf_test_pred.favorable_label
        dataset_transf_test_pred.labels[
            ~fav_inds] = dataset_transf_test_pred.unfavorable_label

        metric_test_aft = compute_metrics(dataset_orig_test,
                                          dataset_transf_test_pred,
                                          unprivileged_groups,
                                          privileged_groups,
                                          disp=False)

        if thresh == best_class_thresh:
            with open('./training/metrics_transf.pkl', 'wb') as f:
                pickle.dump(metric_test_aft,
                            f,
                            protocol=pickle.HIGHEST_PROTOCOL)

        bal_acc_arr_transf.append(metric_test_aft["Balanced accuracy"])
        avg_odds_diff_arr_transf.append(
            metric_test_aft["Average odds difference"])
        disp_imp_arr_transf.append(metric_test_aft["Disparate impact"])

    with open('./training/model_orig.pkl', 'wb') as f:
        pickle.dump(lmod_orig, f)
    with open('./training/model_transf.pkl', 'wb') as f:
        pickle.dump(lmod_transf, f)

    return HttpResponse('Model trained')
Example #13
def svm_reweighing(training_data, test_data, fairness_metric, accuracy_metric,
                   C, gamma, keep_features, privileged_groups,
                   unprivileged_groups, max_iter, svm_seed):
    """
    Run SVM classifier with Reweighing preprocessing on specified data set,
    with provided parameters, and calculate fitness scores.

    :param training_data: The training data set to run the classifier on
    :param test_data: The test data set to test the classifier on
    :param fairness_metric: The fairness metric to calculate
    :param accuracy_metric: The accuracy metric to calculate
    :param C: The C parameter for SVC
    :param gamma: The gamma parameter for SVC
    :param keep_features: The features to keep for SVC
    :param privileged_groups: The privileged group in the data set
    :param unprivileged_groups: The unprivileged group in the data set
    :param max_iter: Max iterations for SVM
    :param svm_seed: Seed used for RNG in SVM
    :return: Return the accuracy and fairness score for the classifier
    """
    dataset_orig_train, dataset_orig_test = training_data, test_data

    # Run Reweighing
    rw = Reweighing(privileged_groups=privileged_groups,
                    unprivileged_groups=unprivileged_groups)
    dataset_transf_train = rw.fit_transform(dataset_orig_train)

    # Prepare data
    scale = StandardScaler()
    X_train = scale.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()
    w_train = dataset_transf_train.instance_weights
    dataset_transf_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = scale.transform(dataset_transf_test_pred.features)  # reuse the training fit
    if len(keep_features) > 0:  # If keep_features empty, use all features
        X_train = X_train[:, keep_features]
        X_test = X_test[:, keep_features]

    # Train
    clf = SVC(C=C,
              gamma=gamma,
              kernel='rbf',
              probability=True,
              max_iter=max_iter,
              random_state=svm_seed)
    clf.fit(X_train, y_train, sample_weight=w_train)

    # Test
    # Index of the positive (favorable) class in clf.classes_
    pos_ind = np.where(clf.classes_ == dataset_orig_train.favorable_label)[0][0]
    dataset_transf_test_pred.scores = clf.predict_proba(
        X_test)[:, pos_ind].reshape(-1, 1)
    # Assign labels
    fav_inds = dataset_transf_test_pred.scores > 0.5
    dataset_transf_test_pred.labels[
        fav_inds] = dataset_transf_test_pred.favorable_label
    dataset_transf_test_pred.labels[
        ~fav_inds] = dataset_transf_test_pred.unfavorable_label

    # Calculate metrics
    cm = ClassificationMetric(dataset_orig_test,
                              dataset_transf_test_pred,
                              unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)

    accuracy_score = accuracy_metric(cm)
    fairness_score = fairness_metric(cm)
    return accuracy_score, fairness_score
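A hypothetical call: the metric arguments are callables over a
ClassificationMetric, e.g. overall accuracy and statistical parity
difference:

acc, fair = svm_reweighing(
    train_ds, test_ds,
    fairness_metric=lambda m: m.statistical_parity_difference(),
    accuracy_metric=lambda m: m.accuracy(),
    C=1.0, gamma='scale', keep_features=[],
    privileged_groups=[{'sex': 1}], unprivileged_groups=[{'sex': 0}],
    max_iter=10000, svm_seed=42)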