Ejemplo n.º 1
0
def get_transformed_data(dataset='data/simulated_data.csv',
                         protected_attribute='group'):
    """Load a CSV dataset, apply AIF360 Reweighing, and return arrays.

    Args:
        dataset: path to a CSV file containing an 'outcome' label column.
        protected_attribute: name of the binary protected-attribute column
            (0 = unprivileged, 1 = privileged).

    Returns:
        dict keyed by 'simulated_data' with 'data', 'labels',
        'participant_ids' and 'feature_names' numpy arrays.
    """
    sample_data = pd.read_csv(dataset, header=0)

    pre_transform = BinaryLabelDataset(
        1.0,  # favorable label
        0.0,  # unfavorable label
        df=sample_data,
        label_names=['outcome'],
        protected_attribute_names=[protected_attribute])

    # BUG FIX: the group dicts previously hard-coded the key 'group',
    # silently ignoring the protected_attribute parameter.
    RW = Reweighing(unprivileged_groups=[{protected_attribute: 0}],
                    privileged_groups=[{protected_attribute: 1}])
    post_transform = RW.fit_transform(pre_transform)
    ds = post_transform.convert_to_dataframe()[0]
    X = ds.drop('outcome', axis=1)
    y = ds['outcome']
    return {
        'simulated_data': {
            'data': X.values,
            'labels': y.values,
            'participant_ids': np.arange(0, len(ds)),
            'feature_names': np.array([f for f in ds if f not in ['outcome']])
        }
    }
Ejemplo n.º 2
0
def test_instance_weights():
    """Reweighing must preserve the total instance weight of the dataset."""
    dataset = AdultDataset(instance_weights_name='fnlwgt', features_to_drop=[])
    reweigher = Reweighing(unprivileged_groups=[{'sex': 0}],
                           privileged_groups=[{'sex': 1}])
    reweighted = reweigher.fit_transform(dataset)
    print(reweighted.instance_weights.sum())
    assert np.isclose(dataset.instance_weights.sum(),
                      reweighted.instance_weights.sum())
Ejemplo n.º 3
0
def main(argv):
    """End-to-end fairness experiment on the Adult (census income) dataset.

    Trains a random-forest baseline and reports group fairness metrics, then
    applies Kamiran & Calders reweighing and evaluates a logistic-regression
    model with the demographic parity difference metric.

    Args:
        argv: command-line arguments; argv[1]/argv[2] are consumed only by
            the currently commented-out plotting helpers.
    """
    # Load the raw train/test CSVs and attach column names (project helper).
    df_data = pd.read_csv(r"adults_dataset/adult_train.csv")
    df_data = name_columns(df_data)
    df_test = pd.read_csv(r"adults_dataset/adult_test.csv")
    df_test = name_columns(df_test)

    df_data = data_preprocessing(df_data)
    df_test = data_preprocessing(df_test)

    # fig_proportion_of_rich(df_test, argv[1], False)

    df_data_encoded = one_hot_encoding(df_data)
    df_test_encoded = one_hot_encoding(df_test)

    # normalization appears to mutate the frames in place (return value unused).
    normalization(df_data_encoded)
    normalization(df_test_encoded)

    samples = split_samples(df_data_encoded, df_test_encoded)
    
    model = random_forest_classifier(samples)

    predictions = predict(model, samples, False)

    # proportion_of_rich(argv[2], samples, predictions, False)

    # Baseline fairness reports for the unmitigated model.
    gender_performance(df_test_encoded, predictions)
    demographic_parity(df_test_encoded, predictions)
    equalized_odds(df_test_encoded, predictions)

    #Kamiran and Calders
    # Wrap the encoded frames as AIF360 StandardDatasets; "sex" == 1 is the
    # privileged class, "earnings" == 1 the favorable outcome.
    train_sds = StandardDataset(df_data_encoded, label_name="earnings", favorable_classes=[1], 
                                protected_attribute_names=["sex"], privileged_classes=[[1]])

    test_sds = StandardDataset(df_test_encoded, label_name="earnings", favorable_classes=[1],
                               protected_attribute_names=["sex"], privileged_classes=[[1]])

    privileged_groups = [{"sex": 1.0}]
    unprivileged_groups = [{"sex": 0.0}]

    # Fit reweighing weights on the training split only.
    RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    RW.fit(train_sds)

    test_sds_pred = test_sds.copy(deepcopy=True)
    test_sds_transf = RW.transform(test_sds)

    samples_fair = split_samples_fair(train_sds, test_sds, test_sds_pred)
    
    # NOTE(review): the "fair" model is fit on the reweighed *test* split
    # (test_sds_transf), not the training split — confirm this is intended.
    model_fair = logistic_regression(test_sds_transf)

    predictions_fair, test_pred = predict_fair(model_fair, samples_fair, True)
    test_pred = test_pred.astype(int)

    dpd = demographic_parity_difference(
        df_test_encoded.earnings, test_pred, sensitive_features=df_test_encoded.sex)

    print(f"Model demographic parity difference:", dpd)
Ejemplo n.º 4
0
def reweigh_and_predict(df1, df2):
    """Reweigh the COMPAS training data and predict recidivism on the test set.

    Args:
        df1: training dataframe (features plus 'two_year_recid' label).
        df2: test dataframe with the same columns.

    Returns:
        Array of predicted labels (scores) for the rows of df2.
    """
    # concatenate the data so train and test get identical dummy columns
    df = pandas.concat([df1, df2])
    # BUG FIX: the split sizes were hard-coded (5410/1804); derive them from
    # the inputs so the function works for any train/test sizes.
    ntrain = len(df1)
    ntest = len(df2)
    df = pandas.get_dummies(df,
                            prefix=['sex', 'race', 'c_charge_degree'],
                            drop_first=True)
    df = df.rename(
        columns={
            'race_Non-White': 'race',
            'sex_Male': 'sex',
            'c_charge_degree_M': 'charge_degree'
        })
    # set up the BinaryLabelDataset
    label_names = ['two_year_recid']
    protected_attribute_names = ['race']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    train_data = BinaryLabelDataset(
        df=train_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)
    test_data = BinaryLabelDataset(
        df=test_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)

    privileged_groups = [{'race': 1}]
    unprivileged_groups = [{'race': 0}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(train_data)
    dataset_transf_train = RW.transform(train_data)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    # Weighted logistic regression using the reweighing instance weights.
    lmod = LogisticRegression()
    lmod.fit(X_train,
             y_train,
             sample_weight=dataset_transf_train.instance_weights)

    dataset_transf_test_pred = test_data
    # BUG FIX: apply the scaler fitted on the training data instead of
    # re-fitting it on the test set (data leakage / inconsistent scaling).
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    dataset_transf_test_pred.scores = lmod.predict(X_test)
    Y_hat = dataset_transf_test_pred.scores

    return Y_hat
Ejemplo n.º 5
0
def reweigh_and_predict(df1, df2):
    """Reweigh the UCI Adult training data and predict income on the test set.

    Args:
        df1: training dataframe (features plus 'income' label).
        df2: test dataframe with the same columns.

    Returns:
        Array of predicted labels (scores) for the rows of df2.
    """
    # concatenate the data so train and test get identical dummy columns
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)

    df = pandas.get_dummies(df, prefix = ['income', 'sex', 'native_country', 'marital_status',\
                              'workclass', 'occupation'], drop_first = True)
    df = df.rename(columns = {'income_>50K':'income', 'sex_Female':'sex', 'native_country_United-States':'native_country',\
    'marital_status_Not-Married':'marital_status'})
    # set up the BinaryLabelDataset
    label_names = ['income']
    protected_attribute_names = ['sex']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    train_data = BinaryLabelDataset(
        df=train_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)
    test_data = BinaryLabelDataset(
        df=test_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)

    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(train_data)
    dataset_transf_train = RW.transform(train_data)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    # Weighted logistic regression using the reweighing instance weights.
    lmod = LogisticRegression()
    lmod.fit(X_train,
             y_train,
             sample_weight=dataset_transf_train.instance_weights)

    dataset_transf_test_pred = test_data
    # BUG FIX: apply the scaler fitted on the training data instead of
    # re-fitting it on the test set (data leakage / inconsistent scaling).
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    dataset_transf_test_pred.scores = lmod.predict(X_test)
    Y_hat = dataset_transf_test_pred.scores

    return Y_hat
Ejemplo n.º 6
0
def reweigh_and_predict(df1, df2):
    """Reweigh training data on protected attribute 'A' and score test data.

    Args:
        df1: training dataframe with binary label column 'Y' and binary
            protected attribute column 'A'.
        df2: test dataframe with the same columns.

    Returns:
        Array of predicted positive-class probabilities for the rows of df2.
    """
    label_names = ['Y']
    protected_attribute_names = ['A']

    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)

    train_data = df.head(ntrain)
    test_data = df.tail(ntest)

    train_data = BinaryLabelDataset(df = train_data, label_names = label_names,
                                     protected_attribute_names = protected_attribute_names)
    test_data = BinaryLabelDataset(df = test_data, label_names = label_names,
                                     protected_attribute_names = protected_attribute_names)

    # Note: here A == 0 is the privileged group.
    privileged_groups = [{'A': 0}]
    unprivileged_groups = [{'A': 1}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(train_data)
    dataset_transf_train = RW.transform(train_data)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    # Weighted logistic regression using the reweighing instance weights.
    lmod = LogisticRegression()
    lmod.fit(X_train, y_train,
             sample_weight=dataset_transf_train.instance_weights)

    dataset_transf_test_pred = test_data
    # BUG FIX: apply the scaler fitted on the training data instead of
    # re-fitting it on the test set (data leakage / inconsistent scaling).
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    # Positive-class probability, kept as a flat vector.
    dataset_transf_test_pred.scores = lmod.predict_proba(X_test)[:,1:2].ravel()
    Y_hat = dataset_transf_test_pred.scores

    return Y_hat
Ejemplo n.º 7
0
def reweighing_data(train, unprivileged_group, privileged_group):
    """Reweigh a dataset and encode the weights as row duplication.

    Runs AIF360 Reweighing, rounds each instance weight to one decimal and
    scales by 10 to get an integer multiplicity, then duplicates each row's
    features/labels/protected attributes accordingly so the result can feed
    learners that ignore instance weights.

    Args:
        train: AIF360 dataset to reweigh; assumed to have exactly two
            protected attributes (see the resize(1, 2) below — TODO confirm).
        unprivileged_group: group definition list for Reweighing.
        privileged_group: group definition list for Reweighing.

    Returns:
        The transformed dataset with duplicated rows and all instance
        weights reset to 1.
    """
    RW = Reweighing(unprivileged_groups=unprivileged_group,
                    privileged_groups=privileged_group)
    RW.fit(train)
    train_transformed = RW.transform(train)

    # change weights to whole numbers: round to the nearest 0.1, scale by 10
    for i in range(train_transformed.instance_weights.size):
        train_transformed.instance_weights[i] = (
            round(train_transformed.instance_weights[i] / 0.1) * 0.1) * 10
    # PERF FIX: the deepcopy snapshot was previously taken on every loop
    # iteration; taking it once after the loop is equivalent.
    weights = copy.deepcopy(train_transformed.instance_weights)

    # Append (weight - 1) extra copies of each original row so multiplicity
    # encodes the weight. NOTE(review): a weight of 0 still leaves one copy.
    # The range() bound is evaluated once, so appended rows are not revisited.
    for i in range(train_transformed.features.shape[0]):
        row = copy.deepcopy(train_transformed.features[i])
        row_label = copy.deepcopy(train_transformed.labels[i])
        row_protected_attributes = copy.deepcopy(
            train_transformed.protected_attributes[i])
        row_protected_attributes.resize(1, 2)
        row.resize(1, train_transformed.features.shape[1])
        row_label.resize(1, 1)
        weight = int(weights[i])
        for j in range(weight - 1):
            train_transformed.features = np.concatenate(
                (train_transformed.features, row))
            train_transformed.labels = np.concatenate(
                (train_transformed.labels, row_label))
            train_transformed.protected_attributes = np.concatenate(
                (train_transformed.protected_attributes,
                 row_protected_attributes))

    # Reset all instance weights to 1 to match the duplicated rows.
    train_transformed.instance_weights = np.ones(
        train_transformed.features.shape[0])

    return train_transformed
    def create_data():
        """Create reweighed train/validation/test CSV frames for H2O DAI.

        Reads a processed housing dataset, fits AIF360 Reweighing on several
        splits, attaches the resulting instance weights as a 'weights'
        column, and returns a dict mapping output file names to dataframes.
        Returns a small dummy frame (or []) when the input file is missing.
        """
        import pandas as pd
        from h2oaicore.models_utils import import_tensorflow
        tf = import_tensorflow()
        # above is because aif360 requires tensorflow
        from aif360.datasets import BinaryLabelDataset
        from aif360.algorithms.preprocessing.reweighing import Reweighing

        """
        Update the below as needed
        """
        #########
        #########
        #########
        # Path to the data
        folder_path = 'tmp/'
        # Data file
        data_file = 'housing_train_proc.csv'
        full_data_file = folder_path + data_file

        # Input file missing: return a placeholder so test harnesses pass.
        # NOTE(review): relies on `os`, `config`, `dt` and `np` from the
        # enclosing module scope — confirm they are imported at file level.
        if not os.path.isfile(full_data_file):
            # for testing, just return something
            if config.hard_asserts:
                return dt.Frame(np.array([[1, 2, 3], [4, 5, 6]]))
            else:
                return []

        train = pd.read_csv(full_data_file)

        validation_test_files = ['housing_test_proc.csv']

        # Cumulative split fractions: train/valid at 0.6, valid/test at 0.8.
        validation_split = [0.6, 0.8]

        # Target column
        target = 'high_priced'
        favorable_label = 0
        unfavorable_label = 1

        # Privleged_group_info  = [[Protetected group name 1, prevleged level, unprivleged level], [Protetected group name 2, prevleged level, unprivleged level]]
        # The protected group columns need to be binary
        protected_group_info = [['hispanic', 0, 1], ['black', 0, 1]]
        #########
        #########
        #########

        # Set up protected group info
        protected_groups = [group_info[0] for group_info in protected_group_info]

        dataset_orig = BinaryLabelDataset(df=train, label_names=[target], favorable_label=favorable_label,
                                          unfavorable_label=unfavorable_label,
                                          protected_attribute_names=protected_groups)

        # Build one {column: level} dict per protected attribute.
        privileged_groups = []
        unprivileged_groups = []
        for protected_group in protected_group_info:
            privileged_groups_dict = {}
            unprivileged_groups_dict = {}
            privileged_groups_dict[protected_group[0]] = protected_group[1]
            unprivileged_groups_dict[protected_group[0]] = protected_group[2]
            privileged_groups.append(privileged_groups_dict)
            unprivileged_groups.append(unprivileged_groups_dict)

        # Fit weights on the full dataset to be used on the external test set, if given
        RW_full = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
        RW_full.fit(dataset_orig)

        # Split the original data into train, validation, and test if applicable
        if len(validation_split) == 1:
            dataset_orig_train, dataset_orig_valid = dataset_orig.split(validation_split, shuffle=True)
        elif len(validation_split) == 2:
            dataset_orig_train_valid, dataset_orig_test = dataset_orig.split([validation_split[1]], shuffle=True)
            # Fit the weights on both the validation and test set for the test set split
            RW_train_valid = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
            RW_train_valid.fit(dataset_orig_train_valid)
            dataset_orig_train, dataset_orig_valid = dataset_orig_train_valid.split(
                [validation_split[0] / (validation_split[1])], shuffle=True)
        else:
            dataset_orig_train = dataset_orig

        # Fit weights on the training set only    
        RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
        RW.fit(dataset_orig_train)
        dataset_transf_train = RW.transform(dataset_orig_train)

        # Add the weigts to the training set
        train_df = pd.DataFrame(dataset_transf_train.features, columns=dataset_transf_train.feature_names)
        train_df[target] = dataset_transf_train.labels.ravel()
        train_df['weights'] = dataset_transf_train.instance_weights.ravel()

        # Create datasets with minimum features calculated the given number of days ahead
        dataset_dict = {}
        dataset_dict[data_file.split('.')[0] + "_rw_train.csv"] = train_df

        # Add weights to the validation split (if a validation split was specified)
        if len(validation_split) >= 1:
            dataset_transf_valid = RW.transform(dataset_orig_valid)
            valid_df = pd.DataFrame(dataset_transf_valid.features, columns=dataset_transf_valid.feature_names)
            valid_df[target] = dataset_transf_valid.labels.ravel()
            valid_df['weights'] = dataset_transf_valid.instance_weights.ravel()
            dataset_dict[data_file.split('.')[0] + "_rw_validation.csv"] = valid_df

        # Add weights to the test split (if a test split was specified)
        if len(validation_split) >= 2:
            dataset_transf_test = RW_train_valid.transform(dataset_orig_test)
            test_df = pd.DataFrame(dataset_transf_test.features, columns=dataset_transf_test.feature_names)
            test_df[target] = dataset_transf_test.labels.ravel()
            test_df['weights'] = dataset_transf_test.instance_weights.ravel()
            dataset_dict[data_file.split('.')[0] + "_rw_test.csv"] = test_df

        # Add weights to the test files (If provided)       
        for valid_file in validation_test_files:
            valid = pd.read_csv(folder_path + valid_file)
            dataset_valid_orig = BinaryLabelDataset(df=valid, label_names=[target], favorable_label=favorable_label,
                                                    unfavorable_label=unfavorable_label,
                                                    protected_attribute_names=protected_groups)
            dataset_transf_valid = RW_full.transform(dataset_valid_orig)

            valid_df = pd.DataFrame(dataset_transf_valid.features, columns=dataset_transf_valid.feature_names)
            valid_df[target] = dataset_transf_valid.labels.ravel()
            valid_df['weights'] = dataset_transf_valid.instance_weights.ravel()

            dataset_dict[valid_file.split('.')[0] + "_rw_transformed.csv"] = valid_df

        return dataset_dict
Ejemplo n.º 9
0
def calculate(pre_process,in_process,post_process,dataset_original,privileged_groups,unprivileged_groups,optim_options,in_process_epochs):
	"""Run a pre-/in-/post-processing fairness pipeline and return a report.

	Each stage is selected by an integer code:
	  pre_process:  0 none, 1 DisparateImpactRemover, 2 LFR, 3 OptimPreproc,
	                4 Reweighing
	  in_process:   0 PlainModel, 1 AdversarialDebiasing, 2 ARTClassifier
	                (logistic regression), 3 PrejudiceRemover
	  post_process: 0 none, 1 CalibratedEqOddsPostprocessing,
	                2 EqOddsPostprocessing, 3 RejectOptionClassification

	Args:
		dataset_original: AIF360 dataset; split 30/70 into train/test below.
		privileged_groups / unprivileged_groups: group definitions for the
			mitigation algorithms and metrics.
		optim_options: options passed to OptimPreproc (pre_process == 3).
		in_process_epochs: epochs / max iterations for the in-process model.

	Returns:
		The metric report (from get_metric_reports) computed on the test set
		after the post-processing stage.
	"""

	# NOTE(review): split([0.3]) puts 30% in the first split, so the model
	# trains on 30% of the data — confirm this is intended.
	dataset_original_train, dataset_original_test = dataset_original.split([0.3], shuffle=True)

	# Scale features with min/max fitted on the training split only.
	min_max_scaler=MinMaxScaler()
	dataset_original_train.features=min_max_scaler.fit_transform(dataset_original_train.features)
	dataset_original_test.features=min_max_scaler.transform(dataset_original_test.features)

	#Pre-processing begin
	dataset_after_pre_train=copy.deepcopy(dataset_original_train)
	dataset_after_pre_test=copy.deepcopy(dataset_original_test)
	if pre_process==0:
		pass
	if pre_process==1:
		# NOTE(review): DisparateImpactRemover is re-fit on the test split.
		pre_DIR=DisparateImpactRemover(repair_level=1.0)
		dataset_after_pre_train=pre_DIR.fit_transform(dataset_after_pre_train)
		dataset_after_pre_test=pre_DIR.fit_transform(dataset_after_pre_test)
	if pre_process==2:
		pre_LFR=LFR(unprivileged_groups=unprivileged_groups,privileged_groups=privileged_groups)
		pre_LFR.fit(dataset_after_pre_train)
		dataset_after_pre_train=pre_LFR.transform(dataset_after_pre_train)
		dataset_after_pre_test=pre_LFR.transform(dataset_after_pre_test)
	if pre_process==3:
		pre_OP=OptimPreproc(OptTools,optim_options,unprivileged_groups=unprivileged_groups,privileged_groups=privileged_groups)
		pre_OP.fit(dataset_original_train)
		dataset_after_pre_train=pre_OP.transform(dataset_original_train,transform_Y=True)
		dataset_after_pre_test=pre_OP.transform(dataset_original_test,transform_Y=True)
	if pre_process==4:
		pre_RW=Reweighing(unprivileged_groups=unprivileged_groups,privileged_groups=privileged_groups)
		pre_RW.fit(dataset_original_train)
		dataset_after_pre_train=pre_RW.transform(dataset_original_train)
		dataset_after_pre_test=pre_RW.transform(dataset_original_test)
	#Pre-processing end

	report=get_metric_reports(
		true_dataset=dataset_original_test,
		classfied_dataset=dataset_after_pre_test,
		privileged_groups=privileged_groups,
		unprivileged_groups=unprivileged_groups
	)
	# print('After Pre-process:')
	# print(report)

	#In-processing begin
	dataset_after_in_train=copy.deepcopy(dataset_after_pre_train)
	dataset_after_in_test=copy.deepcopy(dataset_after_pre_test)
	if in_process==0:
		# Plain (unmitigated) TF classifier as the baseline in-process model.
		sess = tf.Session()
		in_PM=PlainModel(
			privileged_groups=privileged_groups,
			unprivileged_groups=unprivileged_groups,
			scope_name='plain_classifier',
			num_epochs=in_process_epochs,
			sess=sess)
		in_PM.fit(dataset_after_in_train)
		dataset_after_in_train=in_PM.predict(dataset_after_in_train)
		dataset_after_in_test=in_PM.predict(dataset_after_in_test)
		sess.close()
		tf.reset_default_graph()
	if in_process==1:
		sess = tf.Session()
		in_AD=AdversarialDebiasing(
			privileged_groups=privileged_groups,
			unprivileged_groups=unprivileged_groups,
			scope_name='debiased_classifier',
			num_epochs=in_process_epochs,
			debias=True,
			sess=sess)
		in_AD.fit(dataset_after_in_train)
		dataset_after_in_train=in_AD.predict(dataset_after_in_train)
		dataset_after_in_test=in_AD.predict(dataset_after_in_test)
		sess.close()
		tf.reset_default_graph()
	if in_process==2:
		in_ART=ARTClassifier(SklearnClassifier(model=LogisticRegression(max_iter=in_process_epochs)))
		in_ART.fit(dataset_after_in_train)
		dataset_after_in_train=in_ART.predict(dataset_after_in_train)
		dataset_after_in_test=in_ART.predict(dataset_after_in_test)
	if in_process==3:
		# PrejudiceRemover needs the sensitive attribute name, taken from the
		# first privileged-group definition.
		sens_attr=list(privileged_groups[0].keys())[0]
		in_PM=PrejudiceRemover(sensitive_attr=sens_attr,eta=25.0)
		in_PM.fit(dataset_after_in_train)
		dataset_after_in_train=in_PM.predict(dataset_after_in_train)
		dataset_after_in_test=in_PM.predict(dataset_after_in_test)
	#In-process end

	report=get_metric_reports(
		true_dataset=dataset_original_test,
		classfied_dataset=dataset_after_in_test,
		privileged_groups=privileged_groups,
		unprivileged_groups=unprivileged_groups
	)
	# print('After In-process:')
	# print(report)

	#Post-process begin
	dataset_after_post_train=copy.deepcopy(dataset_after_in_train)
	dataset_after_post_test=copy.deepcopy(dataset_after_in_test)
	if post_process==0:
		pass
	if post_process==1:
		post_CEO=CalibratedEqOddsPostprocessing(
			privileged_groups=privileged_groups,
			unprivileged_groups=unprivileged_groups)
		post_CEO.fit(dataset_true=dataset_after_pre_train,dataset_pred=dataset_after_in_train)
		dataset_after_post_train=post_CEO.predict(dataset_after_post_train)
		dataset_after_post_test=post_CEO.predict(dataset_after_post_test)
	if post_process==2:
		post_EO=EqOddsPostprocessing(unprivileged_groups=unprivileged_groups,privileged_groups=privileged_groups)
		post_EO.fit(dataset_true=dataset_after_pre_train,dataset_pred=dataset_after_in_train)
		dataset_after_post_train=post_EO.predict(dataset_after_post_train)
		dataset_after_post_test=post_EO.predict(dataset_after_post_test)
	if post_process==3:
		# Bounds on the statistical parity difference accepted by ROC.
		metric_ub=0.05
		metric_lb=-0.05
		post_ROC=RejectOptionClassification(
			unprivileged_groups=unprivileged_groups, 
			privileged_groups=privileged_groups,
			low_class_thresh=0.01, high_class_thresh=0.99,
			num_class_thresh=100, num_ROC_margin=50,
			metric_name="Statistical parity difference",
			metric_ub=metric_ub, metric_lb=metric_lb)
		post_ROC.fit(dataset_true=dataset_after_pre_train,dataset_pred=dataset_after_in_train)
		dataset_after_post_train=post_ROC.predict(dataset_after_post_train)
		dataset_after_post_test=post_ROC.predict(dataset_after_post_test)
	#Post-processing end

	#Measuring unfairness begin
	report=get_metric_reports(
		true_dataset=dataset_original_test,
		classfied_dataset=dataset_after_post_test,
		privileged_groups=privileged_groups,
		unprivileged_groups=unprivileged_groups
	)

	# print('After Post-process:')
	# print(report)

	return report
Ejemplo n.º 10
0
    #*******************
    # Step 3 - Train machine Learning classifier and plot results
    if classifier_choice == "Logistic Regression":
        lambda_values = np.logspace(0,10, num=50)
    else:
        lambda_values = np.logspace(0,3, num=50)
    accuracy_list, equal_opp_list, stat_parity_list = fit_classifier(classifier_choice, train.instance_weights, lambda_values, 
                                                    X_train, y_train, X_test, y_test, test_pred)
    if FLAG_PLOTS:
        plot_analysis('{}_unweighted_{}'.format(data_choice, classifier_choice), lambda_values, accuracy_list, equal_opp_list,
                        "Equal Opport. Difference", stat_parity_list, "Statistical Parity")

    #*******************
    # Step 5 - Perform Reweighing, fit classifiers and plot results
    RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    train = RW.fit_transform(train)
    accuracy_list, equal_opp_list, stat_parity_list = fit_classifier(classifier_choice, train.instance_weights, lambda_values, 
                                                    X_train, y_train, X_test, y_test, test_pred)
    if FLAG_PLOTS:
        plot_analysis('{}_weighted_{}'.format(data_choice,classifier_choice), lambda_values, accuracy_list, equal_opp_list, 
                        "Equal Opport. Difference", stat_parity_list, "Statistical Parity")
        ax = sns.distplot(train.instance_weights, kde=False)
        ax.set_xlabel(r'Range of Weight')
        ax.set_ylabel('Frequency')
        plt.savefig('{}_reweighted.png'.format(data_choice), bbox_inches='tight')    
        # plt.show()
        plt.clf()

    #*******************
    # Step 6 - Perform k random  train/test splits and report results
                                             privileged_groups=privileged_groups)
# Notebook-export cells: report the baseline metric, then reweigh the dataset
# and report the metric again. Relies on metric_orig_train, new_dataset,
# unprivileged_groups and privileged_groups defined earlier in the file.
display(Markdown("#### Original training dataset"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())


# In[ ]:


# Inspect the protected attributes of the dataset (notebook cell output).
new_dataset.protected_attribute_names


# In[ ]:


RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)
RW.fit(new_dataset)
dataset_transf = RW.transform(new_dataset)


# In[ ]:


# NOTE(review): this rebinds metric_orig_train to the *transformed* dataset's
# metric, despite the name — consider a clearer variable name.
metric_orig_train = BinaryLabelDatasetMetric(dataset_transf, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
display(Markdown("#### Modified training dataset"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())


# In[ ]:
Ejemplo n.º 12
0
te[0].to_csv(output_path + 'taiwan_' + 'scaled_test' + '.csv',
             index=None,
             header=True)
val[0].to_csv(output_path + 'taiwan_' + 'scaled_valid' + '.csv',
              index=None,
              header=True)
# =============================================================================

# Preprocessing
methods = [
    "reweighing"  #, "disp_impact_remover"
]

for m in methods:
    if m == "reweighing":
        RW = Reweighing(unprivileged_groups=unprivileged_groups,
                        privileged_groups=privileged_groups)
        RW.fit(dataset_orig_train)

        # train classification
        dataset_transf_train = RW.transform(dataset_orig_train)
        w_train = dataset_transf_train.instance_weights.ravel()
        out_train = dataset_transf_train.convert_to_dataframe(
            de_dummy_code=True, sep='=', set_category=True)[0]
        out_train = out_train.sample(n=out_train.shape[0],
                                     replace=True,
                                     weights=w_train)

        # valid classification
        dataset_transf_valid = RW.transform(dataset_orig_valid)
        w_valid = dataset_transf_valid.instance_weights.ravel()
        out_valid = dataset_transf_valid.convert_to_dataframe(
Ejemplo n.º 13
0
    def run(self):
        """Run one train/test experiment with the configured bias mitigation.

        Pre-processes the training data according to self.fair_balance
        (FairBalance, FairBalanceClass, Reweighing, or none), then trains
        either an in-processing AdversarialDebiasing model or self.model on
        scaled features, optionally post-processes predictions with
        RejectOptionClassification, and returns self.evaluate(...) on the
        test set (or None if ROC fitting fails).
        """
        data_train, data_test = self.data_prepare()

        privileged_groups = [{self.target_attribute: 1}]
        unprivileged_groups = [{self.target_attribute: 0}]
        # Pre-processing: rebalance/reweigh the training data, or pass through.
        if self.fair_balance == "FairBalance":
            dataset_transf_train = FairBalance(data_train, class_balance=False)
        elif self.fair_balance == "FairBalanceClass":
            dataset_transf_train = FairBalance(data_train, class_balance=True)
        elif self.fair_balance == "Reweighing":
            RW = Reweighing(unprivileged_groups=unprivileged_groups,
                            privileged_groups=privileged_groups)
            RW.fit(data_train)
            dataset_transf_train = RW.transform(data_train)
        else:
            dataset_transf_train = data_train

        # NOTE(review): "AdversialDebiasing" (sic) must match the spelling
        # callers use when setting self.fair_balance.
        if self.fair_balance == "AdversialDebiasing":
            tf.reset_default_graph()
            sess = tf.Session()
            self.model = AdversarialDebiasing(
                privileged_groups=privileged_groups,
                unprivileged_groups=unprivileged_groups,
                scope_name='debiased_classifier',
                debias=True,
                sess=sess)
            self.model.fit(dataset_transf_train)
            preds = self.model.predict(data_test).labels.ravel()
            sess.close()
        else:
            # Scale on the (possibly reweighed) training data, train
            # self.model with instance weights, then score the test set.
            scale_orig = StandardScaler()
            X_train = scale_orig.fit_transform(dataset_transf_train.features)
            y_train = dataset_transf_train.labels.ravel()

            self.model.fit(X_train,
                           y_train,
                           sample_weight=dataset_transf_train.instance_weights)

            X_test = scale_orig.transform(data_test.features)
            preds = self.model.predict(X_test)

        if self.fair_balance == "RejectOptionClassification":
            # ROC needs scores; use the probability of the favorable label.
            pos_ind = numpy.where(self.model.classes_ ==
                                  dataset_transf_train.favorable_label)[0][0]
            data_train_pred = dataset_transf_train.copy(deepcopy=True)
            data_train_pred.scores = self.model.predict_proba(
                X_train)[:, pos_ind].reshape(-1, 1)
            data_test_pred = data_test.copy(deepcopy=True)
            data_test_pred.scores = self.model.predict_proba(
                X_test)[:, pos_ind].reshape(-1, 1)
            metric_name = "Statistical parity difference"
            metric_ub = 0.05
            metric_lb = -0.05
            ROC = RejectOptionClassification(
                unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups,
                low_class_thresh=0.01,
                high_class_thresh=0.99,
                num_class_thresh=100,
                num_ROC_margin=50,
                metric_name=metric_name,
                metric_ub=metric_ub,
                metric_lb=metric_lb)
            try:
                ROC.fit(dataset_transf_train, data_train_pred)
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; catch Exception instead.
            except Exception:
                return None
            preds = ROC.predict(data_test_pred).labels.ravel()

        y_test = data_test.labels.ravel()
        result = self.evaluate(numpy.array(preds), y_test, data_test)
        return result
Ejemplo n.º 14
0
def k_fold_statistics(k_folds, classifier, lambda_values, dataset, unprivileged_groups, privileged_groups):
    '''
    Fit classifiers to k random train/validation/test splits and report
    accuracy and fairness statistics for the best model of each split.

    Args:
        k_folds: number of random splits to evaluate
        classifier: "Logistic Regression" or anything else for an SVM
        lambda_values: candidate regularisation strengths (C = 1/lambda)
        dataset: AIF360 dataset to split
        unprivileged_groups: group definitions for the fairness metrics
        privileged_groups: group definitions for the fairness metrics

    Returns: 
        accuracy_list: test accuracy for each model
        equal_opp_list: Equal Opportunity difference for each model
        stat_parity_list: Statistical Parity difference for each model
    '''

    accuracy_list = []
    equal_opp_list = []
    stat_parity_list = []

    for k in range(k_folds):
        # BUG FIX: previously split the global `dataset_orig` instead of the
        # `dataset` argument, which was silently ignored.
        train, test = dataset.split([0.8], shuffle=True)
        train, validation = train.split([0.8], shuffle=True)
        scale_orig = StandardScaler()
        X_train = scale_orig.fit_transform(train.features)
        y_train = train.labels.ravel()
        X_test = scale_orig.transform(test.features)
        # BUG FIX: y_test/y_valid were swapped (test labels were taken from
        # the validation split and vice versa).
        y_test = test.labels.ravel()
        X_valid = scale_orig.transform(validation.features)
        y_valid = validation.labels.ravel()
        test_pred = test.copy()
        valid_pred = validation.copy()

        RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
        # Reweighing does not depend on lambda, so fit it once per fold
        # instead of once per candidate model.
        train = RW.fit_transform(train)

        best_mean_statistic = 0
        best_learner = None

        # fit all candidate models; keep the best on the validation split
        for lambda_value in lambda_values:
            if classifier == "Logistic Regression":
                learner = LogisticRegression(solver='liblinear', random_state=1, penalty='l2', C=1/lambda_value)
            else:
                learner = svm.SVC(C=1/lambda_value)
            learner.fit(X_train, y_train, sample_weight=train.instance_weights)
            valid_pred.labels = learner.predict(X_valid)
            metric = ClassificationMetric(validation, valid_pred, unprivileged_groups=unprivileged_groups,
                                          privileged_groups=privileged_groups)
            # Combined score: balance low equal-opportunity gap with accuracy.
            mean_statistic = (1 - abs(metric.equal_opportunity_difference()) + metric.accuracy()) / 2
            # BUG FIX: best_mean_statistic was never updated, so the "best"
            # learner was simply the last candidate with a positive score.
            if mean_statistic > best_mean_statistic:
                best_mean_statistic = mean_statistic
                best_learner = learner

        test_pred.labels = best_learner.predict(X_test)
        metric = ClassificationMetric(test, test_pred, unprivileged_groups=unprivileged_groups,
                                      privileged_groups=privileged_groups)
        print("----------------")
        print("Split {}/{}".format(k, k_folds))
        print("Equal opportunity:", "{0:.3f}".format(metric.equal_opportunity_difference()))
        print("Statistical parity:", "{0:.3f}".format(metric.statistical_parity_difference()))
        print("Accuracy:", "{0:.3f}".format(metric.accuracy()))
        accuracy_list.append(metric.accuracy())
        equal_opp_list.append(metric.equal_opportunity_difference())
        stat_parity_list.append(metric.statistical_parity_difference())

    accuracy_list = np.array(accuracy_list)
    equal_opp_list = np.array(equal_opp_list)
    stat_parity_list = np.array(stat_parity_list)
    print('The mean statistics for {} folds is:'.format(k_folds))
    print("Mean Accuracy: {0:.3f},".format(np.mean(accuracy_list)), "Std: {0:.3f}".format(np.std(accuracy_list)))
    print("Mean Equal Opportunity: {0:.3f},".format(np.mean(equal_opp_list)), "Std: {0:.3f}".format( np.std(equal_opp_list))) 
    print("Mean Statistical Parity: {0:.3f},".format(np.mean(stat_parity_list)), "Std: {0:.3f}".format(np.std(stat_parity_list)))
    
    return accuracy_list, equal_opp_list, stat_parity_list
def run(df, protected_attribute, label, columns_num, dataset_name, unprivileged_groups, 
        privileged_groups, metadata):
    """Train four model families with and without three bias-mitigation
    strategies, persist every fitted model and dataset split, and return a
    DataFrame of fairness/performance metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset containing the label and protected-attribute columns.
    protected_attribute : str
        Column name of the protected attribute.
    label : str
        Column name of the binary outcome.
    columns_num : list
        Numeric columns to be scaled (forwarded to ``standard_scaler``).
    dataset_name : str
        Prefix for every file written under ``data/`` and ``models/``.
    unprivileged_groups, privileged_groups : list of dict
        Group definitions in AIF360 format, e.g. ``[{'sex': 0}]``.
    metadata : dict
        Extra dataset metadata forwarded to ``create_df_aif``.

    Returns
    -------
    pandas.DataFrame
        One row per (model, method) combination, keyed by a ``name`` column
        ``<dataset>_<model>_<method>``; also written to
        ``data/result_fairness_<dataset>.csv``.
    """
    df_train, df_test = train_test_split(df, train_size=0.8, shuffle=True)
    print(df_test.shape)
    # Cap the test set at 1000 rows -- presumably to bound evaluation cost;
    # TODO(review): confirm this truncation is intentional.
    df_test = df_test[:1000]
    print(df_test.shape)
    df_train, df_test = standard_scaler(df_train, df_test, columns_num)
    # set_index(...).reset_index() moves the protected-attribute column to the
    # front of each frame without changing any values.
    df_train = df_train.set_index(protected_attribute).reset_index()
    df_test = df_test.set_index(protected_attribute).reset_index()
    df_train.to_csv('data/{}_train.csv'.format(dataset_name), index=False)
    df_test.to_csv('data/{}_test.csv'.format(dataset_name), index=False)
    X_train, y_train = df_train.drop(label, axis=1), df_train[label]
    X_test, y_test = df_test.drop(label, axis=1), df_test[label]
    # AIF360 dataset wrappers used below for reweighing and metric computation.
    df_train_aif, df_test_aif = create_df_aif(df_train, df_test, label, 
                                          protected_attribute, metadata)

    # Undersampled training sets from the project's fairCorrectUnder; `d`
    # appears to control the correction strength -- see the fairness module.
    df_train_us_dmin1 = fairness.fairCorrectUnder(df_train, pa=protected_attribute, label=label, fav=1, d=-1)
    df_train_us_dmin1.to_csv('data/{}_train_usd-1.csv'.format(dataset_name), index=False)
    X_train_us_dmin1, y_train_us_dmin1 = df_train_us_dmin1.drop(label, axis=1), df_train_us_dmin1[label]

    df_train_us_d0 = fairness.fairCorrectUnder(df_train, pa=protected_attribute, label=label, fav=1, d=0)
    df_train_us_d0.to_csv('data/{}_train_usd0.csv'.format(dataset_name), index=False)
    X_train_us_d0, y_train_us_d0 = df_train_us_d0.drop(label, axis=1), df_train_us_d0[label]

    # Instance weights from AIF360 reweighing, fitted on the training split.
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(df_train_aif)
    df_train_aif_rw = RW.transform(df_train_aif)
    weights = df_train_aif_rw.instance_weights

    result = []
    dict_models = {'lr': LogisticRegression(),
                   'gb': GradientBoostingClassifier(subsample=0.9),
                   'rf': RandomForestClassifier(max_depth=5, min_samples_leaf=2),
                   'svm': SVC(probability=True)}

    for model_name, model in dict_models.items():

        # Baseline: model trained on the unmodified training data.
        method_name = 'orig'
        model.fit(X_train, y_train)
        res = fairness.compute_metrics(model, X_test, y_test, X_train, y_train, df_test_aif, 
                                          unprivileged_groups, privileged_groups, protected_attribute, False)
        name = '_'.join([dataset_name, model_name, method_name])
        res['name'] = name
        result.append(res)
        pickle.dump(model, open('models/{}.pkl'.format(name), 'wb'))

        # Model trained on the undersampled set with d=-1.
        method_name = 'usd-1'
        model.fit(X_train_us_dmin1, y_train_us_dmin1)
        res = fairness.compute_metrics(model, X_test, y_test, X_train, y_train, df_test_aif, 
                                       unprivileged_groups, privileged_groups, protected_attribute, False)
        name =  '_'.join([dataset_name, model_name, method_name])
        res['name'] = name
        result.append(res)
        pickle.dump(model, open('models/{}.pkl'.format(name), 'wb'))

        # Model trained on the undersampled set with d=0.
        method_name = 'usd0'
        model.fit(X_train_us_d0, y_train_us_d0)
        res = fairness.compute_metrics(model, X_test, y_test, X_train, y_train, df_test_aif, 
                                       unprivileged_groups, privileged_groups, protected_attribute, False)
        name = '_'.join([dataset_name, model_name, method_name])
        res['name'] = name
        result.append(res)
        pickle.dump(model, open('models/{}.pkl'.format(name), 'wb'))

        # Model trained with reweighing: same features, per-sample weights.
        method_name = 'rw'
        model.fit(X_train, y_train, sample_weight=weights)
        res = fairness.compute_metrics(model, X_test, y_test, X_train, y_train, df_test_aif, 
                                       unprivileged_groups, privileged_groups, protected_attribute, False)
        name = '_'.join([dataset_name, model_name, method_name])
        res['name'] = name
        result.append(res)
        pickle.dump(model, open('models/{}.pkl'.format(name), 'wb'))

    df_result = pd.DataFrame(result)
    # Move 'name' to the first column for readability in the CSV.
    df_result = df_result.set_index('name').reset_index()
    df_result.to_csv('data/result_fairness_{}.csv'.format(dataset_name), index=False)
    return df_result
# Example 16
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult

# Group definitions for the protected attribute 'sex' in the preprocessed
# Adult dataset (value 1 is treated as the privileged group here).
privileged_groups = [{'sex': 1}]
unprivileged_groups = [{'sex': 0}]

# Per-cycle metric accumulators, filled by the loop below.
cycles = []
metric_dataset_debiasing_train = []
metric_dataset_debiasing_test = []
metric_dataset_reweigh_train = []
metric_dataset_reweigh_test = []
dataset_orig = load_preproc_data_adult()
for i in range(10):
    train1, test1 = dataset_orig.split([0.7], shuffle=True)
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                   privileged_groups=privileged_groups)
    RW.fit(train1)
    dataset_transf_train = RW.transform(train1)
    sess = tf.Session()
    debiased_model = AdversarialDebiasing(privileged_groups = privileged_groups,
                              unprivileged_groups = unprivileged_groups,
                              scope_name='debiased_classifier',
                              debias=True,
                              sess=sess)
    debiased_model.fit(train1)
    dataset_debiasing_train = debiased_model.predict(train1)
    dataset_debiasing_test = debiased_model.predict(test1)
    metric_debiasing_train = BinaryLabelDatasetMetric(dataset_debiasing_train, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
    metric_debiasing_test = BinaryLabelDatasetMetric(dataset_debiasing_test, 
# Example 17
def _adversarial_predict(train, test, debias, privileged_groups,
                         unprivileged_groups):
    """Fit an AdversarialDebiasing model on *train* in a fresh TF1 session
    and return its predictions on *test* (session/graph torn down after)."""
    sess = tf.Session()
    model = AdversarialDebiasing(
        privileged_groups=privileged_groups,
        unprivileged_groups=unprivileged_groups,
        scope_name='debiased_classifier',
        debias=debias,
        sess=sess)
    model.fit(train)
    predictions = model.predict(test)
    sess.close()
    tf.reset_default_graph()
    return predictions


def _majority_vote(test, pred_datasets):
    """Return a copy of *test* whose labels are the per-instance majority
    vote over the predicted datasets in *pred_datasets*."""
    voted_labels = []
    for idx in range(len(test.features)):
        # Vote over the predicted labels; datasets themselves are not
        # indexable, so we index .labels explicitly.
        vote = mode([ds.labels[idx] for ds in pred_datasets])
        voted_labels.append(vote[0][0])
    ensemble = test.copy()
    ensemble.labels = np.array(voted_labels)
    return ensemble


def _collect_metrics(truth, preds, unprivileged_groups, privileged_groups):
    """Return (stat_par, disp_imp, eq_opp_diff, avg_odds_diff, theil, acc).

    Prediction-only group metrics (mean difference, disparate impact) come
    from BinaryLabelDatasetMetric on the predictions; metrics that need the
    ground truth (equal opportunity, average odds, Theil index, accuracy)
    come from ClassificationMetric, which is where aif360 defines them.
    """
    pred_metric = BinaryLabelDatasetMetric(
        preds,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    clf_metric = ClassificationMetric(
        truth,
        preds,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)
    return (pred_metric.mean_difference(),
            pred_metric.disparate_impact(),
            clf_metric.equal_opportunity_difference(),
            clf_metric.average_odds_difference(),
            clf_metric.theil_index(),
            clf_metric.accuracy())


def run_trial():
    """Compare four classifiers on the preprocessed Adult dataset, with and
    without reweighing, over 10 random 70/30 splits.

    The four classifiers per configuration are: adversarial debiasing,
    prejudice remover, a plain (non-debiased) adversarial net, and a
    majority-vote ensemble of the three. All evaluation is on the test split.

    Writes one CSV per metric ("metric0".."metric5" without reweighing and
    "metric0reweigh".."metric5reweigh" with it); each row is one trial, each
    column one classifier. Returns None.
    """
    # One sublist per trial; each sublist holds the four classifiers' values.
    stat_par = []
    disp_imp = []
    eq_opp_diff = []
    avg_odds_diff = []
    theil = []
    acc = []

    stat_par_reweigh = []
    disp_imp_reweigh = []
    eq_opp_diff_reweigh = []
    avg_odds_diff_reweigh = []
    theil_reweigh = []
    acc_reweigh = []

    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]

    for _trial in range(10):
        dataset_orig = load_preproc_data_adult()
        # train1/test1 hold the original split; dataset_transf_train becomes
        # the reweighed training set.
        train1, test1 = dataset_orig.split([0.7], shuffle=True)
        RW = Reweighing(unprivileged_groups=unprivileged_groups,
                        privileged_groups=privileged_groups)
        RW.fit(train1)
        dataset_transf_train = RW.transform(train1)

        # Round each instance weight to one decimal and scale by 10 so it can
        # serve as an integer duplication count below.
        for idx in range(dataset_transf_train.instance_weights.size):
            dataset_transf_train.instance_weights[idx] = (round(
                dataset_transf_train.instance_weights[idx] / 0.1) * 0.1) * 10
        weights = copy.deepcopy(dataset_transf_train.instance_weights)

        # Materialize the weights: append (weight - 1) extra copies of each
        # row's features/labels/protected attributes so learners that ignore
        # instance weights still see the reweighed distribution.
        for idx in range(dataset_transf_train.features.shape[0]):
            row = dataset_transf_train.features[idx].copy().reshape(1, -1)
            row_label = dataset_transf_train.labels[idx].copy().reshape(1, -1)
            row_protected = (dataset_transf_train.protected_attributes[idx]
                             .copy().reshape(1, -1))
            # round() guards against float noise (e.g. 6.999... truncating
            # to 6 under plain int()).
            for _ in range(int(round(weights[idx])) - 1):
                dataset_transf_train.features = np.concatenate(
                    (dataset_transf_train.features, row))
                dataset_transf_train.labels = np.concatenate(
                    (dataset_transf_train.labels, row_label))
                dataset_transf_train.protected_attributes = np.concatenate(
                    (dataset_transf_train.protected_attributes,
                     row_protected))

        # Every duplicated row now carries weight 1.
        dataset_transf_train.instance_weights = np.ones(
            dataset_transf_train.features.shape[0])

        # ---------------- without reweighing ----------------
        dataset_debiasing_test = _adversarial_predict(
            train1, test1, True, privileged_groups, unprivileged_groups)

        prejudice_model = PrejudiceRemover(eta=100, sensitive_attr='sex')
        prejudice_model.fit(train1)
        dataset_prejudice_test = prejudice_model.predict(test1)

        dataset_neural_test = _adversarial_predict(
            train1, test1, False, privileged_groups, unprivileged_groups)

        dataset_ensemble_test = _majority_vote(
            test1,
            [dataset_debiasing_test, dataset_prejudice_test,
             dataset_neural_test])

        rows = [_collect_metrics(test1, preds, unprivileged_groups,
                                 privileged_groups)
                for preds in (dataset_debiasing_test, dataset_prejudice_test,
                              dataset_neural_test, dataset_ensemble_test)]
        per_metric = list(zip(*rows))
        stat_par.append(list(per_metric[0]))
        disp_imp.append(list(per_metric[1]))
        eq_opp_diff.append(list(per_metric[2]))
        avg_odds_diff.append(list(per_metric[3]))
        theil.append(list(per_metric[4]))
        acc.append(list(per_metric[5]))

        # ---------------- with reweighing ----------------
        dataset_debiasing_test_rw = _adversarial_predict(
            dataset_transf_train, test1, True, privileged_groups,
            unprivileged_groups)

        prejudice_model_rw = PrejudiceRemover(eta=100, sensitive_attr='sex')
        prejudice_model_rw.fit(dataset_transf_train)
        dataset_prejudice_test_rw = prejudice_model_rw.predict(test1)

        dataset_neural_test_rw = _adversarial_predict(
            dataset_transf_train, test1, False, privileged_groups,
            unprivileged_groups)

        # Vote over the *reweighed* models' predictions (the earlier version
        # mistakenly reused the un-reweighed predictions here).
        dataset_ensemble_test_rw = _majority_vote(
            test1,
            [dataset_debiasing_test_rw, dataset_prejudice_test_rw,
             dataset_neural_test_rw])

        rows = [_collect_metrics(test1, preds, unprivileged_groups,
                                 privileged_groups)
                for preds in (dataset_debiasing_test_rw,
                              dataset_prejudice_test_rw,
                              dataset_neural_test_rw,
                              dataset_ensemble_test_rw)]
        per_metric = list(zip(*rows))
        stat_par_reweigh.append(list(per_metric[0]))
        disp_imp_reweigh.append(list(per_metric[1]))
        eq_opp_diff_reweigh.append(list(per_metric[2]))
        avg_odds_diff_reweigh.append(list(per_metric[3]))
        theil_reweigh.append(list(per_metric[4]))
        acc_reweigh.append(list(per_metric[5]))

    without_reweighing = [
        stat_par, disp_imp, eq_opp_diff, avg_odds_diff, theil, acc
    ]
    with_reweighing = [
        stat_par_reweigh, disp_imp_reweigh, eq_opp_diff_reweigh,
        avg_odds_diff_reweigh, theil_reweigh, acc_reweigh
    ]

    # NOTE(review): "wb" matches the Python 2 csv API; on Python 3 this
    # should be mode="w" with newline='' -- confirm the target interpreter.
    for metric_idx, sublist in enumerate(without_reweighing):
        with open("metric" + str(metric_idx), "wb") as csv_file:
            csv.writer(csv_file).writerows(sublist)

    for metric_idx, sublist in enumerate(with_reweighing):
        with open("metric" + str(metric_idx) + "reweigh", "wb") as csv_file:
            csv.writer(csv_file).writerows(sublist)
# Example 18
def Pre(algorithm_used, dataset_orig_train, dataset_orig_valid,
        dataset_orig_test, privileged_groups, unprivileged_groups,
        optim_options):
    """Apply the selected AIF360 pre-processing algorithm to all three splits.

    Parameters
    ----------
    algorithm_used : str
        One of "disparate_impact_remover", "lfr", "optim", "reweighing".
    dataset_orig_train, dataset_orig_valid, dataset_orig_test :
        AIF360 structured datasets for the three splits.
    privileged_groups, unprivileged_groups : list of dict
        AIF360 group definitions.
    optim_options : dict
        Options forwarded to OptimPreproc (only used for "optim").

    Returns
    -------
    tuple
        (train, valid, test) transformed datasets, each passed through
        ``align_datasets`` against its original counterpart. The test
        split's labels are restored from the original test set so that
        downstream metrics compare against ground truth.

    Raises
    ------
    ValueError
        If *algorithm_used* is not one of the supported names (previously
        this fell through and crashed with a NameError).
    """
    if algorithm_used == "disparate_impact_remover":
        # DisparateImpactRemover has no separate fit/transform, so each
        # split is repaired independently with fit_transform.
        DIC = DisparateImpactRemover(repair_level=1.0)
        dataset_transf_train = dataset_orig_train.align_datasets(
            DIC.fit_transform(dataset_orig_train))
        dataset_transf_valid = dataset_orig_valid.align_datasets(
            DIC.fit_transform(dataset_orig_valid))
        dataset_transf_test = dataset_orig_test.align_datasets(
            DIC.fit_transform(dataset_orig_test))
    else:
        # The remaining algorithms share the same fit-once / transform-all
        # pattern, so only the transformer construction differs.
        if algorithm_used == "lfr":
            transformer = LFR(unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)
        elif algorithm_used == "optim":
            transformer = OptimPreproc(
                OptTools,
                optim_options,
                unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
        elif algorithm_used == "reweighing":
            transformer = Reweighing(unprivileged_groups=unprivileged_groups,
                                     privileged_groups=privileged_groups)
        else:
            raise ValueError(
                "unknown pre-processing algorithm: {}".format(algorithm_used))

        # Fit on the training split only, then transform all three splits.
        transformer.fit(dataset_orig_train)
        dataset_transf_train = dataset_orig_train.align_datasets(
            transformer.transform(dataset_orig_train))
        dataset_transf_valid = dataset_orig_valid.align_datasets(
            transformer.transform(dataset_orig_valid))
        dataset_transf_test = dataset_orig_test.align_datasets(
            transformer.transform(dataset_orig_test))

    # Keep the true labels on the test split (LFR/optim may rewrite them).
    dataset_transf_test.labels = dataset_orig_test.labels

    return dataset_transf_train, dataset_transf_valid, dataset_transf_test
# Example 19
def reweight(ds, priv, unpriv):
    """Return *ds* with instance weights adjusted by AIF360's Reweighing.

    *priv* and *unpriv* are lists of group-definition dicts, e.g.
    ``[{'sex': 1}]`` / ``[{'sex': 0}]``.
    """
    balancer = Reweighing(unprivileged_groups=unpriv, privileged_groups=priv)
    return balancer.fit_transform(ds)