def __init__(self, step_name, step, df, sensitive_att, target_col, input_score=True, clf_threshold=0.5):
    """
    :param step_name: str, name of the current input step.
    :param step: object of the initialized class.
    :param df: pandas dataframe, stores the data.
    :param sensitive_att: str, the name of a sensitive attribute.
    :param target_col: str, the name of the target attribute.
    :param input_score: boolean, represents whether the post-processor takes the predicted score as input. Default is True.
    :param clf_threshold: float in [0, 1], the threshold used to derive class labels from predicted scores.
    """
    if "pred_" + target_col not in df.columns:
        raise ValueError("Require the predictions for " + target_col + " to exist in the data!")
    super().__init__(step_name=step_name, df=df, sensitive_att=sensitive_att, target_col=target_col)

    # assume the data set has been encoded to numerical values;
    # initialize a BinaryLabelDataset from AIF360
    aif_true_df = BinaryLabelDataset(df=df.drop(columns=["pred_" + target_col]),
                                     label_names=[target_col],
                                     protected_attribute_names=[sensitive_att])
    aif_pred_df = aif_true_df.copy()
    if input_score:
        aif_pred_df.scores = df["pred_" + target_col]
    else:
        aif_pred_df.labels = np.array([int(x >= clf_threshold) for x in df["pred_" + target_col]])

    self.input_score = input_score
    self.step = step.fit(aif_true_df, aif_pred_df)
    self.clf_threshold = clf_threshold
def test_generalized_entropy_index():
    data = np.array([[0, 1],
                     [0, 0],
                     [1, 0],
                     [1, 1],
                     [1, 0],
                     [1, 0],
                     [2, 1],
                     [2, 0],
                     [2, 1],
                     [2, 1]])
    pred = data.copy()
    pred[[3, 9], -1] = 0
    pred[[4, 5], -1] = 1
    df = pd.DataFrame(data, columns=['feat', 'label'])
    df2 = pd.DataFrame(pred, columns=['feat', 'label'])
    bld = BinaryLabelDataset(df=df, label_names=['label'],
                             protected_attribute_names=['feat'])
    bld2 = BinaryLabelDataset(df=df2, label_names=['label'],
                              protected_attribute_names=['feat'])
    cm = ClassificationMetric(bld, bld2)

    assert cm.generalized_entropy_index() == 0.2

    pred = data.copy()
    pred[:, -1] = np.array([0, 1, 1, 0, 0, 0, 0, 1, 1, 1])
    df2 = pd.DataFrame(pred, columns=['feat', 'label'])
    bld2 = BinaryLabelDataset(df=df2, label_names=['label'],
                              protected_attribute_names=['feat'])
    cm = ClassificationMetric(bld, bld2)
    assert cm.generalized_entropy_index() == 0.3
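# For reference: AIF360's generalized_entropy_index defaults to alpha = 2 with
# per-instance benefits b_i = y_hat_i - y_i + 1 and mean mu = mean(b), i.e.
#     GE(2) = 1 / (2 * n) * sum((b_i / mu) ** 2 - 1)
# In the first assertion above, b = [1, 1, 1, 0, 2, 2, 1, 1, 1, 0] and mu = 1,
# so GE(2) = (6 * 0 + 2 * (-1) + 2 * 3) / 20 = 0.2, matching the expected value.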
def __init__(self, data_set, index_train, index_validate, sensitive_variable):
    self.sensitive_variable_string = list(data_set.columns)[sensitive_variable]

    self.s_train = data_set.iloc[:index_train, sensitive_variable]
    self.s_validate = data_set.iloc[index_train:index_validate, sensitive_variable]
    self.s_test = data_set.iloc[index_validate:, sensitive_variable]

    self.y_train = data_set.iloc[:index_train, -1]
    self.y_validate = data_set.iloc[index_train:index_validate, -1]
    self.y_test = data_set.iloc[index_validate:, -1]

    self.train = BinaryLabelDataset(
        df=data_set.iloc[:index_train, :],
        label_names=['label'],
        protected_attribute_names=[self.sensitive_variable_string],
        favorable_label=1,
        unfavorable_label=0)
    self.validate = BinaryLabelDataset(
        df=data_set.iloc[index_train:index_validate, :],
        label_names=['label'],
        protected_attribute_names=[self.sensitive_variable_string],
        favorable_label=1,
        unfavorable_label=0)
    self.test = BinaryLabelDataset(
        df=data_set.iloc[index_validate:, :],
        label_names=['label'],
        protected_attribute_names=[self.sensitive_variable_string],
        favorable_label=1,
        unfavorable_label=0)
def test_between_group():
    data = np.array([[0, 0, 1],
                     [0, 1, 0],
                     [1, 1, 0],
                     [1, 1, 1],
                     [1, 0, 0],
                     [1, 0, 0]])
    pred = data.copy()
    pred[[0, 3], -1] = 0
    pred[[4, 5], -1] = 1
    df = pd.DataFrame(data, columns=['feat', 'feat2', 'label'])
    df2 = pd.DataFrame(pred, columns=['feat', 'feat2', 'label'])
    bld = BinaryLabelDataset(df=df, label_names=['label'],
                             protected_attribute_names=['feat', 'feat2'])
    bld2 = BinaryLabelDataset(df=df2, label_names=['label'],
                              protected_attribute_names=['feat', 'feat2'])
    cm = ClassificationMetric(bld, bld2,
                              unprivileged_groups=[{'feat': 0}],
                              privileged_groups=[{'feat': 1}])

    b = np.array([0.5, 0.5, 1.25, 1.25, 1.25, 1.25])
    assert cm.between_group_generalized_entropy_index() == 1 / 12 * np.sum(b**2 - 1)
def dataset_from_matrix(x, dataset):
    df = pd.DataFrame(data=x, columns=dataset.feature_names + dataset.label_names)
    dataset_ = BinaryLabelDataset(df=df, label_names=dataset.label_names,
                                  protected_attribute_names=dataset.protected_attribute_names)

    dataset_ = dataset.align_datasets(dataset_)
    #dataset_.favorable_label = dataset.favorable_label
    dataset_.validate_dataset()
    return dataset_
def equal_ops_values(random_data, predicted_data, target_variable, protected_variable, unprivileged_input):
    random_data['Pred'] = np.random.binomial(1, .5, 1000)
    dataset = BinaryLabelDataset(df=random_data,
                                 label_names=[target_variable],
                                 protected_attribute_names=[protected_variable])
    classified_dataset = BinaryLabelDataset(df=predicted_data,
                                            label_names=[target_variable],
                                            protected_attribute_names=[protected_variable])

    privileged_group = []
    for v in predicted_data[protected_variable].unique()[predicted_data[protected_variable].unique() != unprivileged_input]:
        privileged_group.append({protected_variable: v})
    unprivileged_group = [{protected_variable: unprivileged_input}]  # female=0

    metric = ClassificationMetric(dataset, classified_dataset, unprivileged_group, privileged_group)
    return abs(metric.equal_opportunity_difference())
def reweigh_and_predict(df1, df2):
    # concatenate the data and clean it
    df = pandas.concat([df1, df2])
    ntrain = 5410  # len(df1)
    ntest = 1804   # len(df2)
    df1 = df
    #df = pandas.read_csv("compas.csv")
    df = pandas.get_dummies(df, prefix=['sex', 'race', 'c_charge_degree'],
                            drop_first=True)
    df = df.rename(columns={
        'race_Non-White': 'race',
        'sex_Male': 'sex',
        'c_charge_degree_M': 'charge_degree'
    })

    # set up the BinaryLabelDataset
    label_names = ['two_year_recid']
    protected_attribute_names = ['race']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)
    train_data = BinaryLabelDataset(
        df=train_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)
    test_data = BinaryLabelDataset(
        df=test_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)

    privileged_groups = [{'race': 1}]
    unprivileged_groups = [{'race': 0}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(train_data)
    dataset_transf_train = RW.transform(train_data)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    lmod = LogisticRegression()
    lmod.fit(X_train, y_train,
             sample_weight=dataset_transf_train.instance_weights)
    y_train_pred = lmod.predict(X_train)

    dataset_transf_test_pred = test_data
    # reuse the scaler fitted on the training features instead of refitting it on the test set
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    y_test = dataset_transf_test_pred.labels
    dataset_transf_test_pred.scores = lmod.predict(X_test)
    Y_hat = dataset_transf_test_pred.scores

    return Y_hat
def create_df_aif(df_train, df_test, label, protected_attribute, metadata):
    df_train_aif = BinaryLabelDataset(df=df_train,
                                      label_names=[label],
                                      protected_attribute_names=[protected_attribute],
                                      instance_weights_name=None,
                                      unprivileged_protected_attributes=[],
                                      privileged_protected_attributes=[],
                                      metadata=metadata)
    df_test_aif = BinaryLabelDataset(df=df_test,
                                     label_names=[label],
                                     protected_attribute_names=[protected_attribute],
                                     instance_weights_name=None,
                                     unprivileged_protected_attributes=[],
                                     privileged_protected_attributes=[],
                                     metadata=metadata)
    return df_train_aif, df_test_aif
def _preprocess_data(
    data, protected_attribute_name, protected_attribute_index, label_name, required_fairness
):
    import numpy as np
    import tensorflow as tf
    from pandas import DataFrame
    from aif360.datasets import BinaryLabelDataset

    dataset = BinaryLabelDataset(
        df=DataFrame(data),
        protected_attribute_names=[protected_attribute_name],
        label_names=[label_name],
        favorable_label=2,
        unfavorable_label=1,
    )
    train, test = dataset.split([0.8])

    from aif360.algorithms.inprocessing import AdversarialDebiasing

    sess = tf.compat.v1.Session()
    debiaser = AdversarialDebiasing(
        unprivileged_groups=({protected_attribute_name: 0},),
        privileged_groups=({protected_attribute_name: 1},),
        scope_name="debiaser",
        debias=True,
        sess=sess,
    )
    debiaser.fit(train)

    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(class_weight="balanced")
    X_tr = np.delete(train.features, protected_attribute_index, axis=1)
    y_tr = train.labels.ravel()
    model.fit(X_tr, y_tr)

    test_pred = test.copy(deepcopy=True)
    test_pred.scores = model.predict(
        np.delete(debiaser.predict(test).features, protected_attribute_index, axis=1))
    # fraction of test instances whose predicted label matches the true label
    accuracy = np.mean(np.equal(test.labels.ravel(), test_pred.scores))

    from aif360.metrics import ClassificationMetric

    disparate_impact = ClassificationMetric(
        test,
        test_pred,
        unprivileged_groups=({protected_attribute_name: 0},),
        privileged_groups=({protected_attribute_name: 1},),
    ).disparate_impact()
    print(f"Accuracy: {accuracy}")
    print(f"Disparate impact: {disparate_impact}")
    if disparate_impact > float(required_fairness):
        raise ValueError(
            f"Too unfair! Disparate impact was {disparate_impact} but must be less than {required_fairness}"
        )
def test(dataset, model, x_test, thresh_arr, unprivileged_groups, privileged_groups):
    # note: k, model_AIF, y_test and A_test are expected to be defined in the enclosing scope
    bld = BinaryLabelDataset(df=dataset, label_names=['labels'],
                             protected_attribute_names=['age'])
    if np.isin(k, model_AIF):
        y_val_pred_prob = model.predict_proba(bld)
    else:
        y_val_pred_prob, A_val_pred_prob = model.predict_proba(x_test)

    metric_arrs = np.empty([0, 8])
    for thresh in thresh_arr:
        if np.isin(k, model_AIF):
            y_val_pred = (y_val_pred_prob > thresh).astype(np.float64)
        else:
            y_val_pred = (y_val_pred_prob.numpy() > thresh).astype(np.float64)

        metric_arrs = np.append(metric_arrs, roc_auc_score(y_test, y_val_pred_prob))
        if np.isin(k, model_AIF):
            metric_arrs = np.append(metric_arrs, 0)
        else:
            metric_arrs = np.append(metric_arrs, roc_auc_score(A_test, A_val_pred_prob))

        dataset_pred = dataset.copy()
        dataset_pred.labels = y_val_pred
        bld2 = BinaryLabelDataset(df=dataset_pred, label_names=['labels'],
                                  protected_attribute_names=['age'])
        metric = ClassificationMetric(bld, bld2,
                                      unprivileged_groups=unprivileged_groups,
                                      privileged_groups=privileged_groups)

        metric_arrs = np.append(
            metric_arrs,
            (metric.true_positive_rate() + metric.true_negative_rate()) / 2)
        metric_arrs = np.append(metric_arrs, metric.average_odds_difference())
        metric_arrs = np.append(metric_arrs, metric.disparate_impact())
        metric_arrs = np.append(metric_arrs, metric.statistical_parity_difference())
        metric_arrs = np.append(metric_arrs, metric.equal_opportunity_difference())
        metric_arrs = np.append(metric_arrs, metric.theil_index())

    return metric_arrs
def reweigh_and_predict(df1, df2):
    # concatenate the data and clean it
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)
    #df = pandas.read_csv("UCIAdult.csv")

    df = pandas.get_dummies(df, prefix=['income', 'sex', 'native_country', 'marital_status',
                                        'workclass', 'occupation'],
                            drop_first=True)
    df = df.rename(columns={'income_>50K': 'income',
                            'sex_Female': 'sex',
                            'native_country_United-States': 'native_country',
                            'marital_status_Not-Married': 'marital_status'})
    #df = df.drop(columns = ['Unnamed: 0'])

    # set up the BinaryLabelDataset
    label_names = ['income']
    protected_attribute_names = ['sex']
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)
    train_data = BinaryLabelDataset(
        df=train_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)
    test_data = BinaryLabelDataset(
        df=test_data,
        label_names=label_names,
        protected_attribute_names=protected_attribute_names)

    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(train_data)
    dataset_transf_train = RW.transform(train_data)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    lmod = LogisticRegression()
    lmod.fit(X_train, y_train,
             sample_weight=dataset_transf_train.instance_weights)
    y_train_pred = lmod.predict(X_train)

    dataset_transf_test_pred = test_data
    # reuse the scaler fitted on the training features instead of refitting it on the test set
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    y_test = dataset_transf_test_pred.labels
    dataset_transf_test_pred.scores = lmod.predict(X_test)
    Y_hat = dataset_transf_test_pred.scores

    return Y_hat
def odds_diff(random_data, predicted_data, target_variable, protected_variable, unprivileged_input):
    random_data['Pred'] = np.random.binomial(1, .5, 1000)
    dataset = BinaryLabelDataset(df=random_data,
                                 label_names=[target_variable],
                                 protected_attribute_names=[protected_variable])
    classified_dataset = BinaryLabelDataset(df=predicted_data,
                                            label_names=[target_variable],
                                            protected_attribute_names=[protected_variable])

    privileged_group = []
    for v in predicted_data[protected_variable].unique()[predicted_data[protected_variable].unique() != unprivileged_input]:
        privileged_group.append({protected_variable: v})
    unprivileged_group = [{protected_variable: unprivileged_input}]  # female=0

    metric = ClassificationMetric(dataset, classified_dataset, unprivileged_group, privileged_group)
    print(metric.average_abs_odds_difference())
    if abs(metric.average_abs_odds_difference().round(3)) < 0.2:
        print('The algorithm can be considered to be not biased')
    else:
        print('There is a potential bias')
def __init__(self, *args, **kwargs):
    # remove arguments for sim_args constructor
    sim_args_names = ['mutable_features', 'domains', 'cost_fns', 'discrete']
    sim_args = {k: kwargs.pop(k, None) for k in sim_args_names}

    self.means = kwargs.pop('means', [45, 60])
    self.N = kwargs.pop('N', 1000)
    self.threshold = kwargs.pop('threshold', 55)
    self.human_readable_labels = {}

    df = self._generateData(means=self.means, N=self.N, threshold=self.threshold)

    kwargs = {'df': df,
              'label_names': ['y'],
              'protected_attribute_names': ['group']}
    BinaryLabelDataset.__init__(self, **kwargs)
    SimMixin.__init__(self, **sim_args)
def get_transformed_data(dataset='data/simulated_data.csv', protected_attribute='group'):
    sample_data = pd.read_csv(dataset, header=0)
    pre_transform = BinaryLabelDataset(1.0, 0.0,
                                       df=sample_data,
                                       label_names=['outcome'],
                                       protected_attribute_names=[protected_attribute])
    RW = Reweighing(unprivileged_groups=[{'group': 0}],
                    privileged_groups=[{'group': 1}])
    # RW.fit(pre_transform)
    post_transform = RW.fit_transform(pre_transform)

    ds = post_transform.convert_to_dataframe()[0]
    X = ds.drop('outcome', axis=1)
    y = ds['outcome']
    return {
        'simulated_data': {
            'data': X.values,
            'labels': y.values,
            'participant_ids': np.arange(0, len(ds)),
            'feature_names': np.array([f for f in ds if f not in ['outcome']])
        }
    }
def create_binary_dataset_sb():
    """Create a binary dataset from the company_x_sb.csv file, using
    'new_signing_bonus' as the binary label.

    Out - An AIF360 binary dataset with one-hot encoded categorical columns.
    """
    data = pd.read_csv('../company_x_sb.csv', index_col='employee_id')
    data_with_label = data.copy()
    data_with_label['sex'] = data_with_label['sex'].transform(lambda x: x == 'M').astype(int)

    std_data = StandardDataset(df=data_with_label,
                               label_name='new_signing_bonus',
                               favorable_classes=[1],
                               protected_attribute_names=['sex'],
                               privileged_classes=[[1]],
                               categorical_features=['degree_level', 'dept'],
                               features_to_drop=['boss_id'])
    df_data = std_data.convert_to_dataframe()
    binary_dataset = BinaryLabelDataset(favorable_label=1,
                                        unfavorable_label=0,
                                        df=df_data[0],
                                        label_names=['new_signing_bonus'],
                                        protected_attribute_names=['sex'])
    return binary_dataset
def get_fairness_metrics():
    def get_bldm_metrics():
        metric_BLDM = BinaryLabelDatasetMetric(dataset, unprivileged_group, privileged_group)
        return {"Statistical Parity Difference": metric_BLDM.statistical_parity_difference(),
                "Disparate Impact": metric_BLDM.disparate_impact()}

    def get_cm_metrics():
        df_pred = X.copy()
        df_pred[df.columns[-1]] = np.expand_dims(ypred_class, axis=1)
        dataset_pred = BinaryLabelDataset(df=df_pred,
                                          label_names=['action_taken_name'],
                                          protected_attribute_names=['applicant_sex_name_Female'])
        metric_CM = ClassificationMetric(dataset, dataset_pred,
                                         privileged_groups=privileged_group,
                                         unprivileged_groups=unprivileged_group)
        return {
            "Equal Opportunity Difference": metric_CM.equal_opportunity_difference(),
            "Average Odds Difference": metric_CM.average_odds_difference(),
            "Accuracy Male": metric_CM.accuracy(privileged=True),
            "Accuracy Female": metric_CM.accuracy(privileged=False)
        }

    dataset = BinaryLabelDataset(df=df,
                                 label_names=['action_taken_name'],
                                 protected_attribute_names=['applicant_sex_name_Female'])
    privileged_group = [{'applicant_sex_name_Female': 0}]
    unprivileged_group = [{'applicant_sex_name_Female': 1}]
    return {**get_bldm_metrics(), **get_cm_metrics()}
def make_dataset(features, labels=None, scores=None,
                 protected_columns=None, privileged_groups=None, unprivileged_groups=None,
                 favorable_label=None, unfavorable_label=None):
    df = features.copy()
    if labels is None:
        labels = favorable_label
    df['outcome'] = labels

    if scores is not None:
        # pass the score column name as a list so AIF360 keeps a 2-D scores array
        scores_names = ['scores']
        df['scores'] = scores
    else:
        scores_names = []

    dataset = BinaryLabelDataset(
        df=df,
        label_names=['outcome'],
        scores_names=scores_names,
        protected_attribute_names=protected_columns,
        favorable_label=favorable_label,
        unfavorable_label=unfavorable_label,
        unprivileged_protected_attributes=unprivileged_groups)
    return dataset
def get_adult_data():
    '''
    Preprocess the adult data set by removing some features and putting the adult data
    into a BinaryLabelDataset.
    You need to download the adult dataset (both the adult.data and adult.test files) from
    https://archive.ics.uci.edu/ml/datasets/Adult
    '''
    headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-stataus',
               'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
               'hours-per-week', 'native-country', 'y']
    train = pd.read_csv('adult.data', header=None)
    test = pd.read_csv('adult.test', header=None)
    df = pd.concat([train, test], ignore_index=True)
    df.columns = headers

    df['y'] = df['y'].replace({' <=50K.': 0, ' >50K.': 1, ' >50K': 1, ' <=50K': 0})

    df = df.drop(df[(df[headers[-2]] == ' ?') | (df[headers[6]] == ' ?')].index)

    df = pd.get_dummies(df, columns=[headers[1], headers[5], headers[6], headers[7],
                                     headers[9], headers[8], 'native-country'])

    delete_these = ['race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black',
                    'race_ Other', 'sex_ Female']

    delete_these += ['native-country_ Cambodia', 'native-country_ Canada', 'native-country_ China',
                     'native-country_ Columbia', 'native-country_ Cuba',
                     'native-country_ Dominican-Republic', 'native-country_ Ecuador',
                     'native-country_ El-Salvador', 'native-country_ England',
                     'native-country_ France', 'native-country_ Germany', 'native-country_ Greece',
                     'native-country_ Guatemala', 'native-country_ Haiti',
                     'native-country_ Holand-Netherlands', 'native-country_ Honduras',
                     'native-country_ Hong', 'native-country_ Hungary', 'native-country_ India',
                     'native-country_ Iran', 'native-country_ Ireland', 'native-country_ Italy',
                     'native-country_ Jamaica', 'native-country_ Japan', 'native-country_ Laos',
                     'native-country_ Mexico', 'native-country_ Nicaragua',
                     'native-country_ Outlying-US(Guam-USVI-etc)', 'native-country_ Peru',
                     'native-country_ Philippines', 'native-country_ Poland',
                     'native-country_ Portugal', 'native-country_ Puerto-Rico',
                     'native-country_ Scotland', 'native-country_ South', 'native-country_ Taiwan',
                     'native-country_ Thailand', 'native-country_ Trinadad&Tobago',
                     'native-country_ United-States', 'native-country_ Vietnam',
                     'native-country_ Yugoslavia']

    delete_these += ['fnlwgt', 'education']

    df.drop(delete_these, axis=1, inplace=True)

    return BinaryLabelDataset(df=df, label_names=['y'],
                              protected_attribute_names=['sex_ Male', 'race_ White'])
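# Minimal usage sketch for the function above (assumes adult.data and adult.test are
# in the working directory; the group definitions below are illustrative, not part of
# the original code).
from aif360.metrics import BinaryLabelDatasetMetric

adult_bld = get_adult_data()
sex_metric = BinaryLabelDatasetMetric(adult_bld,
                                      unprivileged_groups=[{'sex_ Male': 0}],
                                      privileged_groups=[{'sex_ Male': 1}])
print(sex_metric.mean_difference())   # difference in favorable-label base rates
print(sex_metric.disparate_impact())  # ratio of favorable-label base rates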
def prepare_dataset(features, labels, protected_attribute,
                    privileged_attribute_values, unprivileged_attribute_values,
                    favorable_label=1., unfavorable_label=0.):
    """Prepare dataset for computing fairness metrics."""
    df = features.copy()
    df['outcome'] = labels
    return BinaryLabelDataset(
        df=df,
        label_names=['outcome'],
        scores_names=[],
        protected_attribute_names=[protected_attribute],
        privileged_protected_attributes=[np.array(privileged_attribute_values)],
        unprivileged_protected_attributes=[np.array(unprivileged_attribute_values)],
        favorable_label=favorable_label,
        unfavorable_label=unfavorable_label,
    )
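# Hedged usage sketch: the toy column names and values below are illustrative
# assumptions, not taken from the original source.
import numpy as np
import pandas as pd
from aif360.metrics import BinaryLabelDatasetMetric

toy_features = pd.DataFrame({'sex': [0, 0, 1, 1], 'score': [0.2, 0.7, 0.4, 0.9]})
toy_labels = [0, 1, 0, 1]
bld = prepare_dataset(toy_features, toy_labels, 'sex',
                      privileged_attribute_values=[1],
                      unprivileged_attribute_values=[0])
metric = BinaryLabelDatasetMetric(bld,
                                  unprivileged_groups=[{'sex': 0}],
                                  privileged_groups=[{'sex': 1}])
print(metric.statistical_parity_difference())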
def apply(self, df): """ :param df: pandas dataframe, stores the data to apply the learned discretizer. :return: pandas dataframe, stores the data after discretize. """ if self.na_mark: df = df.replace({self.na_mark:np.nan}) if self.fair_aware: # fair-preprocessor aif_df = BinaryLabelDataset(df=df, label_names=[self.target_col], protected_attribute_names=[self.sensitive_att]) if self.fit_flag: # fit has been initialized after_aif_df = self.step.transform(aif_df) else: # fit and transform is combined, e.g. DisparateImpactRemover after_aif_df = self.step.fit_transform(aif_df) after_df, _ = after_aif_df.convert_to_dataframe(de_dummy_code=True, sep='=', set_category=True) if self.weight_flag: preprocessed_weights = after_aif_df.instance_weights else: # regular preprocessor after_df = df.copy() for ai in self.focus_atts: after_df[ai] = self.step[ai].transform(np.array(after_df[ai]).reshape(-1, 1)) if self.weight_flag: # for the preprocessor that updates weights, e.g. Reweighing return after_df, preprocessed_weights else: return after_df
def __init__(self, step_name, df, step=None, focus_atts=[], fit_flag=True, weight_flag=False,
             sensitive_att=None, target_col=None, fair_aware=False, na_mark=None):
    """
    :param step_name: str, name of the current input step.
    :param df: pandas dataframe, stores the data.
    :param step: object of the initialized class. If None, initialize here.
    :param focus_atts: list of str, each str represents the name of a column in the above data that will be pre-processed.
    :param fit_flag: boolean, whether to initialize the step object here.
    :param weight_flag: boolean, whether to output extra sample weights after the fair-preprocessor.
    :param sensitive_att: str, the name of a sensitive attribute.
    :param target_col: str, the name of the target attribute.
    :param fair_aware: boolean, whether the preprocessor is fair-aware. Default is False. If True, sensitive_att and target_col can not be null.
    """
    super().__init__(step_name=step_name, df=df, focus_atts=focus_atts,
                     sensitive_att=sensitive_att, target_col=target_col)

    if len(focus_atts) > 0 and fit_flag:
        fitted_step = {}
        for idx, ai in enumerate(focus_atts):
            fitted_step[ai] = step[ai].fit(np.array(df[ai]).reshape(-1, 1))
        self.step = fitted_step
    elif fair_aware and fit_flag:  # for fair-preprocessors
        aif_df = BinaryLabelDataset(df=df, label_names=[target_col],
                                    protected_attribute_names=[sensitive_att])
        self.step = step.fit(aif_df)
    else:
        if step is not None:
            self.step = step

    # address different encodings of missing values
    if na_mark is not None:
        self.na_mark = na_mark
    else:
        self.na_mark = None

    self.fair_aware = fair_aware
    self.fit_flag = fit_flag
    self.weight_flag = weight_flag
def fairness_IBM(y_pred, Ztr, ytr, verbose=0):
    from collections import defaultdict
    from aif360.datasets import BinaryLabelDataset
    from aif360.metrics import ClassificationMetric

    assert np.array_equal(np.unique(Ztr), np.array([0, 1])), "Z must contain either 0 or 1"

    # if len(ytr.shape) == 1:
    #     ytr = np.expand_dims(ytr, -1)
    Ztr = np.squeeze(Ztr)
    if verbose:
        print(ytr.shape)
        print(Ztr.shape)

    unprivileged_groups = [{"zs": [0]}]
    privileged_groups = [{"zs": [1]}]
    metric_arrs = defaultdict(list)

    dict_ = {"y_true": ytr, "zs": Ztr}
    df = pd.DataFrame(dict_)
    dataset = BinaryLabelDataset(df=df, label_names=["y_true"],
                                 protected_attribute_names=["zs"],
                                 unprivileged_protected_attributes=[[0]],
                                 privileged_protected_attributes=[[1]])
    dataset_pred = dataset.copy()
    dataset_pred.labels = y_pred

    metric = ClassificationMetric(dataset, dataset_pred,
                                  unprivileged_groups=unprivileged_groups,
                                  privileged_groups=privileged_groups)

    # metric_arrs['bal_acc'].append((metric.true_positive_rate()
    #                                + metric.true_negative_rate()) / 2)
    metric_arrs["EA"].append(metric.accuracy(privileged=False) - metric.accuracy(privileged=True))

    # ASSUMING ALL OTHER METRICS RETURN U - P
    metric_arrs['EO'].append(metric.average_odds_difference())
    # The ideal value of disparate impact is 1.0:
    # a value < 1 implies higher benefit for the privileged group,
    # and a value > 1 implies higher benefit for the unprivileged group.
    metric_arrs['DI'].append(metric.disparate_impact() - 1)
    metric_arrs['DP'].append(metric.statistical_parity_difference())
    metric_arrs['EQ'].append(metric.equal_opportunity_difference())
    metric_arrs['TH'].append(metric.between_group_theil_index() * 10)

    results = pd.DataFrame(metric_arrs)
    return results
def create_binary(data, target_variable, protected_variable, unprivileged_input):
    df_aif = BinaryLabelDataset(df=data,
                                label_names=[target_variable],
                                protected_attribute_names=[protected_variable])

    privileged_group = []
    for v in data[protected_variable].unique()[data[protected_variable].unique() != unprivileged_input]:
        privileged_group.append({protected_variable: v})
    unprivileged_group = [{protected_variable: unprivileged_input}]  # female=0

    return BinaryLabelDatasetMetric(df_aif,
                                    unprivileged_groups=unprivileged_group,
                                    privileged_groups=privileged_group)
def mean_diff_values(data, target_variable, protected_variable, unprivileged_input):
    df_aif = BinaryLabelDataset(df=data,
                                label_names=[target_variable],
                                protected_attribute_names=[protected_variable])

    privileged_group = []
    for v in data[protected_variable].unique()[data[protected_variable].unique() != unprivileged_input]:
        privileged_group.append({protected_variable: v})
    unprivileged_group = [{protected_variable: unprivileged_input}]  # female=0

    metric_orig = BinaryLabelDatasetMetric(df_aif, unprivileged_group, privileged_group)
    return abs(metric_orig.mean_difference().round(3))
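# Hedged usage sketch (the toy data and column names below are illustrative assumptions):
import pandas as pd

toy = pd.DataFrame({'gender': [0, 0, 1, 1, 1, 0],
                    'hired':  [0, 1, 1, 1, 0, 0]})
# absolute difference in favorable-label rates between gender == 1 and gender == 0
print(mean_diff_values(toy, 'hired', 'gender', unprivileged_input=0))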
def fit(self, data, labels, prot):
    ds = BinaryLabelDataset(df=data, label_names=labels, protected_attribute_names=prot)
    self.prot = prot
    x = self.model_reweight.fit_transform(ds)
    # drop the protected attribute column before fitting the downstream model
    index = x.feature_names.index(prot[0])
    x_train = np.delete(x.features, index, 1)
    y_train = x.labels.ravel()
    self.model.fit(x_train, y_train)
def _make_dataset(data, outcome, protected_columns, privileged_groups, unprivileged_groups,
                  favorable_label, unfavorable_label):
    df = data.copy()
    df['outcome'] = data[outcome].values
    dataset = BinaryLabelDataset(df=df,
                                 label_names=['outcome'],
                                 protected_attribute_names=protected_columns,
                                 favorable_label=favorable_label,
                                 unfavorable_label=unfavorable_label,
                                 unprivileged_protected_attributes=unprivileged_groups)
    return dataset
def test_between_all_groups():
    data = np.array([[0, 1],
                     [0, 0],
                     [1, 0],
                     [1, 1],
                     [1, 0],
                     [1, 0],
                     [2, 1],
                     [2, 0],
                     [2, 1],
                     [2, 1]])
    pred = data.copy()
    pred[[3, 9], -1] = 0
    pred[[4, 5], -1] = 1
    df = pd.DataFrame(data, columns=['feat', 'label'])
    df2 = pd.DataFrame(pred, columns=['feat', 'label'])
    bld = BinaryLabelDataset(df=df, label_names=['label'],
                             protected_attribute_names=['feat'])
    bld2 = BinaryLabelDataset(df=df2, label_names=['label'],
                              protected_attribute_names=['feat'])
    cm = ClassificationMetric(bld, bld2)

    b = np.array([1, 1, 1.25, 1.25, 1.25, 1.25, 0.75, 0.75, 0.75, 0.75])
    assert cm.between_all_groups_generalized_entropy_index() == 1 / 20 * np.sum(b**2 - 1)
def fit(self, data, labels, prot):
    ds = BinaryLabelDataset(df=data, label_names=labels, protected_attribute_names=prot)
    self.prot = prot
    x = self.model_reweight.fit_transform(ds)
    index = x.feature_names.index(prot[0])
    x_train = np.delete(x.features, index, 1)
    y_train = x.labels

    x_train = torch.tensor(x_train).type('torch.FloatTensor')
    y_train = torch.tensor(y_train).type('torch.FloatTensor')
    self.model.fit(x_train, y_train)
def apply(self, df): """ :param df: pandas dataframe, stores the data to apply the learned discretizer. :return: pandas dataframe, stores the data after discretize. """ # initialize AIF360 BinaryLabelDataset if self.input_score: # use score prediction to fit model, e.g. RejectOptionClassification, CalibratedEqOddsPostprocessing aif_pred_df = BinaryLabelDataset(df=df, label_names=[self.target_col], scores_names=[self.pred_target_col], protected_attribute_names=[self.sensitive_att]) else: # use label prediction to fit model, e.g. EqOddsPostprocessing df["pred_label_"+self.target_col] = [int(x >= self.clf_threshold) for x in df[self.pred_target_col]] aif_pred_df = BinaryLabelDataset(df=df.drop(columns=[self.pred_target_col]), label_names=["pred_label_"+self.target_col], protected_attribute_names=[self.sensitive_att]) after_aif_df = self.step.predict(aif_pred_df) after_df, _ = after_aif_df.convert_to_dataframe(de_dummy_code=True, sep='=', set_category=True) after_df[self.pred_target_col] = after_aif_df.labels return after_df
def reweigh_and_predict(df1, df2):
    label_names = ['Y']
    protected_attribute_names = ['A']
    df = pandas.concat([df1, df2])
    ntrain = len(df1)
    ntest = len(df2)
    train_data = df.head(ntrain)
    test_data = df.tail(ntest)
    train_data = BinaryLabelDataset(df=train_data,
                                    label_names=label_names,
                                    protected_attribute_names=protected_attribute_names)
    test_data = BinaryLabelDataset(df=test_data,
                                   label_names=label_names,
                                   protected_attribute_names=protected_attribute_names)

    privileged_groups = [{'A': 0}]
    unprivileged_groups = [{'A': 1}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    RW.fit(train_data)
    dataset_transf_train = RW.transform(train_data)

    scale_transf = StandardScaler()
    X_train = scale_transf.fit_transform(dataset_transf_train.features)
    y_train = dataset_transf_train.labels.ravel()

    lmod = LogisticRegression()
    lmod.fit(X_train, y_train,
             sample_weight=dataset_transf_train.instance_weights)
    y_train_pred = lmod.predict(X_train)

    dataset_transf_test_pred = test_data
    # reuse the scaler fitted on the training features instead of refitting it on the test set
    X_test = scale_transf.transform(dataset_transf_test_pred.features)
    y_test = dataset_transf_test_pred.labels
    dataset_transf_test_pred.scores = lmod.predict_proba(X_test)[:, 1:2].ravel()
    Y_hat = dataset_transf_test_pred.scores

    return Y_hat
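# Hedged usage sketch with synthetic data; the column names 'A' and 'Y' follow the
# function above, everything else here is an illustrative assumption.
import numpy as np
import pandas

rng = np.random.default_rng(0)
toy = pandas.DataFrame({'x1': rng.normal(size=200),
                        'A': rng.integers(0, 2, size=200),
                        'Y': rng.integers(0, 2, size=200)})
scores = reweigh_and_predict(toy.iloc[:150], toy.iloc[150:])
print(scores[:5])  # predicted P(Y = 1) for the first test rows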