def biasf(df_aeq, xtab):
    """
    args:
        df_aeq (DataFrame): Aequitas-compliant frame with the features over
            which we want to measure disparity.
        xtab (DataFrame): group crosstab produced by Group().get_crosstabs().
    returns:
        bdf, disparities, disparities_majority, disparities_min, bias
    """
    bias = Bias()
    bdf = bias.get_disparity_predefined_groups(
        xtab,
        original_df=df_aeq,
        ref_groups_dict={'reference_group': 'High'},
        alpha=0.05,
        check_significance=True,
        mask_significance=True)

    # Storing metadata (aq_metadata is a module-level dict defined elsewhere)
    aq_metadata["value_k"] = list(bdf["k"])[0]

    disparities = bdf[['attribute_name', 'attribute_value'] +
                      bias.list_disparities(bdf)].round(2)

    majority_bdf = bias.get_disparity_major_group(xtab, original_df=df_aeq)
    disparities_majority = majority_bdf[
        ['attribute_name', 'attribute_value'] +
        bias.list_disparities(majority_bdf)].round(2)

    min_bdf = bias.get_disparity_min_metric(xtab, original_df=df_aeq)
    disparities_min = min_bdf[['attribute_name', 'attribute_value'] +
                              bias.list_disparities(min_bdf)].round(2)

    return bdf, disparities, disparities_majority, disparities_min, bias
def get_bias_metrics(data):
    bias = Bias()
    group = Group()

    old_columns = ['predictions', 'loan_status', 'forty_plus_indicator']
    new_columns = ['score', 'label_value', 'forty_plus_indicator']

    scored_data = data.loc[:, old_columns]
    renamer = dict(zip(scored_data.columns, new_columns))
    scored_data = scored_data.rename(columns=renamer)

    data_processed, _ = preprocess_input_df(scored_data)
    xtab, _ = group.get_crosstabs(data_processed)

    attribute_columns = ['attribute_name', 'attribute_value']
    absolute_metrics = group.list_absolute_metrics(xtab)
    absolute_metrics_df = xtab[attribute_columns + absolute_metrics].round(2)

    bias_df = bias.get_disparity_predefined_groups(
        xtab,
        original_df=data_processed,
        ref_groups_dict={'forty_plus_indicator': 'Under Forty'},
        alpha=0.05,
        mask_significance=True
    )
    calculated_disparities = bias.list_disparities(bias_df)
    disparity_metrics_df = bias_df[attribute_columns + calculated_disparities]

    abs_metrics = absolute_metrics_df.where(
        pd.notnull(absolute_metrics_df), None).to_dict(orient='records')
    disp_metrics = disparity_metrics_df.where(
        pd.notnull(disparity_metrics_df), None).to_dict(orient='records')

    return dict(absolute_metrics=abs_metrics, disparity_metrics=disp_metrics)
def metrics(data):
    data = pd.DataFrame(data)

    # To measure bias towards gender, filter the DataFrame down to
    # "score", "label_value" (ground truth), and "gender" (protected attribute)
    data_scored = data[["score", "label_value", "gender"]]

    # Process DataFrame
    data_scored_processed, _ = preprocess_input_df(data_scored)

    # Group metrics
    g = Group()
    xtab, _ = g.get_crosstabs(data_scored_processed)

    # Absolute metrics, such as 'tpr', 'tnr', 'precision', etc.
    absolute_metrics = g.list_absolute_metrics(xtab)

    # DataFrame of calculated absolute metrics for each sample population group
    absolute_metrics_df = xtab[['attribute_name', 'attribute_value'] +
                               absolute_metrics].round(2)
    # For example:
    """
        attribute_name  attribute_value   tpr   tnr  ...  precision
    0           gender           female  0.60  0.88  ...       0.75
    1           gender             male  0.49  0.90  ...       0.64
    """

    # Bias metrics
    b = Bias()

    # Disparities calculated in relation to gender for "male" and "female"
    bias_df = b.get_disparity_predefined_groups(
        xtab,
        original_df=data_scored_processed,
        ref_groups_dict={'gender': 'male'},
        alpha=0.05,
        mask_significance=True)

    # Disparity metrics added to the bias DataFrame
    calculated_disparities = b.list_disparities(bias_df)
    disparity_metrics_df = bias_df[['attribute_name', 'attribute_value'] +
                                   calculated_disparities]
    # For example:
    """
        attribute_name  attribute_value  ppr_disparity  precision_disparity
    0           gender           female       0.714286              1.41791
    1           gender             male       1.000000              1.000000
    """

    output_metrics_df = disparity_metrics_df  # or absolute_metrics_df

    # Output a JSON-serializable list of calculated metrics
    yield output_metrics_df.to_dict(orient="records")
def bias(df):
    """
    Function to print bias metrics.

    :param df: Aequitas-compliant dataframe.
    """
    print("Bias Module")
    print("-" * 30)

    bias_ = Bias()
    g = Group()
    xtab, atts = g.get_crosstabs(df, attr_cols=["delegacion"])
    absolute_metrics = g.list_absolute_metrics(xtab)

    bdf = bias_.get_disparity_predefined_groups(
        xtab,
        original_df=df,
        ref_groups_dict={'delegacion': 'IZTAPALAPA'},
        alpha=0.05)

    print("Disparities:")
    print(bdf[['attribute_name', 'attribute_value'] +
              bias_.list_disparities(bdf)].round(2))

    print("Minority Analysis:")
    min_bdf = bias_.get_disparity_min_metric(xtab, original_df=df)
    print(min_bdf[['attribute_name', 'attribute_value'] +
                  bias_.list_disparities(min_bdf)].round(2))

    print("Majority Analysis:")
    majority_bdf = bias_.get_disparity_major_group(xtab, original_df=df)
    print(majority_bdf[['attribute_name', 'attribute_value'] +
                       bias_.list_disparities(majority_bdf)].round(2))
def tabla_metrica_sesgo(data, attr_ref):
    # Compute the group metrics
    g = Group()
    xtab, _ = g.get_crosstabs(data)

    # Compute the bias metrics
    b = Bias()

    # Set the reference attributes
    bdf = b.get_disparity_predefined_groups(
        xtab,
        original_df=data,
        ref_groups_dict=attr_ref,
        alpha=0.05,
        mask_significance=True)

    calculated_disparities = b.list_disparities(bdf)
    disparity_significance = b.list_significance(bdf)

    tabla_sesgo = bdf[['attribute_name', 'attribute_value'] +
                      calculated_disparities + disparity_significance]
    return tabla_sesgo
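
# A minimal usage sketch for tabla_metrica_sesgo. The toy DataFrame and the
# reference-group dict below are assumptions for illustration; any
# Aequitas-compliant frame with 'score', 'label_value' and attribute columns
# would be passed the same way.
import pandas as pd

df_example = pd.DataFrame({
    'score': [1, 0, 1, 0, 1, 1],
    'label_value': [1, 0, 0, 0, 1, 1],
    'gender': ['male', 'female', 'female', 'male', 'female', 'male'],
})
tabla = tabla_metrica_sesgo(df_example, attr_ref={'gender': 'male'})
print(tabla)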
def aequitas_bias(df, score_column, label_column, protected_class, reference_group):
    # To measure bias towards protected_class, filter the DataFrame down to
    # the score, the label (ground truth), and the protected class
    data_scored = df[[score_column, label_column, protected_class]]

    # Aequitas expects the columns to be named 'score' and 'label_value'
    data_scored = data_scored.rename(
        columns={score_column: "score", label_column: "label_value"})

    # Process DataFrame
    data_scored_processed, _ = preprocess_input_df(data_scored)

    # Bias metrics
    b = Bias()
    g = Group()
    xtab, _ = g.get_crosstabs(data_scored_processed)

    # Disparities calculated in relation to <protected_class> for its groups
    bias_df = b.get_disparity_predefined_groups(
        xtab,
        original_df=data_scored_processed,
        ref_groups_dict={protected_class: reference_group},
        alpha=0.05,
        mask_significance=True,
    )

    # Disparity metrics added to the bias DataFrame
    calculated_disparities = b.list_disparities(bias_df)
    disparity_metrics_df = bias_df[
        ["attribute_name", "attribute_value"] + calculated_disparities
    ]
    # For example:
    """
        attribute_name  attribute_value  ppr_disparity  precision_disparity
    0           gender           female       0.714286              1.41791
    1           gender             male       1.000000              1.000000
    """
    return disparity_metrics_df
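
# Hypothetical call: audit a scored DataFrame for disparity by 'gender', using
# 'male' as the reference group. The variable predictions_df and the column
# names are assumptions, not part of the original code.
disparities = aequitas_bias(
    df=predictions_df,
    score_column='score',
    label_column='loan_status',
    protected_class='gender',
    reference_group='male',
)
print(disparities)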
def bias(df_aeq, xtab):
    """
    args:
        df_aeq (DataFrame): the data frame with the features over which
            we want to measure disparity.
        xtab (DataFrame): group crosstab produced by Group().get_crosstabs().
    returns:
        DataFrame with the disparity metrics, rounded to two decimals.
    """
    bias = Bias()
    bdf = bias.get_disparity_predefined_groups(
        xtab,
        original_df=df_aeq,
        ref_groups_dict={'delegacion_inicio': 'IZTAPALAPA'},
        alpha=0.05,
        check_significance=True,
        mask_significance=True)

    return bdf[['attribute_name', 'attribute_value'] +
               bias.list_disparities(bdf)].round(2)
def get_bias_metrics(data):
    # To measure bias towards gender, filter the DataFrame down to
    # "score", "label_value" (ground truth), and "gender" (protected attribute)
    data_scored = data[["score", "label_value", "gender"]]

    # Process DataFrame
    data_scored_processed, _ = preprocess_input_df(data_scored)

    # Group metrics
    g = Group()
    xtab, _ = g.get_crosstabs(data_scored_processed)

    # Absolute metrics, such as 'tpr', 'tnr', 'precision', etc.
    absolute_metrics = g.list_absolute_metrics(xtab)

    # DataFrame of calculated absolute metrics for each sample population group
    absolute_metrics_df = xtab[["attribute_name", "attribute_value"] +
                               absolute_metrics].round(2)

    # Bias metrics
    b = Bias()

    # Disparities calculated in relation to gender for "male" and "female"
    bias_df = b.get_disparity_predefined_groups(
        xtab,
        original_df=data_scored_processed,
        ref_groups_dict={"gender": "male"},
        alpha=0.05,
        mask_significance=True,
    )

    # Disparity metrics added to the bias DataFrame
    calculated_disparities = b.list_disparities(bias_df)
    disparity_metrics_df = bias_df[["attribute_name", "attribute_value"] +
                                   calculated_disparities]

    output_metrics_df = disparity_metrics_df  # or absolute_metrics_df

    # Output a JSON-serializable list of calculated metrics
    return output_metrics_df.to_dict(orient="records")
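
# A minimal sketch of calling get_bias_metrics; the toy DataFrame below is an
# assumption standing in for real scored data with the required columns.
import pandas as pd

toy = pd.DataFrame({
    "score": [1, 0, 1, 1, 0, 0],
    "label_value": [1, 0, 0, 1, 1, 0],
    "gender": ["male", "female", "male", "female", "female", "male"],
})
for row in get_bias_metrics(toy):
    print(row["attribute_value"], row.get("ppr_disparity"))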
def get_fpr_disparity(df, protected_attribute):
    g = Group()
    xtab, _ = g.get_crosstabs(df)

    # Use the most frequent 'distance' category as its reference group
    df_count = df[['distance', 'score']].groupby(['distance']).agg(['count'])
    get_largest_cat = df_count[('score')].sort_values(
        ['count'], ascending=False).index[0]

    b = Bias()
    bdf = b.get_disparity_predefined_groups(
        xtab,
        original_df=df,
        ref_groups_dict={'originwac': df[:1].originwac[0],
                         'distance': get_largest_cat},
        alpha=0.05,
        mask_significance=True)
    calculated_disparities = b.list_disparities(bdf)
    disparity_significance = b.list_significance(bdf)

    majority_bdf = b.get_disparity_major_group(xtab, original_df=df,
                                               mask_significance=True)
    results = majority_bdf[['attribute_name', 'attribute_value'] +
                           calculated_disparities + disparity_significance]

    # Keep only the protected attribute and return its groups plus their
    # false positive rate disparities
    res_distance = results[results.attribute_name == protected_attribute]
    values = res_distance.attribute_value.tolist()
    disparity = res_distance.fpr_disparity.tolist()
    bias_result = values + disparity
    return bias_result
def fairness(df):
    """
    Generates the whole Fairness module.
    """
    print("Fairness Module")
    print("-" * 30)

    f = Fairness()
    bias_ = Bias()
    g = Group()

    xtab, atts = g.get_crosstabs(df, attr_cols=["delegacion"])
    absolute_metrics = g.list_absolute_metrics(xtab)

    bdf = bias_.get_disparity_predefined_groups(
        xtab,
        original_df=df,
        ref_groups_dict={'delegacion': 'IZTAPALAPA'},
        alpha=0.05)

    fdf = f.get_group_value_fairness(bdf)
    parity_determinations = f.list_parities(fdf)

    print("Printing metrics table (counts as frequencies):")
    print(fdf[['attribute_name', 'attribute_value'] + absolute_metrics +
              bias_.list_disparities(fdf) + parity_determinations].round(2))

    print("Printing overall metrics")
    gof = f.get_overall_fairness(fdf)
    print(gof)

    print("Aequitas analysis completed.")
    print("-" * 30)
def get_bias_metrics(data):
    bias = Bias()
    group = Group()

    old_columns = ['predictions', 'loan_status', 'forty_plus_indicator']
    new_columns = ['score', 'label_value', 'forty_plus_indicator']

    scored_data = data.loc[:, old_columns]
    renamer = dict(zip(scored_data.columns, new_columns))
    scored_data = scored_data.rename(columns=renamer)

    data_processed, _ = preprocess_input_df(scored_data)
    xtab, _ = group.get_crosstabs(data_processed)

    attribute_columns = ['attribute_name', 'attribute_value']
    absolute_metrics = group.list_absolute_metrics(xtab)
    absolute_metrics_df = xtab[attribute_columns + absolute_metrics].round(2)

    bias_df = bias.get_disparity_predefined_groups(
        xtab,
        original_df=data_processed,
        ref_groups_dict={'forty_plus_indicator': 'Under Forty'},
        alpha=0.05,
        mask_significance=True)
    calculated_disparities = bias.list_disparities(bias_df)
    disparity_metrics_df = bias_df[attribute_columns + calculated_disparities]

    abs_metrics = absolute_metrics_df.where(
        pd.notnull(absolute_metrics_df), None).to_dict(orient='records')
    disp_metrics = disparity_metrics_df.where(
        pd.notnull(disparity_metrics_df), None).to_dict(orient='records')

    return {
        "attributeAudited": "forty_plus_indicator",
        "referenceGroup": "Under Forty",
        "fairnessThreshold": "80%",
        "fairnessMeasures": [{
            "label": "Predicted Positive Group Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['pprev_disparity']
        }, {
            "label": "Predicted Positive Rate Parity",
            "result": "Failed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['ppr_disparity']
        }, {
            "label": "Proportional Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['precision_disparity']
        }, {
            "label": "False Positive Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['fpr_disparity']
        }, {
            "label": "False Discovery Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['fdr_disparity']
        }, {
            "label": "False Negative Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['fnr_disparity']
        }, {
            "label": "False Omission Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['for_disparity']
        }]
    }
p = aqp.plot_group_metric_all(xtab, metrics=['ppr', 'pprev', 'fnr', 'fpr'], ncols=4)

# Compute the bias metrics
b = Bias()

# Set the reference attributes
bdf = b.get_disparity_predefined_groups(xtab,
                                        original_df=dataset,
                                        ref_groups_dict={
                                            'race': 'White',
                                            'sex': 'Male'
                                        },
                                        alpha=0.05,
                                        mask_significance=True)
calculated_disparities = b.list_disparities(bdf)
disparity_significance = b.list_significance(bdf)

# Show the bias metrics table
print(bdf[['attribute_name', 'attribute_value'] +
          calculated_disparities + disparity_significance])

# Disparity plots
#aqp.plot_disparity(bdf, group_metric='fpr_disparity', attribute_name='race', significance_alpha=0.05)
#j = aqp.plot_disparity_all(bdf, metrics=['precision_disparity', 'fpr_disparity'], attributes=['age_cat'], significance_alpha=0.05)

# Define the fairness measures from the bias metrics table
f = Fairness()

# Set the threshold value with the tau variable
fdf = f.get_group_value_fairness(bdf, tau=0.8)
#parity_determinations = f.list_parities(fdf)

# Table showing whether the fairness measures are met for each attribute
# (see the sketch below)
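
# A minimal sketch of that last step, following the same selection pattern used
# elsewhere in this document; list_parities is the Aequitas Fairness helper that
# names the parity-determination columns added by get_group_value_fairness.
parity_determinations = f.list_parities(fdf)
print(fdf[['attribute_name', 'attribute_value'] +
          parity_determinations].round(2))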
def run_aequitas(predictions_data_path):
    '''
    Check the false negative rate, i.e. the chance of a given group missing
    out on assistance, using the aequitas toolkit.

    The function transforms the data to make it aequitas-compliant and checks
    a series of bias and fairness metrics.

    Input: model prediction path for the selected model
           (unzip the selected file to run)
    Output: plots saved in the charts folder
    '''
    best_model_pred = pd.read_csv(predictions_data_path)

    # Transform data for aequitas module compliance
    aqc = [
        'Other', 'White', 'African American', 'Asian', 'Hispanic',
        'American Indian'
    ]
    aqcol = [
        'White alone_scale', 'Black/AfAmer alone_scale',
        'AmInd/Alaskn alone_scale', 'Asian alone_scale', 'HI alone_scale',
        'Some other race alone_scale', 'Hispanic or Latino_scale'
    ]
    display(aqcol)
    aqcol_label = [
        'no_renew_nextpd', 'pred_class_10%',
        'Median household income (1999 dollars)_scale'
    ] + aqcol
    aqus = best_model_pred[aqcol_label]

    print('Creating classes for racial and income distribution', '\n')

    # Convert to binary
    bin_var = [
        'no_renew_nextpd',
        'pred_class_10%',
    ]
    for var in bin_var:
        aqus[var] = np.where(aqus[var] == True, 1, 0)

    # Rename
    aqus.rename(columns={
        'no_renew_nextpd': 'label_value',
        'pred_class_10%': 'score'
    }, inplace=True)

    print('Define majority rule based on the relative proportion of the class', '\n')
    aqus['race'] = aqus[aqcol].idxmax(axis=1)

    # Use quantile income distribution
    aqus['income'] = pd.qcut(
        aqus['Median household income (1999 dollars)_scale'],
        3,
        labels=["rich", "median", "poor"])

    # Final form
    aqus.drop(aqcol, axis=1, inplace=True)
    aqus.drop(['Median household income (1999 dollars)_scale'],
              axis=1, inplace=True)
    aq = aqus.reset_index()
    aq.rename(columns={'index': 'entity_id'}, inplace=True)
    aq['race'] = aq['race'].replace({
        'Some other race alone_scale': 'Other',
        'White alone_scale': 'White',
        'Black/AfAmer alone_scale': 'African American',
        'Asian alone_scale': 'Asian',
        'HI alone_scale': 'Hispanic',
        'AmInd/Alaskn alone_scale': 'American Indian'
    })

    # Consolidate types
    aq['income'] = aq['income'].astype(object)
    aq['entity_id'] = aq['entity_id'].astype(object)
    aq['score'] = aq['score'].astype(object)
    aq['label_value'] = aq['label_value'].astype(object)

    # Distribution of categories
    aq_palette = sns.diverging_palette(225, 35, n=2)
    by_race = sns.countplot(x="race", data=aq[aq.race.isin(aqc)])
    by_race.set_xticklabels(by_race.get_xticklabels(), rotation=40, ha="right")
    plt.savefig('charts/Racial distribution in data.png')

    # Primary distribution against score
    aq_palette = sns.diverging_palette(225, 35, n=2)
    by_race = sns.countplot(x="race", hue="score",
                            data=aq[aq.race.isin(aqc)], palette=aq_palette)
    by_race.set_xticklabels(by_race.get_xticklabels(), rotation=40, ha="right")
    # Race
    plt.savefig('charts/race_score.png')

    # Income
    by_inc = sns.countplot(x="income", hue="score", data=aq, palette=aq_palette)
    plt.savefig('charts/income_score.png')

    # Set Group
    g = Group()
    xtab, _ = g.get_crosstabs(aq)

    # False negative rates
    aqp = Plot()
    fnr = aqp.plot_group_metric(xtab, 'fnr', min_group_size=0.05)
    p = aqp.plot_group_metric_all(xtab, metrics=['ppr', 'pprev', 'fnr', 'fpr'],
                                  ncols=4)
    p.savefig('charts/eth_metrics.png')

    # Bias with respect to the white, rich category
    b = Bias()
    bdf = b.get_disparity_predefined_groups(xtab,
                                            original_df=aq,
                                            ref_groups_dict={
                                                'race': 'White',
                                                'income': 'rich'
                                            },
                                            alpha=0.05,
                                            mask_significance=True)
    bdf.style
    calculated_disparities = b.list_disparities(bdf)
    disparity_significance = b.list_significance(bdf)
    aqp.plot_disparity(bdf, group_metric='fpr_disparity', attribute_name='race',
                       significance_alpha=0.05)
    plt.savefig('charts/disparity.png')

    # Fairness
    hbdf = b.get_disparity_predefined_groups(xtab,
                                             original_df=aq,
                                             ref_groups_dict={
                                                 'race': 'African American',
                                                 'income': 'poor'
                                             },
                                             alpha=0.05,
                                             mask_significance=False)
    majority_bdf = b.get_disparity_major_group(xtab, original_df=aq,
                                               mask_significance=True)
    min_metric_bdf = b.get_disparity_min_metric(df=xtab, original_df=aq)

    f = Fairness()
    fdf = f.get_group_value_fairness(bdf)
    parity_determinations = f.list_parities(fdf)
    gaf = f.get_group_attribute_fairness(fdf)
    gof = f.get_overall_fairness(fdf)

    z = aqp.plot_fairness_group(fdf, group_metric='ppr')
    plt.savefig('charts/fairness_overall.png')

    # Checking the false omission rate and false negative rates
    fg = aqp.plot_fairness_group_all(fdf, metrics=['for', 'fnr'], ncols=2)
    fg.savefig('charts/fairness_metrics.png')

    return None
def fun_bias_fair(a_zip, a_type, fea_eng, model):
    X = fea_eng.drop([
        'aka_name', 'facility_type', 'address', 'inspection_date',
        'inspection_type', 'violations', 'results', 'pass'
    ], axis=1)
    y_pred = model.predict(X)

    xt = pd.DataFrame([
        fea_eng['zip'].astype(float), fea_eng['facility_type'],
        fea_eng['pass'], y_pred
    ]).transpose()

    a_zip['zip'] = a_zip['zip'].astype(float)
    compas = pd.merge(left=xt, right=a_zip, how='left',
                      left_on='zip', right_on='zip')
    compas = pd.merge(left=compas, right=a_type, how='left',
                      left_on='facility_type', right_on='facility_type')
    compas = compas.rename(columns={
        'Unnamed 0': 'score',
        'pass': 'label_value'
    })
    compas.pop('zip')
    compas.pop('facility_type')
    compas['zone'] = compas['zone'].astype(str)
    compas['score'] = compas['score'].astype(int)
    compas['label_value'] = compas['label_value'].astype(int)

    from aequitas.group import Group
    from aequitas.bias import Bias
    from aequitas.fairness import Fairness

    # Group
    g = Group()
    xtab, attrbs = g.get_crosstabs(compas)
    absolute_metrics = g.list_absolute_metrics(xtab)
    xtab[[col for col in xtab.columns if col not in absolute_metrics]]
    group_df = xtab[['attribute_name', 'attribute_value'] +
                    [col for col in xtab.columns if col in absolute_metrics]].round(4)
    abs_gpo = xtab[['attribute_name', 'attribute_value'] +
                   [col for col in xtab.columns if col in absolute_metrics]].round(4)

    # Bias
    bias = Bias()
    bdf = bias.get_disparity_predefined_groups(xtab,
                                               original_df=compas,
                                               ref_groups_dict={
                                                   'zone': 'West',
                                                   'facility_group': 'grocery'
                                               },
                                               alpha=0.05)
    # View disparity metrics added to the dataframe
    bias_bdf = bdf[['attribute_name', 'attribute_value'] +
                   bias.list_disparities(bdf)].round(2)
    majority_bdf = bias.get_disparity_major_group(xtab, original_df=compas)
    bias_maj_bdf = majority_bdf[['attribute_name', 'attribute_value'] +
                                bias.list_disparities(majority_bdf)].round(2)
    min_bdf = bias.get_disparity_min_metric(xtab, original_df=compas)
    bias_min_bdf = min_bdf[['attribute_name', 'attribute_value'] +
                           bias.list_disparities(min_bdf)].round(2)

    # Fairness
    fair = Fairness()
    fdf = fair.get_group_value_fairness(bdf)
    parity_determinations = fair.list_parities(fdf)
    fair_fdf = fdf[['attribute_name', 'attribute_value'] + absolute_metrics +
                   bias.list_disparities(fdf) + parity_determinations].round(2)
    gaf = fair.get_group_attribute_fairness(fdf)
    fairness_df = fdf.copy()
    gof = fair.get_overall_fairness(fdf)

    tab_bias_fair = fair_fdf[[
        'attribute_name', 'attribute_value', 'for', 'fnr', 'for_disparity',
        'fnr_disparity', 'FOR Parity', 'FNR Parity'
    ]]
    tab_bias_fair.rename(columns={
        'attribute_value': 'group_name',
        'FOR Parity': 'for_parity',
        'FNR Parity': 'fnr_parity',
        'for': 'for_'
    }, inplace=True)
    print(tab_bias_fair)
    return tab_bias_fair