def _fdf(self):
    if self.fdf is not None:
        return self.fdf
    f = Fairness()
    self.fdf = f.get_group_value_fairness(self._bdf())
    return self.fdf
def audit(df, configs, model_id=1, preprocessed=False):
    """
    :param df: input DataFrame with score, label_value and attribute columns
    :param configs: Configs object with the audit configuration
    :param model_id: identifier of the model being audited
    :param preprocessed: set to True if df has already been preprocessed
    :return: tuple (group_value_df, report)
    """
    if not preprocessed:
        df, attr_cols_input = preprocess_input_df(df)
        if not configs.attr_cols:
            configs.attr_cols = attr_cols_input
    g = Group()
    print('Welcome to Aequitas-Audit')
    print('Fairness measures requested:', ','.join(configs.fair_measures_requested))
    groups_model, attr_cols = g.get_crosstabs(df,
                                              score_thresholds=configs.score_thresholds,
                                              model_id=model_id,
                                              attr_cols=configs.attr_cols)
    print('audit: df shape from the crosstabs:', groups_model.shape)
    b = Bias()
    # todo move this to the new configs object / the attr_cols now are passed through the configs object...
    ref_groups_method = configs.ref_groups_method
    if ref_groups_method == 'predefined' and configs.ref_groups:
        bias_df = b.get_disparity_predefined_groups(groups_model, configs.ref_groups)
    elif ref_groups_method == 'majority':
        bias_df = b.get_disparity_major_group(groups_model)
    else:
        bias_df = b.get_disparity_min_metric(groups_model)
    print('Any NaN?: ', bias_df.isnull().values.any())
    print('bias_df shape:', bias_df.shape)
    f = Fairness(tau=configs.fairness_threshold)
    print('Fairness Threshold:', configs.fairness_threshold)
    print('Fairness Measures:', configs.fair_measures_requested)
    group_value_df = f.get_group_value_fairness(
        bias_df, fair_measures_requested=configs.fair_measures_requested)
    group_attribute_df = f.get_group_attribute_fairness(
        group_value_df, fair_measures_requested=configs.fair_measures_requested)
    fair_results = f.get_overall_fairness(group_attribute_df)
    print(fair_results)
    report = None
    if configs.report is True:
        report = audit_report_markdown(configs, group_value_df,
                                       f.fair_measures_depend, fair_results)
    return group_value_df, report
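# A minimal, hypothetical usage sketch for audit() above. The toy DataFrame, its column
# names, and the reference group are illustrative only; the Configs keyword arguments
# mirror the ones used in audit_file() further down in this file, and anything audit()
# reads beyond them (score thresholds, report flag) is assumed to fall back to the
# Configs defaults. The aequitas imports are assumed to be at the top of the module.
import pandas as pd

toy_df = pd.DataFrame({
    'score': [1, 0, 1, 1, 0, 1],
    'label_value': [1, 0, 0, 1, 0, 1],
    'race': ['White', 'Black', 'White', 'Black', 'White', 'Black'],
})
toy_configs = Configs(ref_groups={'race': 'White'},
                      ref_groups_method='predefined',
                      fairness_threshold=0.8,
                      fairness_measures=['Statistical Parity', 'Impact Parity'],
                      attr_cols=['race'])
group_value_df, report = audit(toy_df, configs=toy_configs, preprocessed=False)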
def get_model_fairness(self, level='model'):
    g = Group()
    xtab, _ = g.get_crosstabs(self.df)
    b = Bias()
    majority_bdf = b.get_disparity_major_group(xtab, original_df=self.df,
                                               mask_significance=True)
    f = Fairness()
    fdf = f.get_group_value_fairness(majority_bdf)
    f_res = fdf
    if level == 'model':
        f_res = f.get_overall_fairness(fdf)
    elif level == 'attribute':
        f_res = f.get_group_attribute_fairness(fdf)
    return f_res
def tabla_medidas_equidad(data, attr_ref, tau=0.8):
    # Compute the group metrics
    g = Group()
    xtab, _ = g.get_crosstabs(data)
    # Compute the bias metrics
    b = Bias()
    # Set the reference attributes
    bdf = b.get_disparity_predefined_groups(xtab,
                                            original_df=data,
                                            ref_groups_dict=attr_ref,
                                            alpha=0.05,
                                            mask_significance=True)
    # Derive the fairness determinations from the bias metrics table
    f = Fairness()
    # Set the threshold value with the tau argument
    fdf = f.get_group_value_fairness(bdf, tau=tau)
    # Table indicating whether each attribute satisfies the fairness measures
    tabla_equidad = f.get_group_attribute_fairness(fdf)
    return tabla_equidad
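# A small, hypothetical example of calling tabla_medidas_equidad() above. The DataFrame
# follows the aequitas input convention (score, label_value plus categorical attribute
# columns); the values and the reference group are made up for illustration.
import pandas as pd

ejemplo = pd.DataFrame({
    'score': [1, 0, 1, 0, 1, 1],
    'label_value': [1, 0, 0, 0, 1, 1],
    'sex': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
})
tabla = tabla_medidas_equidad(ejemplo, attr_ref={'sex': 'Male'}, tau=0.8)
print(tabla)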
def fairness(df):
    """ Generates the full Fairness module """
    print("Fairness module")
    print("-" * 30)
    f = Fairness()
    bias_ = Bias()
    g = Group()
    xtab, atts = g.get_crosstabs(df, attr_cols=["delegacion"])
    absolute_metrics = g.list_absolute_metrics(xtab)
    bdf = bias_.get_disparity_predefined_groups(xtab,
                                                original_df=df,
                                                ref_groups_dict={'delegacion': 'IZTAPALAPA'},
                                                alpha=0.05)
    fdf = f.get_group_value_fairness(bdf)
    parity_determinations = f.list_parities(fdf)
    print("Printing the metrics table (counts as frequencies):")
    print(fdf[['attribute_name', 'attribute_value'] + absolute_metrics +
              bias_.list_disparities(fdf) + parity_determinations].round(2))
    print("Printing overall metrics")
    gof = f.get_overall_fairness(fdf)
    print(gof)
    print("Aequitas analysis completed.")
    print("-" * 30)
def fairnessf(bdf, absolute_metrics, bias):
    """
    args:
        bdf (dataframe): bias dataframe with the features over which we want to measure fairness.
        absolute_metrics (list): absolute group metrics computed by Group.
        bias (Bias): the Bias object used to build bdf.
    returns:
        fairness table, group-attribute fairness, and overall fairness.
    """
    fair = Fairness()
    fdf = fair.get_group_value_fairness(bdf)
    parity_determinations = fair.list_parities(fdf)
    fairness = fdf[['attribute_name', 'attribute_value'] + absolute_metrics +
                   bias.list_disparities(fdf) + parity_determinations].round(2)
    # Storing metadata
    aq_metadata["v_group"] = str(fdf.loc[0, "attribute_value"])
    aq_metadata["FOR_p"] = str(fdf.loc[0, "FOR Parity"])
    aq_metadata["FNR_p"] = str(fdf.loc[0, "FNR Parity"])
    # return df_aeq
    gaf = fair.get_group_attribute_fairness(fdf)
    gof = fair.get_overall_fairness(fdf)
    return fairness, gaf, gof
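# A hypothetical end-to-end sketch chaining Group and Bias into fairnessf() above.
# The toy DataFrame, the reference group, and the module-level aq_metadata dict that
# fairnessf() writes into are assumptions for illustration only; the aequitas classes
# are assumed to be imported as elsewhere in this file.
import pandas as pd

aq_metadata = {}  # fairnessf() stores parity metadata here
toy = pd.DataFrame({
    'score': [1, 0, 1, 1, 0, 0],
    'label_value': [1, 0, 0, 1, 0, 1],
    'sex': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
})
g = Group()
xtab, _ = g.get_crosstabs(toy)
absolute_metrics = g.list_absolute_metrics(xtab)
b = Bias()
bdf = b.get_disparity_predefined_groups(xtab, original_df=toy,
                                        ref_groups_dict={'sex': 'Male'}, alpha=0.05)
fairness_tbl, gaf, gof = fairnessf(bdf, absolute_metrics, b)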
def audit_file(name, dirname):
    upload_path = os.path.join(tempfile.gettempdir(), dirname)
    data_path = os.path.join(upload_path, name + '.csv')
    if not os.path.exists(data_path):
        abort(404)
    try:
        df = pd.read_csv(data_path)
    except pd.errors.ParserError:
        flash('Bad CSV file - could not parse', 'warning')
        return redirect(url_for('home'))
    (df, groups) = preprocess_input_df(df)
    if "submit" not in request.form:
        subgroups = {col: list(set(df[col])) for col in groups}
        # set defaults
        for (key, values) in (
                ('race', ('White', 'Caucasian')),
                ('sex', ('Male',)),
                ('gender', ('Male',)),
                ('age_cat', ('25 - 45',)),
                ('education', ('HS-grad',)),
        ):
            if key in subgroups:
                subgroups[key].sort(key=lambda value: int(value not in values))
        supported_fairness_measures = Fairness().get_fairness_measures_supported(df)
        fairness_measures = [x for x in FAIR_MAP_ORDER
                             if FAIR_MAP[x].issubset(set(supported_fairness_measures))]
        return render_template('audit.html',
                               categories=groups,
                               subcategories=subgroups,
                               fairness=fairness_measures)
    rgm = request.form["ref_groups_method"]
    if rgm == 'predefined':
        group_variables = request.form.getlist('group_variable1')
    else:
        group_variables = request.form.getlist('group_variable2')
    # check if user forgot to select anything; return all
    if len(group_variables) == 0:
        group_variables = groups
    # remove unwanted cols from df
    subgroups = {g: request.form[g] for g in group_variables}
    # majority_groups = request.form.getlist('use_majority_group')
    raw_fairness_measures = request.form.getlist('fairness_measures')
    if len(raw_fairness_measures) == 0:
        fairness_measures = list(Fairness().get_fairness_measures_supported(df))
    else:
        # map selected measures to input
        fairness_measures = [y for x in raw_fairness_measures for y in FAIR_MAP[x]]
    try:
        fv = float(request.form['fairness_pct'])
    except (KeyError, ValueError):
        fv = None
    fp = fv / 100.0 if fv else 0.8
    configs = Configs(ref_groups=subgroups,
                      ref_groups_method=rgm,
                      fairness_threshold=fp,
                      fairness_measures=fairness_measures,
                      attr_cols=group_variables)
    (_gv_df, report) = audit(df,
                             # model_id=1,
                             configs=configs,
                             preprocessed=True)
    for reportid in itertools.count(1):
        report_path = os.path.join(upload_path, str(reportid))
        if not os.path.exists(report_path):
            break
    with open(report_path, 'w') as fd:
        fd.write(report)
    return redirect(url_for("report", dirname=dirname, name=name, reportid=reportid))
def _write_audit_to_db(self,
                       model_id,
                       protected_df,
                       predictions_proba,
                       labels,
                       tie_breaker,
                       subset_hash,
                       matrix_type,
                       evaluation_start_time,
                       evaluation_end_time,
                       matrix_uuid):
    """ Runs the bias audit and saves the result in the bias table.

    Args:
        model_id (int) primary key of the model
        protected_df (pandas.DataFrame) A dataframe with protected group attributes
        predictions_proba (np.array) List of prediction probabilities
        labels (pandas.Series): List of labels
        tie_breaker: 'best' or 'worst' case tiebreaking rule that the predictions and labels were sorted by
        subset_hash (str) the hash of the subset, if any, that the evaluation is made on
        matrix_type (triage.component.catwalk.storage.MatrixType) The type of matrix used
        evaluation_start_time (pandas._libs.tslibs.timestamps.Timestamp) first as_of_date included in the evaluation period
        evaluation_end_time (pandas._libs.tslibs.timestamps.Timestamp) last as_of_date included in the evaluation period
        matrix_uuid: the uuid of the matrix

    Returns:
    """
    if protected_df.empty:
        return
    # to preprocess, aequitas requires the following columns:
    # score, label_value, model_id, protected attributes
    # fill out the protected_df, which just has protected attributes at this point
    protected_df = protected_df.copy()
    protected_df['model_id'] = model_id
    protected_df['score'] = predictions_proba
    protected_df['label_value'] = labels
    aequitas_df, attr_cols_input = preprocess_input_df(protected_df)

    # create group crosstabs
    g = Group()
    score_thresholds = {}
    score_thresholds['rank_abs'] = self.bias_config['thresholds'].get('top_n', [])
    # convert 0-100 percentile to 0-1 that Aequitas expects
    score_thresholds['rank_pct'] = [
        value / 100.0
        for value in self.bias_config['thresholds'].get('percentiles', [])
    ]
    groups_model, attr_cols = g.get_crosstabs(aequitas_df,
                                              score_thresholds=score_thresholds,
                                              attr_cols=attr_cols_input)

    # analyze bias from reference groups
    bias = Bias()
    ref_groups_method = self.bias_config.get('ref_groups_method', None)
    if ref_groups_method == 'predefined' and self.bias_config['ref_groups']:
        bias_df = bias.get_disparity_predefined_groups(
            groups_model, aequitas_df, self.bias_config['ref_groups'])
    elif ref_groups_method == 'majority':
        bias_df = bias.get_disparity_major_group(groups_model, aequitas_df)
    else:
        bias_df = bias.get_disparity_min_metric(groups_model, aequitas_df)

    # analyze fairness for each group
    f = Fairness(tau=0.8)  # the default fairness threshold is 0.8
    group_value_df = f.get_group_value_fairness(bias_df)
    group_value_df['subset_hash'] = subset_hash
    group_value_df['tie_breaker'] = tie_breaker
    group_value_df['evaluation_start_time'] = evaluation_start_time
    group_value_df['evaluation_end_time'] = evaluation_end_time
    group_value_df['matrix_uuid'] = matrix_uuid
    group_value_df = group_value_df.rename(index=str,
                                           columns={"score_threshold": "parameter"})
    if group_value_df.empty:
        raise ValueError(f"""
            Bias audit: aequitas_audit() failed.
            Returned empty dataframe for model_id = {model_id},
            and subset_hash = {subset_hash} and matrix_type = {matrix_type}""")
    with scoped_session(self.db_engine) as session:
        for index, row in group_value_df.iterrows():
            session.query(matrix_type.aequitas_obj).filter_by(
                model_id=row['model_id'],
                evaluation_start_time=row['evaluation_start_time'],
                evaluation_end_time=row['evaluation_end_time'],
                subset_hash=row['subset_hash'],
                parameter=row['parameter'],
                tie_breaker=row['tie_breaker'],
                matrix_uuid=row['matrix_uuid'],
                attribute_name=row['attribute_name'],
                attribute_value=row['attribute_value']).delete()
        session.bulk_insert_mappings(
            matrix_type.aequitas_obj,
            group_value_df.to_dict(orient="records"))
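# A hypothetical bias_config illustrating the keys that _write_audit_to_db() above reads
# (thresholds.top_n, thresholds.percentiles, ref_groups_method, ref_groups). The concrete
# values and attribute names are made up; the real values come from the experiment config.
example_bias_config = {
    'thresholds': {
        'top_n': [100],       # absolute-rank score thresholds (rank_abs)
        'percentiles': [10],  # 0-100 percentiles, converted to 0-1 (rank_pct)
    },
    'ref_groups_method': 'predefined',
    'ref_groups': {'race': 'White', 'sex': 'Male'},
}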
def execute(self):
    model = self.download_model()
    tabla_3 = pd.read_sql_table('centers', self.engine, schema="transformed")
    tabla_4 = pd.read_sql_table('inspections', self.engine, schema="transformed")
    centros = tabla_3.copy()
    centros.rename(columns={"dc_id": "center_id"}, inplace=True)
    inspecciones = tabla_4.copy()
    last_inspections = inspecciones.sort_values(by="inspectiondate") \
                                   .drop_duplicates(subset=["center_id"], keep="last")
    centros = centros.drop(['centername', 'legalname', 'building', 'street', 'zipcode',
                            'phone', 'permitnumber', 'permitexp', 'status', 'agerange',
                            'childcaretype', 'bin', 'url', 'datepermitted', 'actual',
                            'violationratepercent', 'violationavgratepercent',
                            'publichealthhazardviolationrate',
                            'averagepublichealthhazardiolationrate',
                            'criticalviolationrate', 'avgcriticalviolationrate'], axis=1)
    centros = centros.reset_index(drop=True)
    tabla_5 = pd.merge(last_inspections, centros)
    tabla_5.sort_values(['inspectiondate'], ascending=[False], inplace=True)
    tabla_5['maximumcapacity'] = tabla_5['maximumcapacity'].astype(int)
    tabla_5['totaleducationalworkers'] = tabla_5['totaleducationalworkers'].astype(int)
    tabla_5['averagetotaleducationalworkers'] = tabla_5['averagetotaleducationalworkers'].astype(float)
    tabla_5 = tabla_5.drop(['regulationsummary', 'healthcodesubsection', 'violationstatus',
                            'borough', 'reason', 'inspectiondate', 'violationcategory_nan'],
                           axis=1)
    tabla_5 = tabla_5.set_index(['center_id'])
    tabla_5 = tabla_5.fillna(0)
    for col in tabla_5.select_dtypes(object):
        tabla_5[col] = tabla_5[col].astype(float)
    tabla_5 = tabla_5.fillna(0)
    prds = model.predict(tabla_5.drop(['violationcategory_public_health_hazard'], axis=1))
    probas = model.predict_proba(tabla_5.drop(['violationcategory_public_health_hazard'], axis=1))
    res = pd.DataFrame({
        "center": tabla_5.index,
        "etiqueta": prds,
        "proba_0": probas[:, 0],
        "proba_1": probas[:, 1]
    })
    res.loc[res['proba_0'] > res['proba_1'], 'score'] = res['proba_0']
    res.loc[res['proba_0'] < res['proba_1'], 'score'] = res['proba_1']
    categorias_1 = ["programtype_all_age_camp", "programtype_infant_toddler",
                    "programtype_preschool", "programtype_preschool_camp",
                    "programtype_school_age_camp"]
    programtype = pd.get_dummies(centros[categorias_1]).idxmax(1)
    categorias_2 = ["borough_bronx", "borough_brooklyn", "borough_manhattan",
                    "borough_queens", "borough_staten_island"]
    borough = pd.get_dummies(centros[categorias_2]).idxmax(1)
    ambas = pd.concat([borough, programtype], axis=1)
    ambas = ambas.rename(columns={0: 'borough', 1: 'programtype'})
    tabla_1 = pd.concat([centros, ambas], axis=1)
    tabla_2 = pd.merge(res, tabla_1, left_on='center', right_on='center_id')
    for i in list(tabla_2.index):
        if str(tabla_2.iloc[i].borough_bronx) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "bronx"
        elif str(tabla_2.iloc[i].borough_brooklyn) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "brooklyn"
        elif str(tabla_2.iloc[i].borough_manhattan) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "manhattan"
        elif str(tabla_2.iloc[i].borough_queens) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "queens"
        elif str(tabla_2.iloc[i].borough_staten_island) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "staten_island"
    tabla_2.drop(categorias_2, axis=1, inplace=True)
    for i in list(tabla_2.index):
        if str(tabla_2.iloc[i].programtype_all_age_camp) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "all_age_camp"
        elif str(tabla_2.iloc[i].programtype_infant_toddler) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "infant_toddler"
        elif str(tabla_2.iloc[i].programtype_preschool) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "preschool"
        elif str(tabla_2.iloc[i].programtype_preschool_camp) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "preschool_camp"
        elif str(tabla_2.iloc[i].programtype_school_age_camp) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "school_age_camp"
    tabla_2.drop(categorias_1, axis=1, inplace=True)
    tabla_6 = tabla_2.loc[:, ['center', 'etiqueta', 'score', 'borough', 'programtype']]
    tabla_6 = tabla_6.rename(columns={'etiqueta': 'label_value'})
    tabla_6.set_index('center', inplace=True)
    g = Group()
    xtab, _ = g.get_crosstabs(tabla_6)
    b = Bias()
    bdf = b.get_disparity_predefined_groups(xtab,
                                            original_df=tabla_6,
                                            ref_groups_dict={'borough': 'brooklyn',
                                                             'programtype': 'preschool'},
                                            alpha=0.05,
                                            mask_significance=True)
    f = Fairness()
    fdf = f.get_group_value_fairness(bdf)
    fdf['model_id'] = self.model_id
    fdf['date'] = self.date_param
    self.output_table = fdf
    return ([tuple(x) for x in fdf.to_numpy()],
            [(c.replace("for", "forr").replace(" ", "_"), 'VARCHAR') for c in list(fdf.columns)])
                                        'sex': 'Male'},
                                        alpha=0.05,
                                        mask_significance=True)
calculated_disparities = b.list_disparities(bdf)
disparity_significance = b.list_significance(bdf)
# Show the bias metrics table
print(bdf[['attribute_name', 'attribute_value'] + calculated_disparities + disparity_significance])
# Disparity plots
# aqp.plot_disparity(bdf, group_metric='fpr_disparity', attribute_name='race', significance_alpha=0.05)
# j = aqp.plot_disparity_all(bdf, metrics=['precision_disparity', 'fpr_disparity'], attributes=['age_cat'], significance_alpha=0.05)
# Derive the fairness determinations from the bias metrics table
f = Fairness()
# Set the threshold value with tau
fdf = f.get_group_value_fairness(bdf, tau=0.8)
# parity_determinations = f.list_parities(fdf)
# Table indicating whether each attribute satisfies the fairness measures
gaf = f.get_group_attribute_fairness(fdf)
# print(gaf['Equalized Odds'])
# Group and bias metrics once the fairness thresholds have been applied
fg = aqp.plot_fairness_group_all(fdf, ncols=2,
                                 metrics=['ppr', 'pprev', 'fdr', 'fpr', 'for', 'fnr'])
fg.savefig('./figures/LAW_DATA/disparity_group_law.png')
m = aqp.plot_fairness_disparity_all(fdf, metrics=['for', 'fnr'], attributes=['race'])
m.savefig('./figures/LAW_DATA/disparity_law_race.png')
def run_aequitas(predictions_data_path):
    '''
    Check the false negative rate, i.e. the chance of a certain group missing out on
    assistance, using the aequitas toolkit.
    The function transforms the data to make it aequitas compliant and checks a series
    of bias and fairness metrics.
    Input: model prediction path for the selected model (unzip the selected file to run)
    Output: plots saved in the charts folder
    '''
    best_model_pred = pd.read_csv(predictions_data_path)

    # Transform data for aequitas module compliance
    aqc = ['Other', 'White', 'African American', 'Asian', 'Hispanic', 'American Indian']
    aqcol = ['White alone_scale', 'Black/AfAmer alone_scale', 'AmInd/Alaskn alone_scale',
             'Asian alone_scale', 'HI alone_scale', 'Some other race alone_scale',
             'Hispanic or Latino_scale']
    display(aqcol)
    aqcol_label = ['no_renew_nextpd', 'pred_class_10%',
                   'Median household income (1999 dollars)_scale'] + aqcol
    aqus = best_model_pred[aqcol_label]
    print('Creating classes for racial and income distribution', '\n')

    # Convert to binary
    bin_var = ['no_renew_nextpd', 'pred_class_10%']
    for var in bin_var:
        aqus[var] = np.where(aqus[var] == True, 1, 0)

    # Rename
    aqus.rename(columns={'no_renew_nextpd': 'label_value', 'pred_class_10%': 'score'},
                inplace=True)
    print('Define majority rule defined on relative proportion of the class', '\n')
    aqus['race'] = aqus[aqcol].idxmax(axis=1)

    # Use quantile income distribution
    aqus['income'] = pd.qcut(aqus['Median household income (1999 dollars)_scale'],
                             3, labels=["rich", "median", "poor"])

    # Final form
    aqus.drop(aqcol, axis=1, inplace=True)
    aqus.drop(['Median household income (1999 dollars)_scale'], axis=1, inplace=True)
    aq = aqus.reset_index()
    aq.rename(columns={'index': 'entity_id'}, inplace=True)
    aq['race'] = aq['race'].replace({
        'Some other race alone_scale': 'Other',
        'White alone_scale': 'White',
        'Black/AfAmer alone_scale': 'African American',
        'Asian alone_scale': 'Asian',
        'HI alone_scale': 'Hispanic',
        'AmInd/Alaskn alone_scale': 'American Indian'
    })

    # Consolidate types
    aq['income'] = aq['income'].astype(object)
    aq['entity_id'] = aq['entity_id'].astype(object)
    aq['score'] = aq['score'].astype(object)
    aq['label_value'] = aq['label_value'].astype(object)

    # Distribution of categories
    aq_palette = sns.diverging_palette(225, 35, n=2)
    by_race = sns.countplot(x="race", data=aq[aq.race.isin(aqc)])
    by_race.set_xticklabels(by_race.get_xticklabels(), rotation=40, ha="right")
    plt.savefig('charts/Racial distribution in data.png')

    # Primary distribution against score
    aq_palette = sns.diverging_palette(225, 35, n=2)
    by_race = sns.countplot(x="race", hue="score", data=aq[aq.race.isin(aqc)],
                            palette=aq_palette)
    by_race.set_xticklabels(by_race.get_xticklabels(), rotation=40, ha="right")
    # Race
    plt.savefig('charts/race_score.png')
    # Income
    by_inc = sns.countplot(x="income", hue="score", data=aq, palette=aq_palette)
    plt.savefig('charts/income_score.png')

    # Set Group
    g = Group()
    xtab, _ = g.get_crosstabs(aq)

    # False negative rates
    aqp = Plot()
    fnr = aqp.plot_group_metric(xtab, 'fnr', min_group_size=0.05)
    p = aqp.plot_group_metric_all(xtab, metrics=['ppr', 'pprev', 'fnr', 'fpr'], ncols=4)
    p.savefig('charts/eth_metrics.png')

    # Bias with respect to the white, rich category
    b = Bias()
    bdf = b.get_disparity_predefined_groups(xtab,
                                            original_df=aq,
                                            ref_groups_dict={'race': 'White', 'income': 'rich'},
                                            alpha=0.05,
                                            mask_significance=True)
    bdf.style
    calculated_disparities = b.list_disparities(bdf)
    disparity_significance = b.list_significance(bdf)
    aqp.plot_disparity(bdf, group_metric='fpr_disparity', attribute_name='race',
                       significance_alpha=0.05)
    plt.savefig('charts/disparity.png')

    # Fairness
    hbdf = b.get_disparity_predefined_groups(xtab,
                                             original_df=aq,
                                             ref_groups_dict={'race': 'African American',
                                                              'income': 'poor'},
                                             alpha=0.05,
                                             mask_significance=False)
    majority_bdf = b.get_disparity_major_group(xtab, original_df=aq, mask_significance=True)
    min_metric_bdf = b.get_disparity_min_metric(df=xtab, original_df=aq)
    f = Fairness()
    fdf = f.get_group_value_fairness(bdf)
    parity_determinations = f.list_parities(fdf)
    gaf = f.get_group_attribute_fairness(fdf)
    gof = f.get_overall_fairness(fdf)
    z = aqp.plot_fairness_group(fdf, group_metric='ppr')
    plt.savefig('charts/fairness_overall.png')

    # Checking the false omission and false negative rates
    fg = aqp.plot_fairness_group_all(fdf, metrics=['for', 'fnr'], ncols=2)
    fg.savefig('charts/fairness_metrics.png')
    return None
def fun_bias_fair(a_zip, a_type, fea_eng, model):
    X = fea_eng.drop(['aka_name', 'facility_type', 'address', 'inspection_date',
                      'inspection_type', 'violations', 'results', 'pass'], axis=1)
    y_pred = model.predict(X)
    xt = pd.DataFrame([fea_eng['zip'].astype(float),
                       fea_eng['facility_type'],
                       fea_eng['pass'],
                       y_pred]).transpose()
    a_zip['zip'] = a_zip['zip'].astype(float)
    compas = pd.merge(left=xt, right=a_zip, how='left', left_on='zip', right_on='zip')
    compas = pd.merge(left=compas, right=a_type, how='left',
                      left_on='facility_type', right_on='facility_type')
    compas = compas.rename(columns={'Unnamed 0': 'score', 'pass': 'label_value'})
    compas.pop('zip')
    compas.pop('facility_type')
    compas['zone'] = compas['zone'].astype(str)
    compas['score'] = compas['score'].astype(int)
    compas['label_value'] = compas['label_value'].astype(int)

    from aequitas.group import Group
    from aequitas.bias import Bias
    from aequitas.fairness import Fairness

    # Group
    g = Group()
    xtab, attrbs = g.get_crosstabs(compas)
    absolute_metrics = g.list_absolute_metrics(xtab)
    xtab[[col for col in xtab.columns if col not in absolute_metrics]]
    group_df = xtab[['attribute_name', 'attribute_value'] +
                    [col for col in xtab.columns if col in absolute_metrics]].round(4)
    abs_gpo = xtab[['attribute_name', 'attribute_value'] +
                   [col for col in xtab.columns if col in absolute_metrics]].round(4)

    # Bias
    bias = Bias()
    bdf = bias.get_disparity_predefined_groups(xtab,
                                               original_df=compas,
                                               ref_groups_dict={'zone': 'West',
                                                                'facility_group': 'grocery'},
                                               alpha=0.05)
    # View disparity metrics added to dataframe
    bias_bdf = bdf[['attribute_name', 'attribute_value'] + bias.list_disparities(bdf)].round(2)
    majority_bdf = bias.get_disparity_major_group(xtab, original_df=compas)
    bias_maj_bdf = majority_bdf[['attribute_name', 'attribute_value'] +
                                bias.list_disparities(majority_bdf)].round(2)
    min_bdf = bias.get_disparity_min_metric(xtab, original_df=compas)
    bias_min_bdf = min_bdf[['attribute_name', 'attribute_value'] +
                           bias.list_disparities(min_bdf)].round(2)

    # Fairness
    fair = Fairness()
    fdf = fair.get_group_value_fairness(bdf)
    parity_determinations = fair.list_parities(fdf)
    fair_fdf = fdf[['attribute_name', 'attribute_value'] + absolute_metrics +
                   bias.list_disparities(fdf) + parity_determinations].round(2)
    gaf = fair.get_group_attribute_fairness(fdf)
    fairness_df = fdf.copy()
    gof = fair.get_overall_fairness(fdf)
    tab_bias_fair = fair_fdf[['attribute_name', 'attribute_value', 'for', 'fnr',
                              'for_disparity', 'fnr_disparity', 'FOR Parity', 'FNR Parity']]
    tab_bias_fair.rename(columns={'attribute_value': 'group_name',
                                  'FOR Parity': 'for_parity',
                                  'FNR Parity': 'fnr_parity',
                                  'for': 'for_'},
                         inplace=True)
    print(tab_bias_fair)
    return tab_bias_fair
def aq_analysis(arguments):
    # Import result set from best model.
    result_set = pd.read_csv(arguments.result_set) \
        .drop_duplicates(subset="block_group") \
        .rename(columns={"successful": "label_value"})

    # Drop columns not needed for analysis.
    features_to_drop = [column for column in result_set.columns
                        if column in DROP_COLUMN_KEYWORDS and "count" not in column]
    result_set = result_set.drop(columns=features_to_drop)

    # Initialize base comparison attributes dictionary.
    base_comparison = {"pct_white": None, "pct_high_income": None}
    base_comparison_label = "_".join(base_comparison.keys())

    # Preprocess outside of Aequitas because preprocess_input_df() doesn't work.
    for column in result_set.columns:
        if column == "score":
            result_set[column] = result_set[column].astype(float)
        elif column == "label_value":
            result_set[column] = result_set[column].astype(int)
        else:
            if result_set[column].nunique() > 1:
                result_set[column], bins = pd.qcut(x=result_set[column],
                                                   q=4,
                                                   precision=2,
                                                   duplicates="drop",
                                                   retbins=True)
                # Save label of highest quartile for base comparison attributes.
                if column in base_comparison:
                    lb = str(round(bins[3], 2))
                    ub = str(round(bins[4], 2))
                    base_comparison[column] = "(" + lb + ", " + ub + "]"
            result_set[column] = result_set[column].astype(str)

    # Initialize Aequitas objects and export directory.
    aqg, aqb, aqf, aqp = Group(), Bias(), Fairness(), Plot()
    directory = "aequitas"
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Calculate crosstabs by distinct group.
    crosstabs, _ = aqg.get_crosstabs(df=result_set,
                                     score_thresholds={"score": [float(arguments.threshold)]})
    absolute_metrics = aqg.list_absolute_metrics(crosstabs)
    crosstabs[["attribute_name", "attribute_value"] + absolute_metrics] \
        .round(2) \
        .to_csv(directory + "/aequitas_crosstabs.csv", index=False)

    # Plot bias and fairness with respect to white, high income communities.
    disparity_white_hiinc = aqb.get_disparity_predefined_groups(
        crosstabs.loc[crosstabs["attribute_name"].isin(base_comparison.keys())],
        result_set,
        base_comparison)
    a = aqp.plot_disparity_all(disparity_white_hiinc, metrics=METRICS, show_figure=False)
    a_filename = "bias_ref_" + base_comparison_label + ".png"
    a.savefig(directory + "/" + a_filename)
    b = aqp.plot_fairness_disparity_all(aqf.get_group_value_fairness(disparity_white_hiinc),
                                        metrics=METRICS,
                                        show_figure=False)
    b_filename = "fairness_ref_" + base_comparison_label + ".png"
    b.savefig(directory + "/" + b_filename)
def audit(df, configs, preprocessed=False):
    """
    :param df: input DataFrame with score, label_value and attribute columns
    :param configs: Configs object with the audit configuration
    :param preprocessed: set to True if df has already been preprocessed
    :return: tuple (group_value_df, report)
    """
    if not preprocessed:
        df, attr_cols_input = preprocess_input_df(df)
        if not configs.attr_cols:
            configs.attr_cols = attr_cols_input
    g = Group()
    print('Welcome to Aequitas-Audit')
    print('Fairness measures requested:', ','.join(configs.fair_measures_requested))
    groups_model, attr_cols = g.get_crosstabs(df,
                                              score_thresholds=configs.score_thresholds,
                                              attr_cols=configs.attr_cols)
    print('audit: df shape from the crosstabs:', groups_model.shape)
    b = Bias()
    # todo move this to the new configs object / the attr_cols now are passed through the configs object...
    ref_groups_method = configs.ref_groups_method
    if ref_groups_method == 'predefined' and configs.ref_groups:
        bias_df = b.get_disparity_predefined_groups(
            groups_model, df, configs.ref_groups,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)
    elif ref_groups_method == 'majority':
        bias_df = b.get_disparity_major_group(
            groups_model, df,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)
    else:
        bias_df = b.get_disparity_min_metric(
            df=groups_model, original_df=df,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            label_score_ref='fpr',
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)
    print('Any NaN?: ', bias_df.isnull().values.any())
    print('bias_df shape:', bias_df.shape)

    aqp = Plot()
    if configs.plot_bias_metrics:
        if len(configs.plot_bias_metrics) == 1:
            fig1 = aqp.plot_group_metric(bias_df, group_metric=configs.plot_bias_metrics[0])
        elif len(configs.plot_bias_metrics) > 1:
            fig1 = aqp.plot_group_metric_all(bias_df, metrics=configs.plot_bias_metrics)
        if (len(configs.plot_bias_disparities) == 1) and (len(configs.plot_disparity_attributes) == 1):
            fig2 = aqp.plot_disparity(bias_df,
                                      group_metric=configs.plot_bias_disparities[0],
                                      attribute_name=configs.plot_disparity_attributes[0])
        elif (len(configs.plot_bias_disparities) > 1) or (len(configs.plot_disparity_attributes) > 1):
            fig2 = aqp.plot_disparity_all(bias_df,
                                          metrics=configs.plot_bias_disparities,
                                          attributes=configs.plot_disparity_attributes)

    f = Fairness(tau=configs.fairness_threshold)
    print('Fairness Threshold:', configs.fairness_threshold)
    print('Fairness Measures:', configs.fair_measures_requested)
    group_value_df = f.get_group_value_fairness(
        bias_df, fair_measures_requested=configs.fair_measures_requested)
    group_attribute_df = f.get_group_attribute_fairness(
        group_value_df, fair_measures_requested=configs.fair_measures_requested)
    fair_results = f.get_overall_fairness(group_attribute_df)

    if configs.plot_bias_metrics:
        if len(configs.plot_bias_metrics) == 1:
            fig3 = aqp.plot_fairness_group(group_value_df,
                                           group_metric=configs.plot_bias_metrics[0])
        elif len(configs.plot_bias_metrics) > 1:
            fig3 = aqp.plot_fairness_group_all(group_value_df,
                                               metrics=configs.plot_bias_metrics)
        if (len(configs.plot_bias_disparities) == 1) and (len(configs.plot_disparity_attributes) == 1):
            fig4 = aqp.plot_fairness_disparity(group_value_df,
                                               group_metric=configs.plot_bias_disparities[0],
                                               attribute_name=configs.plot_disparity_attributes[0])
        elif (len(configs.plot_bias_disparities) > 1) or (len(configs.plot_disparity_attributes) > 1):
            fig4 = aqp.plot_fairness_disparity_all(group_value_df,
                                                   metrics=configs.plot_bias_disparities,
                                                   attributes=configs.plot_disparity_attributes)

    print(fair_results)
    report = None
    if configs.report is True:
        report = audit_report_markdown(configs, group_value_df,
                                       f.fair_measures_depend, fair_results)
    return group_value_df, report