def execeute(self):
    """Score the latest inspection per center and run an Aequitas bias audit.

    Reads the transformed ``centers`` and ``inspections`` tables, builds the
    model feature matrix from the most recent inspection of each center,
    predicts public-health-hazard violations, and computes group/bias/
    fairness metrics with borough='brooklyn' and programtype='preschool'
    as reference groups.

    Returns:
        tuple: ``(rows, schema)`` where ``rows`` is a list of tuples from
        the fairness dataframe and ``schema`` is a list of
        ``(column_name, 'VARCHAR')`` pairs, with spaces replaced by
        underscores and the reserved word "for" rewritten as "forr".

    Side effects:
        Stores the fairness dataframe on ``self.output_table``.
    """
    model = self.download_model()

    centros = pd.read_sql_table('centers', self.engine,
                                schema="transformed").copy()
    centros.rename(columns={"dc_id": "center_id"}, inplace=True)
    inspecciones = pd.read_sql_table('inspections', self.engine,
                                     schema="transformed").copy()

    # Keep only the most recent inspection per center.
    last_inspections = inspecciones.sort_values(
        by="inspectiondate").drop_duplicates(subset=["center_id"],
                                             keep="last")

    # Drop descriptive / leakage columns that are not model features.
    centros = centros.drop([
        'centername', 'legalname', 'building', 'street', 'zipcode', 'phone',
        'permitnumber', 'permitexp', 'status', 'agerange', 'childcaretype',
        'bin', 'url', 'datepermitted', 'actual', 'violationratepercent',
        'violationavgratepercent', 'publichealthhazardviolationrate',
        'averagepublichealthhazardiolationrate', 'criticalviolationrate',
        'avgcriticalviolationrate'
    ], axis=1)
    centros = centros.reset_index(drop=True)

    tabla_5 = pd.merge(last_inspections, centros)
    tabla_5.sort_values(['inspectiondate'], ascending=[False], inplace=True)
    tabla_5['maximumcapacity'] = tabla_5['maximumcapacity'].astype(int)
    # BUG FIX: the original cast 'totaleducationalworkers' to int twice;
    # one cast is sufficient.
    tabla_5['totaleducationalworkers'] = tabla_5[
        'totaleducationalworkers'].astype(int)
    tabla_5['averagetotaleducationalworkers'] = tabla_5[
        'averagetotaleducationalworkers'].astype(float)
    tabla_5 = tabla_5.drop([
        'regulationsummary', 'healthcodesubsection', 'violationstatus',
        'borough', 'reason', 'inspectiondate', 'violationcategory_nan'
    ], axis=1)
    tabla_5 = tabla_5.set_index(['center_id'])
    tabla_5 = tabla_5.fillna(0)
    # Coerce any leftover object-typed columns to float for the model.
    for col in tabla_5.select_dtypes(object):
        tabla_5[col] = tabla_5[col].astype(float)
    tabla_5 = tabla_5.fillna(0)

    features = tabla_5.drop(['violationcategory_public_health_hazard'],
                            axis=1)
    prds = model.predict(features)
    probas = model.predict_proba(features)
    res = pd.DataFrame({
        "center": tabla_5.index,
        "etiqueta": prds,
        "proba_0": probas[:, 0],
        "proba_1": probas[:, 1]
    })
    # BUG FIX: the original assigned 'score' with two strict comparisons
    # ('>' then '<'), leaving NaN when proba_0 == proba_1. The score is
    # simply the larger of the two class probabilities.
    res['score'] = res[['proba_0', 'proba_1']].max(axis=1)

    categorias_1 = [
        "programtype_all_age_camp", "programtype_infant_toddler",
        "programtype_preschool", "programtype_preschool_camp",
        "programtype_school_age_camp"
    ]
    programtype = pd.get_dummies(centros[categorias_1]).idxmax(1)
    categorias_2 = [
        "borough_bronx", "borough_brooklyn", "borough_manhattan",
        "borough_queens", "borough_staten_island"
    ]
    borough = pd.get_dummies(centros[categorias_2]).idxmax(1)
    ambas = pd.concat([borough, programtype], axis=1)
    ambas = ambas.rename(columns={0: 'borough', 1: 'programtype'})
    tabla_1 = pd.concat([centros, ambas], axis=1)
    tabla_2 = pd.merge(res, tabla_1, left_on='center', right_on='center_id')

    def _decode_dummies(frame, target, dummy_cols, prefix):
        # Vectorized replacement for the original O(n^2) per-row elif
        # chains: the first dummy column whose value stringifies to "1"
        # wins (same string comparison and same priority order as the
        # original loops); rows matching no column keep their existing
        # value, exactly as the original elif chain did.
        assigned = pd.Series(False, index=frame.index)
        for col in dummy_cols:
            hit = frame[col].astype(str) == "1"
            frame.loc[hit & ~assigned, target] = col[len(prefix):]
            assigned |= hit

    _decode_dummies(tabla_2, 'borough', categorias_2, 'borough_')
    tabla_2.drop(categorias_2, axis=1, inplace=True)
    _decode_dummies(tabla_2, 'programtype', categorias_1, 'programtype_')
    tabla_2.drop(categorias_1, axis=1, inplace=True)

    tabla_6 = tabla_2.loc[:, [
        'center', 'etiqueta', 'score', 'borough', 'programtype'
    ]]
    tabla_6 = tabla_6.rename(columns={'etiqueta': 'label_value'})
    tabla_6.set_index('center', inplace=True)

    # Aequitas audit: crosstabs -> disparity vs. fixed reference groups ->
    # group-value fairness.
    g = Group()
    xtab, _ = g.get_crosstabs(tabla_6)
    b = Bias()
    bdf = b.get_disparity_predefined_groups(
        xtab,
        original_df=tabla_6,
        ref_groups_dict={
            'borough': 'brooklyn',
            'programtype': 'preschool'
        },
        alpha=0.05,
        mask_significance=True)
    f = Fairness()
    fdf = f.get_group_value_fairness(bdf)
    fdf['model_id'] = self.model_id
    fdf['date'] = self.date_param
    self.output_table = fdf
    # "for" is a reserved word downstream, so mangle it in column names.
    return [tuple(x) for x in fdf.to_numpy()], [
        (c.replace("for", "forr").replace(" ", "_"), 'VARCHAR')
        for c in list(fdf.columns)
    ]
def audit(df, configs, preprocessed=False):
    """Run a full Aequitas audit (group, bias, fairness) over ``df``.

    :param df: input dataframe containing score/label columns plus the
        protected attribute columns to audit.
    :param configs: audit configuration object (``attr_cols``,
        ``score_thresholds``, ``ref_groups_method``, ``ref_groups``,
        ``fairness_threshold``, significance and plotting options, ...).
    :param preprocessed: when False, ``preprocess_input_df`` is applied
        first and the discovered attribute columns are used if the configs
        do not name any.
    :return: tuple ``(group_value_df, report)`` where ``report`` is a
        markdown string when ``configs.report`` is True, else None.
    """
    if not preprocessed:
        df, attr_cols_input = preprocess_input_df(df)
        if not configs.attr_cols:
            configs.attr_cols = attr_cols_input
    g = Group()
    print('Welcome to Aequitas-Audit')
    print('Fairness measures requested:',
          ','.join(configs.fair_measures_requested))
    groups_model, attr_cols = g.get_crosstabs(
        df,
        score_thresholds=configs.score_thresholds,
        attr_cols=configs.attr_cols)
    print('audit: df shape from the crosstabs:', groups_model.shape)
    b = Bias()
    # todo move this to the new configs object / the attr_cols now are
    # passed through the configs object...
    ref_groups_method = configs.ref_groups_method
    if ref_groups_method == 'predefined' and configs.ref_groups:
        bias_df = b.get_disparity_predefined_groups(
            groups_model,
            df,
            configs.ref_groups,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)
    elif ref_groups_method == 'majority':
        bias_df = b.get_disparity_major_group(
            groups_model,
            df,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)
    else:
        bias_df = b.get_disparity_min_metric(
            df=groups_model,
            original_df=df,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            label_score_ref='fpr',
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)
    print('Any NaN?: ', bias_df.isnull().values.any())
    print('bias_df shape:', bias_df.shape)
    aqp = Plot()
    if configs.plot_bias_metrics:
        if len(configs.plot_bias_metrics) == 1:
            fig1 = aqp.plot_group_metric(
                bias_df, group_metric=configs.plot_bias_metrics[0])
        elif len(configs.plot_bias_metrics) > 1:
            # BUG FIX: the original passed configs.plot_disparity_attributes
            # here; the group-metric plot takes the bias *metrics* list,
            # mirroring the plot_fairness_group_all call below.
            fig1 = aqp.plot_group_metric_all(
                bias_df, metrics=configs.plot_bias_metrics)
        if (len(configs.plot_bias_disparities) == 1) and (
                len(configs.plot_disparity_attributes) == 1):
            fig2 = aqp.plot_disparity(
                bias_df,
                group_metric=configs.plot_bias_disparities[0],
                attribute_name=configs.plot_disparity_attributes[0])
        elif (len(configs.plot_bias_disparities) > 1) or (
                len(configs.plot_disparity_attributes) > 1):
            fig2 = aqp.plot_disparity_all(
                bias_df,
                metrics=configs.plot_bias_disparities,
                attributes=configs.plot_disparity_attributes)
    f = Fairness(tau=configs.fairness_threshold)
    print('Fairness Threshold:', configs.fairness_threshold)
    print('Fairness Measures:', configs.fair_measures_requested)
    group_value_df = f.get_group_value_fairness(
        bias_df, fair_measures_requested=configs.fair_measures_requested)
    group_attribute_df = f.get_group_attribute_fairness(
        group_value_df,
        fair_measures_requested=configs.fair_measures_requested)
    fair_results = f.get_overall_fairness(group_attribute_df)
    if configs.plot_bias_metrics:
        if len(configs.plot_bias_metrics) == 1:
            fig3 = aqp.plot_fairness_group(
                group_value_df, group_metric=configs.plot_bias_metrics[0])
        elif len(configs.plot_bias_metrics) > 1:
            fig3 = aqp.plot_fairness_group_all(
                group_value_df, metrics=configs.plot_bias_metrics)
        if (len(configs.plot_bias_disparities) == 1) and (
                len(configs.plot_disparity_attributes) == 1):
            fig4 = aqp.plot_fairness_disparity(
                group_value_df,
                group_metric=configs.plot_bias_disparities[0],
                attribute_name=configs.plot_disparity_attributes[0])
        elif (len(configs.plot_bias_disparities) > 1) or (
                len(configs.plot_disparity_attributes) > 1):
            fig4 = aqp.plot_fairness_disparity_all(
                group_value_df,
                metrics=configs.plot_bias_disparities,
                attributes=configs.plot_disparity_attributes)
    print(fair_results)
    report = None
    if configs.report is True:
        report = audit_report_markdown(configs, group_value_df,
                                       f.fair_measures_depend, fair_results)
    return group_value_df, report
# Rename the target column to 'label_value', the name Aequitas expects.
y_test = Y_test.rename(columns={"y": "label_value"})
y_test.reset_index(drop=True, inplace=True)
y_test.shape

# Join the dataframes to build the Aequitas input data.
datos_aequitas = pd.concat([predicciones, y_test, X_test, X_test_df], axis=1)

# Data preparation for Aequitas ends here ------------------------------------------------------------

# Keep only the columns needed for the FNR calculation:
# score, label and the weekday attribute.
datos_aequitas = datos_aequitas[['score', 'label_value', 'day_sem']]
datos_aequitas.head()

# Aequitas installation
# pip install aequitas

g = Group()
xtab, _ = g.get_crosstabs(datos_aequitas)
# xtab holds the crosstab counts backing the FP, FN, TP, TN metrics.

# Bias calculation, using Friday ('e:viernes') as the reference group.
b = Bias()
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=datos_aequitas,
    ref_groups_dict={'day_sem': 'e:viernes'},
    alpha=0.05,
    check_significance=False)

# Fairness calculation
f = Fairness()
fdf = f.get_group_value_fairness(bdf)
# Same reference group
# Using the gender and race fields, we will prepare the data for the Aequitas Toolkit. # In[165]: # Aequitas from aequitas.preprocessing import preprocess_input_df from aequitas.group import Group from aequitas.plotting import Plot from aequitas.bias import Bias from aequitas.fairness import Fairness ae_subset_df = pred_test_df[['race', 'gender', 'score', 'label_value']] ae_df, _ = preprocess_input_df(ae_subset_df) g = Group() xtab, _ = g.get_crosstabs(ae_df) absolute_metrics = g.list_absolute_metrics(xtab) clean_xtab = xtab.fillna(-1) aqp = Plot() b = Bias() # ## Reference Group Selection # Below we have chosen the reference group for our analysis but feel free to select another one. # In[166]: # test reference group with Caucasian Male
def _write_audit_to_db(self, model_id, protected_df, predictions_proba,
                       labels, tie_breaker, subset_hash, matrix_type,
                       evaluation_start_time, evaluation_end_time,
                       matrix_uuid):
    """ Runs the bias audit and saves the result in the bias table.

    Args:
        model_id (int) primary key of the model
        protected_df (pandas.DataFrame) A dataframe with protected group attributes:
        predictions_proba (np.array) List of prediction probabilities
        labels (pandas.Series): List of labels
        tie_breaker: 'best' or 'worst' case tiebreaking rule that the
            predictions and labels were sorted by
        subset_hash (str) the hash of the subset, if any, that the
            evaluation is made on
        matrix_type (triage.component.catwalk.storage.MatrixType)
            The type of matrix used
        evaluation_start_time (pandas._libs.tslibs.timestamps.Timestamp)
            first as_of_date included in the evaluation period
        evaluation_end_time (pandas._libs.tslibs.timestamps.Timestamp)
            last as_of_date included in the evaluation period
        matrix_uuid: the uuid of the matrix

    Returns:
        None. Existing audit rows for the same evaluation key are deleted
        and the new fairness results are bulk-inserted into the aequitas
        table for the given matrix type.
    """
    # Nothing to audit without protected attributes.
    if protected_df.empty:
        return
    # to preprocess aequitas requires the following columns:
    # score, label value, model_id, protected attributes
    # fill out the protected_df, which just has protected attributes at this point
    protected_df = protected_df.copy()
    protected_df['model_id'] = model_id
    protected_df['score'] = predictions_proba
    protected_df['label_value'] = labels
    aequitas_df, attr_cols_input = preprocess_input_df(protected_df)

    # create group crosstabs
    g = Group()
    score_thresholds = {}
    score_thresholds['rank_abs'] = self.bias_config['thresholds'].get(
        'top_n', [])
    # convert 0-100 percentile to 0-1 that Aequitas expects
    score_thresholds['rank_pct'] = [
        value / 100.0
        for value in self.bias_config['thresholds'].get('percentiles', [])
    ]
    groups_model, attr_cols = g.get_crosstabs(
        aequitas_df,
        score_thresholds=score_thresholds,
        attr_cols=attr_cols_input)

    # analyze bias from reference groups; the method comes from config,
    # falling back to the min-metric reference when unset/unknown.
    bias = Bias()
    ref_groups_method = self.bias_config.get('ref_groups_method', None)
    if ref_groups_method == 'predefined' and self.bias_config['ref_groups']:
        bias_df = bias.get_disparity_predefined_groups(
            groups_model, aequitas_df, self.bias_config['ref_groups'])
    elif ref_groups_method == 'majority':
        bias_df = bias.get_disparity_major_group(groups_model, aequitas_df)
    else:
        bias_df = bias.get_disparity_min_metric(groups_model, aequitas_df)

    # analyze fairness for each group
    f = Fairness(tau=0.8)  # the default fairness threshold is 0.8
    group_value_df = f.get_group_value_fairness(bias_df)
    group_value_df['subset_hash'] = subset_hash
    group_value_df['tie_breaker'] = tie_breaker
    group_value_df['evaluation_start_time'] = evaluation_start_time
    group_value_df['evaluation_end_time'] = evaluation_end_time
    group_value_df['matrix_uuid'] = matrix_uuid
    # Rename columns that clash with the ORM table's naming:
    # 'score_threshold' -> 'parameter', and 'for' (a Python keyword)
    # -> 'for_'.
    group_value_df = group_value_df.rename(index=str,
                                           columns={
                                               "score_threshold": "parameter",
                                               "for": "for_"
                                           })
    if group_value_df.empty:
        raise ValueError(f"""
            Bias audit: aequitas_audit() failed. Returned empty dataframe for
            model_id = {model_id}, and subset_hash = {subset_hash}
            and matrix_type = {matrix_type}""")
    with scoped_session(self.db_engine) as session:
        # Delete any previous audit rows with the same evaluation key so the
        # bulk insert below effectively acts as an upsert.
        for index, row in group_value_df.iterrows():
            session.query(matrix_type.aequitas_obj).filter_by(
                model_id=row['model_id'],
                evaluation_start_time=row['evaluation_start_time'],
                evaluation_end_time=row['evaluation_end_time'],
                subset_hash=row['subset_hash'],
                parameter=row['parameter'],
                tie_breaker=row['tie_breaker'],
                matrix_uuid=row['matrix_uuid'],
                attribute_name=row['attribute_name'],
                attribute_value=row['attribute_value']).delete()
        session.bulk_insert_mappings(
            matrix_type.aequitas_obj,
            group_value_df.to_dict(orient="records"))