def get_disparity_min_metric(self, df, original_df, key_columns=None,
                             input_group_metrics=None, fill_divbyzero=None,
                             check_significance=False, alpha=5e-2,
                             mask_significance=True, label_score_ref='fpr',
                             selected_significance=False):
    """
    Calculates disparities between groups for the predefined list of group
    metrics, using the group with the minimum value for each absolute bias
    metric as the reference group (denominator).

    :param df: output dataframe of the Group class get_crosstabs() method.
    :param original_df: a dataframe of sample features and model results.
        Includes a required 'score' column and possible 'label_value' column.
    :param key_columns: optional, key identifying columns for grouping
        variables and bias metrics in intermediate joins. Defaults are
        'model_id', 'score_threshold', 'attribute_name'.
    :param input_group_metrics: optional, list of columns corresponding to
        the group metrics for which we want to calculate disparity values.
    :param fill_divbyzero: optional, fill value to use when dividing by
        zero. Default is None.
    :param check_significance: whether to determine statistical significance
        for disparity metrics. Default is False.
    :param selected_significance: specific measures (beyond label_value and
        score) to which to limit statistical significance calculations when
        check_significance is True. Default is False, i.e., significance
        will be calculated for all metrics.
    :param alpha: statistical significance level to use in significance
        determination. Default is 5e-2 (0.05).
    :param mask_significance: whether to display a True/False mask over
        calculated p-values from statistical significance determination.
        Default is True.
    :param label_score_ref: default reference group to use for score and
        label_value statistical significance calculations.
    :return: a dataframe with the same number of rows as the input
        (crosstab), with additional disparity metric columns and
        ref_group_values for each metric.
    """
    print('get_disparity_min_metric()')

    # record df column order
    original_cols = df.columns

    if not key_columns:
        key_columns = original_cols.intersection(self.key_columns).tolist()
    if not input_group_metrics:
        input_group_metrics = self.input_group_metrics
    if not fill_divbyzero:
        fill_divbyzero = self.fill_divbyzero

    for group_metric in input_group_metrics:
        try:
            # this groupby is called on every iteration; a possible
            # optimization is to collect per-metric frames and merge once at
            # the end (the merge itself cannot simply be moved outside the
            # loop)
            idxmin = df.groupby(key_columns)[group_metric].idxmin()

            # if the entire column for a group metric is NaN, cast the min
            # value index to the same index as that of any other group for
            # that attribute
            if any(np.isnan(val) for val in idxmin.values):
                if len(idxmin) >= 1:
                    idxmin_not_nan = df.reset_index().groupby(
                        key_columns)['index'].min()
                    idxmin.loc[idxmin.isna()] = pd.merge(
                        left=idxmin.loc[idxmin.isna()],
                        right=idxmin_not_nan,
                        left_index=True,
                        right_index=True,
                        how='inner',
                    )['index'].values
                else:
                    raise Exception(
                        f"A minimum value for group_metric "
                        f"{group_metric} could not be calculated.")

            df_min_idx = df.loc[idxmin]

            # we also want the attribute_value of the reference group for
            # each bias metric
            df_to_merge = pd.DataFrame()
            df_to_merge[key_columns + [group_metric + '_disparity',
                                       group_metric + '_ref_group_value']] = \
                df_min_idx[key_columns + [group_metric, 'attribute_value']]
        except KeyError:
            raise KeyError(
                'get_disparity_min_metric:: one of the following columns is '
                'not in the input dataframe: model_id, score_threshold, '
                'attribute_name, or any of the input_group_metrics list')

        df = df.merge(df_to_merge, on=key_columns)

        # create disparities by dividing each group metric value by the
        # corresponding min value among the groups of the target attribute
        df[group_metric + '_disparity'] = \
            df[group_metric] / df[group_metric + '_disparity']

    # cap disparity values at fill_divbyzero (10.0 by default) when the
    # denominator is zero
    df = df.replace(np.inf, fill_divbyzero)

    if not check_significance:
        return df
    else:
        # add statistical significance
        if not selected_significance:
            selected_significance = self.input_group_metrics

        # only proceed with columns actually in the dataframe
        selected_significance = set(
            original_cols.intersection(selected_significance))

        # always include label and score significance
        selected_significance = selected_significance.union(
            {'label_value', 'score'})

        ref_groups_dict = assemble_ref_groups(
            df, ref_group_flag='_ref_group_value',
            specific_measures=selected_significance,
            label_score_ref=label_score_ref)

        attr_cols = df['attribute_name'].unique()

        # run significance method on the bias-augmented crosstab based on
        # false positives, false negatives, scores, and label values in the
        # original df
        self._get_statistical_significance(
            original_df, df, ref_dict=ref_groups_dict,
            score_thresholds=None, attr_cols=attr_cols, alpha=alpha,
            selected_significance=selected_significance)

        # if specified, apply True/False mask to significance columns
        if mask_significance:
            significance_cols = df.columns[df.columns.str.contains(
                '_significance')]
            truemask = df.loc[:, significance_cols] < alpha
            falsemask = df.loc[:, significance_cols] >= alpha
            df.loc[:, significance_cols] = np.select(
                [truemask, falsemask], [True, False], default=None)

        # order newly calculated metric columns: disparity, ref_group, then
        # significance for each
        base_sig = ['label_value_significance', 'score_significance']
        new_cols = sorted(
            list(set(df.columns) - set(original_cols) - set(base_sig)))
        return df[original_cols.tolist() + base_sig + new_cols]
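# Illustrative usage sketch (not part of this module): it assumes this method
# lives on an Aequitas-style Bias class and that `compas_df` is a dataframe
# with 'score', 'label_value', and attribute columns; `compas_df` and the
# variable names below are hypothetical.
#
#   from aequitas.group import Group
#
#   g = Group()
#   xtab, _ = g.get_crosstabs(compas_df)
#   bdf = Bias().get_disparity_min_metric(
#       xtab, original_df=compas_df, check_significance=True, alpha=0.05)
#   bdf[['attribute_name', 'attribute_value',
#        'fpr_disparity', 'fpr_ref_group_value']]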
def get_disparity_major_group(self, df, original_df, key_columns=None,
                              input_group_metrics=None, fill_divbyzero=None,
                              check_significance=False, alpha=5e-2,
                              mask_significance=True,
                              selected_significance=False):
    """
    Calculates disparities between groups for the predefined list of group
    metrics, using the majority group within each attribute as the reference
    group (denominator).

    :param df: output dataframe of the Group class get_crosstabs() method.
    :param original_df: a dataframe of sample features and model results.
        Includes a required 'score' column and possible 'label_value' column.
    :param key_columns: optional, key identifying columns for grouping
        variables and bias metrics in intermediate joins. Defaults are
        'model_id', 'score_threshold', 'attribute_name'.
    :param input_group_metrics: optional, list of columns corresponding to
        the group metrics for which we want to calculate disparity values.
    :param fill_divbyzero: optional, fill value to use when dividing by
        zero. Default is None.
    :param check_significance: whether to determine statistical significance
        for disparity metrics. Default is False.
    :param selected_significance: specific measures (beyond label_value and
        score) to which to limit statistical significance calculations when
        check_significance is True. Default is False, i.e., significance
        will be calculated for all metrics.
    :param alpha: statistical significance level to use in significance
        determination. Default is 5e-2 (0.05).
    :param mask_significance: whether to display a True/False mask over
        calculated p-values from statistical significance determination.
        Default is True.
    :return: a dataframe with the same number of rows as the input
        (crosstab), with additional disparity metric columns and
        ref_group_values for each metric.
    """
    print('get_disparity_major_group()')

    # record df column order
    original_cols = df.columns

    if not key_columns:
        key_columns = original_cols.intersection(self.key_columns).tolist()
    if not input_group_metrics:
        input_group_metrics = self.input_group_metrics
    if not fill_divbyzero:
        fill_divbyzero = self.fill_divbyzero

    try:
        df_major_group = df.loc[
            df.groupby(key_columns)['group_size'].idxmax()]
    except KeyError:
        raise KeyError(
            'get_disparity_major_group:: one of the following columns is '
            'not in the input dataframe: model_id, score_threshold, '
            'attribute_name, group_size')

    disparity_metrics = [col + '_disparity' for col in input_group_metrics]

    # df_to_merge is a subset of df_major_group holding the reference group
    # values, relabeled here as _disparity columns; the actual division
    # still needs to be performed
    df_to_merge = pd.DataFrame()
    df_to_merge[key_columns + disparity_metrics] = \
        df_major_group[key_columns + input_group_metrics]

    # create the ref_group_value columns in df_to_merge
    for col in input_group_metrics:
        df_to_merge[col + '_ref_group_value'] = \
            df_major_group['attribute_value']

    df = df.merge(df_to_merge, on=key_columns)
    df[disparity_metrics] = df[input_group_metrics].divide(
        df[disparity_metrics].values)

    # cap disparity values at fill_divbyzero (10.0 by default) when the
    # denominator is zero
    df = df.replace(np.inf, fill_divbyzero)

    # when both numerator and denominator are zero the division yields NaN,
    # so a 0/0 case is treated as a disparity of 1.0 (the groups are the
    # same)
    # default is to use the same ref groups as df; functionality still needs
    # to be added to compile ref_groups_dict based on a passed ref group for
    # a given measure
    if not check_significance:
        return df
    else:
        if not selected_significance:
            selected_significance = self.input_group_metrics

        # only proceed with columns actually in the dataframe
        selected_significance = set(
            original_cols.intersection(selected_significance))

        # always include label and score significance
        selected_significance = selected_significance.union(
            {'label_value', 'score'})

        ref_groups_dict = assemble_ref_groups(
            df, ref_group_flag='_ref_group_value',
            specific_measures=selected_significance,
            label_score_ref=None)

        attr_cols = df['attribute_name'].unique()

        # use the largest group for each attribute as the reference group
        # for label_value and score significance
        for attribute in attr_cols:
            largest_group = df_major_group.loc[
                df_major_group['attribute_name'] == attribute,
                'attribute_value'].values.tolist()[0]
            ref_groups_dict[attribute]['label_value'] = largest_group
            ref_groups_dict[attribute]['score'] = largest_group

        # run significance method on the bias-augmented crosstab based on
        # false positives, false negatives, scores, and label values in the
        # original df
        self._get_statistical_significance(
            original_df, df, ref_dict=ref_groups_dict,
            score_thresholds=None, attr_cols=attr_cols, alpha=alpha,
            selected_significance=selected_significance)

        # if specified, apply True/False mask to significance columns
        if mask_significance:
            significance_cols = df.columns[df.columns.str.contains(
                '_significance')]
            truemask = df.loc[:, significance_cols] < alpha
            falsemask = df.loc[:, significance_cols] >= alpha
            df.loc[:, significance_cols] = np.select(
                [truemask, falsemask], [True, False], default=None)

        # order newly calculated metric columns: disparity, ref_group, then
        # significance for each
        base_sig = ['label_value_significance', 'score_significance']
        new_cols = sorted(
            list(set(df.columns) - set(original_cols) - set(base_sig)))
        return df[original_cols.tolist() + base_sig + new_cols]
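# Illustrative usage sketch (hypothetical names, mirroring the example after
# get_disparity_min_metric): the majority-group variant is called the same
# way, with the largest group per attribute used as the denominator.
#
#   bdf = Bias().get_disparity_major_group(
#       xtab, original_df=compas_df, check_significance=True)
#
# For reference, assemble_ref_groups() is expected to return a nested dict of
# attribute -> measure -> reference group value, e.g. (values hypothetical):
#
#   {'race': {'fpr': 'Caucasian', 'fnr': 'Caucasian'},
#    'sex':  {'fpr': 'Male', 'fnr': 'Male'}}
#
# with 'label_value' and 'score' entries filled per attribute by the loop
# above before the dict is passed to _get_statistical_significance().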