Example 1
    def get_disparity_min_metric(self,
                                 df,
                                 original_df,
                                 key_columns=None,
                                 input_group_metrics=None,
                                 fill_divbyzero=None,
                                 check_significance=False,
                                 alpha=5e-2,
                                 mask_significance=True,
                                 label_score_ref='fpr',
                                 selected_significance=False):
        """
        Calculates disparities between groups for the predefined list of
        group metrics using the group with the minimum value for each absolute
        bias metric as the reference group (denominator).

        :param df: output dataframe of Group class get_crosstabs() method.
        :param original_df: a dataframe of sample features and model results.
            Includes a required 'score' column and an optional 'label_value' column.
        :param key_columns: optional, key identifying columns for grouping
            variables and bias metrics in intermediate joins. Defaults are
            'model_id', 'score_threshold', 'attribute_name'.
        :param input_group_metrics: optional, list of columns corresponding to
            the group metrics for which disparity values should be calculated.
        :param fill_divbyzero: optional, fill value to use when a disparity
            ratio divides by zero. Default is None (falls back to
            self.fill_divbyzero).
        :param check_significance: whether to determine statistical significance
            for disparity metrics. Default is False.
        :param selected_significance: specific measures (beyond label_value and
            score) to which to limit statistical significance calculations when
            check_significance is True. Default is False, i.e., significance
            will be calculated for all metrics.
        :param alpha: statistical significance level to use in significance
            determination. Default is 5e-2 (0.05).
        :param mask_significance: whether to display a T/F mask over calculated
            p-values from statistical significance determination. Default is True.
        :param label_score_ref: metric whose reference group is used for the
            score and label_value statistical significance calculations.
            Default is 'fpr'.
        :return: A dataframe with the same number of rows as the input (crosstab),
            with additional disparity metric columns and ref_group_values
            for each metric.
        """

        print('get_disparity_min_metric()')
        # record df column order
        original_cols = df.columns

        if not key_columns:
            key_columns = original_cols.intersection(self.key_columns).tolist()
        if not input_group_metrics:
            input_group_metrics = self.input_group_metrics
        if not fill_divbyzero:
            fill_divbyzero = self.fill_divbyzero

        for group_metric in input_group_metrics:

            try:
                # this groupby runs on every iteration; the per-metric frames
                # could be collected in a list and merged once at the end, but
                # the merge itself cannot simply be hoisted out of the loop
                idxmin = df.groupby(key_columns)[group_metric].idxmin()

                # if entire column for a group metric is NaN, cast min value index
                # column to the same index as that of any other group for that attribute
                if any(np.isnan(val) for val in idxmin.values):
                    if len(idxmin) >= 1:
                        idxmin_not_nan = df.reset_index().groupby(
                            key_columns)['index'].min()
                        idxmin.loc[idxmin.isna()] = pd.merge(
                            left=idxmin.loc[idxmin.isna()],
                            right=idxmin_not_nan,
                            left_index=True,
                            right_index=True,
                            how='inner',
                        )['index'].values
                    else:
                        raise Exception(
                            f"A minimum value for group_metric "
                            f"{group_metric} could not be calculated.")

                df_min_idx = df.loc[idxmin]

                # but we also want to get the group_value of the reference group
                # for each bias metric
                df_to_merge = pd.DataFrame()
                df_to_merge[key_columns + [group_metric + '_disparity', group_metric +
                                           '_ref_group_value']] = \
                    df_min_idx[key_columns + [group_metric, 'attribute_value']]

            except KeyError:
                raise KeyError(
                    'get_disparity_min_metric:: one of the following columns is '
                    'not in the input dataframe: model_id, score_threshold, '
                    'attribute_name, or one of the input_group_metrics list')

            df = df.merge(df_to_merge, on=key_columns)
            # creating disparity by dividing each group metric value by the
            # corresponding min value from the groups of the target attribute
            df[group_metric + '_disparity'] = \
                df[group_metric] / df[group_metric + '_disparity']
            # when the reference value is zero the division yields inf; those
            # values are capped at fill_divbyzero below
        df = df.replace(np.inf, fill_divbyzero)

        if not check_significance:
            return df

        else:
            # add statistical_significance
            if not selected_significance:
                selected_significance = self.input_group_metrics

            # only proceed with columns actually in dataframe
            selected_significance = set(
                original_cols.intersection(selected_significance))

            # always includes label and score significance
            selected_significance = selected_significance.union(
                {'label_value', 'score'})

            ref_groups_dict = assemble_ref_groups(
                df,
                ref_group_flag='_ref_group_value',
                specific_measures=selected_significance,
                label_score_ref=label_score_ref)

            attr_cols = df['attribute_name'].unique()
            # run significance method on bias-augmented crosstab based on false
            # positives, false negatives, scores, and label values in original df
            self._get_statistical_significance(
                original_df,
                df,
                ref_dict=ref_groups_dict,
                score_thresholds=None,
                attr_cols=attr_cols,
                alpha=alpha,
                selected_significance=selected_significance)

            # if specified, apply T/F mask to significance columns
            if mask_significance:
                significance_cols = df.columns[df.columns.str.contains(
                    '_significance')]
                truemask = df.loc[:, significance_cols] < alpha
                falsemask = df.loc[:, significance_cols] >= alpha

                df.loc[:, significance_cols] = np.select([truemask, falsemask],
                                                         [True, False],
                                                         default=None)

            # order new calculated metric columns: disparity, ref_group, then
            # significance for each
            base_sig = ['label_value_significance', 'score_significance']

            new_cols = sorted(
                list(set(df.columns) - set(original_cols) - set(base_sig)))

            return df[original_cols.tolist() + base_sig + new_cols]
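
The core of get_disparity_min_metric() is a per-attribute idxmin lookup followed by a merge and an element-wise division. The snippet below is a minimal standalone sketch of that computation on a hypothetical two-attribute crosstab; the frame xtab, its values, and the fpr_ref column name are illustrative only and simply mirror the crosstab schema described in the docstring.

import pandas as pd

# toy crosstab: one row per (attribute_name, attribute_value) group
xtab = pd.DataFrame({
    'model_id': [1, 1, 1, 1],
    'score_threshold': ['binary 0/1'] * 4,
    'attribute_name': ['race', 'race', 'sex', 'sex'],
    'attribute_value': ['black', 'white', 'female', 'male'],
    'fpr': [0.30, 0.10, 0.20, 0.25],
})

key_columns = ['model_id', 'score_threshold', 'attribute_name']

# index of the row with the minimum fpr within each attribute
idxmin = xtab.groupby(key_columns)['fpr'].idxmin()
ref = xtab.loc[idxmin, key_columns + ['fpr', 'attribute_value']].rename(
    columns={'fpr': 'fpr_ref', 'attribute_value': 'fpr_ref_group_value'})

# every group is divided by its attribute's minimum-fpr reference value
out = xtab.merge(ref, on=key_columns)
out['fpr_disparity'] = out['fpr'] / out['fpr_ref']
print(out[['attribute_value', 'fpr', 'fpr_disparity', 'fpr_ref_group_value']])
# e.g. 'black' gets fpr_disparity 0.30 / 0.10 = 3.0, with 'white' as reference group
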
Example 2
    def get_disparity_min_metric(self,
                                 df,
                                 original_df,
                                 key_columns=None,
                                 input_group_metrics=None,
                                 fill_divbyzero=None,
                                 check_significance=None,
                                 alpha=5e-2,
                                 mask_significance=True,
                                 label_score_ref='fpr'):
        """
        Calculates disparity ratios by dividing each group metric value by the
        minimum group metric value among all groups defined by each attribute.

        :param df: output dataframe of the Group class get_crosstabs() method.
        :param original_df: a dataframe containing a required 'score' column
            and an optional 'label_value' column.
        :param key_columns: optional, the key columns to use on joins.
            Defaults are 'model_id', 'score_threshold', 'attribute_name'.
        :param input_group_metrics: optional, list of columns corresponding to
            the group metrics for which disparity values should be calculated.
        :param fill_divbyzero: optional, fill value to use when divided by zero
        :param check_significance: Measures for which to determine statistical
            significance beyond label_value and score. Defaults are 'fpr' and 'fnr'.
        :param alpha: Level at which to determine statistical significance.
            Default is 5e-2 (0.05).
        :param mask_significance: Whether to display a T/F mask over calculated
            p-values from statistical significance determination. Default is True.
        :param label_score_ref: Metric whose reference group is used for the
            score and label_value statistical significance calculations.
            Default is 'fpr'.

        :return: a dataframe with the same number of rows as the input (crosstab)
            but with additional disparity metric columns, ref_group_values, and
            statistical significance of specific metrics.
        """

        print('get_disparity_min_metric()')
        # record df column order
        df_cols = df.columns

        if not key_columns:
            key_columns = df_cols.intersection(self.key_columns).tolist()
        if not input_group_metrics:
            input_group_metrics = self.input_group_metrics
        if not fill_divbyzero:
            fill_divbyzero = self.fill_divbyzero
        if not check_significance:
            check_significance = self.significance_cols

        for group_metric in input_group_metrics:

            try:
                # this groupby runs on every iteration; the per-metric frames
                # could be collected in a list and merged once at the end, but
                # the merge itself cannot simply be hoisted out of the loop
                idxmin = df.groupby(key_columns)[group_metric].idxmin()

                # if entire column for a group metric is NaN, cast min value
                # index column to same index as any other group for that attribute
                if any(np.isnan(val) for val in idxmin.values):
                    if len(idxmin) >= 1:
                        idxmin.loc[idxmin.isna()] = df.loc[
                            df["attribute_name"].isin(
                                idxmin.index.get_level_values(
                                    'attribute_name').values)].index[0]
                    else:
                        logging.error(
                            f"A minimum value for group_metric "
                            f"{group_metric} could not be calculated.")
                        continue

                df_min_idx = df.loc[idxmin]

                # but we also want to get the group_value of the reference group
                # for each bias metric
                df_to_merge = pd.DataFrame()
                df_to_merge[key_columns + [group_metric + '_disparity', group_metric +
                                           '_ref_group_value']] = \
                    df_min_idx[key_columns + [group_metric, 'attribute_value']]
            except KeyError:
                logging.error(
                    'get_disparity_min_metric:: one of the following columns is '
                    'not in the input dataframe: model_id, score_threshold, '
                    'attribute_name, or one of the input_group_metrics list')

            df = df.merge(df_to_merge, on=key_columns)
            # creating disparity by dividing each group metric value by the
            # corresponding min value from the groups of the target attribute
            df[group_metric + '_disparity'] = \
                df[group_metric] / df[group_metric + '_disparity']
            # when the reference value is zero the division yields inf; those
            # values are capped at fill_divbyzero below
        df = df.replace(np.inf, fill_divbyzero)

        # add statistical_significance
        check_significance = df_cols.intersection(check_significance).tolist()

        ref_groups_dict = assemble_ref_groups(
            df,
            ref_group_flag='_ref_group_value',
            specific_measures=check_significance,
            label_score_ref=label_score_ref)

        attr_cols = df['attribute_name'].unique()

        # run significance method on bias-augmented crosstab based on false
        # positives, false negatives, scores, and label values in original df
        self.get_statistical_significance(original_df,
                                          df,
                                          ref_dict=ref_groups_dict,
                                          score_thresholds=None,
                                          model_id=1,
                                          attr_cols=attr_cols,
                                          alpha=alpha)

        # if specified, apply T/F mask to significance columns
        if mask_significance:
            significance_cols = df.columns[df.columns.str.contains(
                '_significance')]
            truemask = df.loc[:, significance_cols] < alpha
            falsemask = df.loc[:, significance_cols] >= alpha

            df.loc[:, significance_cols] = np.select([truemask, falsemask],
                                                     [True, False],
                                                     default=None)

        # check what new disparity columns are and order as disparity,
        # ref_group, significance for each
        new_cols = sorted(
            list(
                set(df.columns) - set(df_cols) -
                {'label_value_significance', 'score_significance'}))

        return df[df_cols.tolist() +
                  ['label_value_significance', 'score_significance'] +
                  new_cols]
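
The p-value masking step is identical in both variants and can be looked at in isolation. Below is a small sketch with made-up column names and p-values; it only demonstrates how np.select turns p-values into a True/False mask while leaving missing p-values as None.

import numpy as np
import pandas as pd

alpha = 0.05
df = pd.DataFrame({'fpr_significance': [0.01, 0.20, np.nan],
                   'fnr_significance': [0.04, 0.06, 0.30]})

significance_cols = df.columns[df.columns.str.contains('_significance')]
truemask = df[significance_cols] < alpha
falsemask = df[significance_cols] >= alpha

# p < alpha -> True, p >= alpha -> False, anything else (e.g. NaN) -> None
df[significance_cols] = np.select([truemask, falsemask], [True, False],
                                  default=None)
print(df)
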
Example 3
    def get_disparity_major_group(self,
                                  df,
                                  original_df,
                                  key_columns=None,
                                  input_group_metrics=None,
                                  fill_divbyzero=None,
                                  check_significance=False,
                                  alpha=5e-2,
                                  mask_significance=True,
                                  selected_significance=False):
        """
        Calculates disparities between groups for the predefined list of group
        metrics using the majority group within each attribute as the reference
        group (denominator).

        :param df: output dataframe of Group class get_crosstabs() method.
        :param original_df: a dataframe of sample features and model results.
            Includes a required 'score' column and an optional 'label_value' column.
        :param key_columns: optional, key identifying columns for grouping
            variables and bias metrics in intermediate joins. Defaults are
            'model_id', 'score_threshold', 'attribute_name'.
        :param input_group_metrics: optional, list of columns corresponding to
            the group metrics for which disparity values should be calculated.
        :param fill_divbyzero: optional, fill value to use when a disparity
            ratio divides by zero. Default is None (falls back to
            self.fill_divbyzero).
        :param check_significance: whether to determine statistical significance
            for disparity metrics. Default is False.
        :param selected_significance: specific measures (beyond label_value and
            score) to which to limit statistical significance calculations when
            check_significance is True. Default is False, i.e., significance
            will be calculated for all metrics.
        :param alpha: statistical significance level to use in significance
            determination. Default is 5e-2 (0.05).
        :param mask_significance: whether to display a T/F mask over calculated
            p-values from statistical significance determination. Default is True.
        :return: A dataframe with the same number of rows as the input (crosstab),
            with additional disparity metric columns and ref_group_values
            for each metric.
        """
        print('get_disparity_major_group()')
        # record df column order
        original_cols = df.columns

        if not key_columns:
            key_columns = original_cols.intersection(self.key_columns).tolist()
        if not input_group_metrics:
            input_group_metrics = self.input_group_metrics
        if not fill_divbyzero:
            fill_divbyzero = self.fill_divbyzero

        try:
            df_major_group = df.loc[df.groupby(key_columns)
                                    ['group_size'].idxmax()]
        except KeyError:
            raise KeyError(
                'get_disparity_major_group:: one of the following columns '
                'is not in the input dataframe: model_id, score_threshold, '
                'attribute_name, group_size')

        disparity_metrics = [col + '_disparity' for col in input_group_metrics]
        df_to_merge = pd.DataFrame()

        # df_to_merge is a subset of the reference-group rows: it holds the
        # target reference values, already labeled as '_disparity' columns,
        # but the division itself still has to be performed
        df_to_merge[key_columns +
                    disparity_metrics] = df_major_group[key_columns +
                                                        input_group_metrics]

        # we now need to create the ref_group_value columns in the df_to_merge
        for col in input_group_metrics:
            df_to_merge[col +
                        '_ref_group_value'] = df_major_group['attribute_value']
        df = df.merge(df_to_merge, on=key_columns)
        df[disparity_metrics] = df[input_group_metrics].divide(
            df[disparity_metrics].values)

        # when the reference value is zero the division yields inf; those
        # values are capped at fill_divbyzero below
        df = df.replace(np.inf, fill_divbyzero)

        # a 0/0 division yields NaN; in that case the groups are identical, so
        # a disparity of 1.0 is assumed

        # by default the same reference groups as in df are used; compiling
        # ref_groups_dict from a user-supplied reference group per measure is
        # not yet supported
        if not check_significance:
            return df

        else:
            if not selected_significance:
                selected_significance = self.input_group_metrics

            # only proceed with columns actually in dataframe
            selected_significance = set(
                original_cols.intersection(selected_significance))

            # always includes label and score significance
            selected_significance = selected_significance.union(
                {'label_value', 'score'})

            ref_groups_dict = assemble_ref_groups(
                df,
                ref_group_flag='_ref_group_value',
                specific_measures=selected_significance,
                label_score_ref=None)

            attr_cols = df['attribute_name'].unique()

            for attribute in attr_cols:
                largest_group = df_major_group.loc[
                    df_major_group['attribute_name'] == attribute,
                    'attribute_value'].values.tolist()[0]
                ref_groups_dict[attribute]['label_value'] = largest_group
                ref_groups_dict[attribute]['score'] = largest_group

            # run significance method on bias-augmented crosstab based on false
            # positives, false negatives, scores, and label values in original df
            self._get_statistical_significance(
                original_df,
                df,
                ref_dict=ref_groups_dict,
                score_thresholds=None,
                attr_cols=attr_cols,
                alpha=alpha,
                selected_significance=selected_significance)

            # if specified, apply T/F mask to significance columns
            if mask_significance:
                significance_cols = df.columns[df.columns.str.contains(
                    '_significance')]
                truemask = df.loc[:, significance_cols] < alpha
                falsemask = df.loc[:, significance_cols] >= alpha

                df.loc[:, significance_cols] = np.select([truemask, falsemask],
                                                         [True, False],
                                                         default=None)

            # check what new disparity columns are and order as disparity,
            # ref_group, significance for each
            base_sig = ['label_value_significance', 'score_significance']

            new_cols = sorted(
                list(set(df.columns) - set(original_cols) - set(base_sig)))

            return df[original_cols.tolist() + base_sig + new_cols]
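
For the major-group variant the reference row is simply the largest group per attribute. The sketch below is illustrative only (a hypothetical frame and values); it also shows why the '_disparity' column briefly holds the reference value before the division overwrites it, mirroring the pattern used above.

import pandas as pd

xtab = pd.DataFrame({
    'model_id': [1, 1, 1],
    'score_threshold': ['binary 0/1'] * 3,
    'attribute_name': ['race', 'race', 'race'],
    'attribute_value': ['black', 'white', 'other'],
    'group_size': [300, 600, 100],
    'fpr': [0.30, 0.15, 0.20],
})

key_columns = ['model_id', 'score_threshold', 'attribute_name']

# reference row = largest group within each attribute
major = xtab.loc[xtab.groupby(key_columns)['group_size'].idxmax()]
ref = major[key_columns + ['fpr', 'attribute_value']].rename(
    columns={'fpr': 'fpr_disparity', 'attribute_value': 'fpr_ref_group_value'})

# after the merge, 'fpr_disparity' still holds the reference value;
# the division below replaces it with the actual ratio
out = xtab.merge(ref, on=key_columns)
out['fpr_disparity'] = out['fpr'].divide(out['fpr_disparity'])
print(out[['attribute_value', 'fpr', 'fpr_disparity', 'fpr_ref_group_value']])
# the majority group 'white' gets disparity 1.0; 'black' gets 0.30 / 0.15 = 2.0
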
Example 4
    def get_disparity_major_group(self,
                                  df,
                                  original_df,
                                  key_columns=None,
                                  input_group_metrics=None,
                                  fill_divbyzero=None,
                                  check_significance=None,
                                  alpha=5e-2,
                                  mask_significance=True,
                                  label_score_ref='fpr'):
        """
        Calculates the bias (disparity) metrics for the predefined list of group
        metrics using the majority group within each attribute as the reference
        group (denominator).

        :param df: output dataframe of the Group class get_crosstabs() method.
        :param original_df: a dataframe containing a required 'score' column
            and an optional 'label_value' column.
        :param key_columns: optional, the key columns to use on joins.
            Defaults are 'model_id', 'score_threshold', 'attribute_name'.
        :param input_group_metrics: optional, list of columns corresponding to
            the group metrics for which disparity values should be calculated.
        :param fill_divbyzero: optional, fill value to use when divided by zero
        :param check_significance: Measures for which to determine statistical
            significance beyond label_value and score. Defaults are 'fpr' and 'fnr'.
        :param alpha: Level at which to determine statistical significance.
            Default is 5e-2 (0.05).
        :param mask_significance: Whether to display a T/F mask over calculated
            p-values from statistical significance determination. Default is True.
        :param label_score_ref: Metric whose reference group is used for the
            score and label_value statistical significance calculations.
            Default is 'fpr'.

        :return: a dataframe with the same number of rows as the input (crosstab)
            but with additional disparity metric columns, ref_group_values, and
            statistical significance of specific metrics.
        """
        print('get_disparity_major_group()')
        # record df column order
        df_cols = df.columns

        if not key_columns:
            key_columns = df_cols.intersection(self.key_columns).tolist()
        if not input_group_metrics:
            input_group_metrics = self.input_group_metrics
        if not fill_divbyzero:
            fill_divbyzero = self.fill_divbyzero
        if not check_significance:
            check_significance = self.significance_cols

        try:
            df_major_group = df.loc[df.groupby(key_columns)
                                    ['group_size'].idxmax()]
        except KeyError:
            logging.error(
                'get_disparity_major_group:: one of the following columns '
                'is not in the input dataframe: model_id, score_threshold, '
                'attribute_name, group_size')

        disparity_metrics = [col + '_disparity' for col in input_group_metrics]
        df_to_merge = pd.DataFrame()
        # df_to_merge is a subset of the reference-group rows: it holds the
        # target reference values, already labeled as '_disparity' columns,
        # but the division itself still has to be performed
        df_to_merge[key_columns +
                    disparity_metrics] = df_major_group[key_columns +
                                                        input_group_metrics]
        # we now need to create the ref_group_value columns in the df_to_merge
        for col in input_group_metrics:
            df_to_merge[col +
                        '_ref_group_value'] = df_major_group['attribute_value']
        df = df.merge(df_to_merge, on=key_columns)
        df[disparity_metrics] = df[input_group_metrics].divide(
            df[disparity_metrics].values)
        # when the reference value is zero the division yields inf; those
        # values are capped at fill_divbyzero below
        df = df.replace(np.inf, fill_divbyzero)

        # a 0/0 division yields NaN; in that case the groups are identical, so
        # a disparity of 1.0 is assumed

        fill_zeros = {metric: 1.000000 for metric in disparity_metrics}
        # df = df.fillna(value=fill_zeros)

        # by default the same reference groups as in df are used; compiling
        # ref_groups_dict from a user-supplied reference group per measure is
        # not yet supported
        check_significance = df_cols.intersection(check_significance).tolist()

        ref_groups_dict = assemble_ref_groups(
            df,
            ref_group_flag='_ref_group_value',
            specific_measures=check_significance,
            label_score_ref=label_score_ref)

        attr_cols = df['attribute_name'].unique()
        for attribute in attr_cols:
            largest_group = df_major_group.loc[
                df_major_group['attribute_name'] == attribute,
                'attribute_value'].values.tolist()[0]
            ref_groups_dict[attribute]['label_value'] = largest_group
            ref_groups_dict[attribute]['score'] = largest_group

        # run significance method on bias-augmented crosstab based on false
        # positives, false negatives, scores, and label values in original df
        self.get_statistical_significance(original_df,
                                          df,
                                          ref_dict=ref_groups_dict,
                                          score_thresholds=None,
                                          model_id=1,
                                          attr_cols=attr_cols,
                                          alpha=alpha)

        # if specified, apply T/F mask to significance columns
        if mask_significance:
            significance_cols = df.columns[df.columns.str.contains(
                '_significance')]
            truemask = df.loc[:, significance_cols] < alpha
            falsemask = df.loc[:, significance_cols] >= alpha

            df.loc[:, significance_cols] = np.select([truemask, falsemask],
                                                     [True, False],
                                                     default=None)

        # check what new disparity columns are and order as disparity,
        # ref_group, significance for each
        new_cols = sorted(
            list(
                set(df.columns) - set(df_cols) -
                {'label_value_significance', 'score_significance'}))
        return df[df_cols.tolist() +
                  ['label_value_significance', 'score_significance'] +
                  new_cols]
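
The divide-by-zero handling shared by both variants can also be shown in isolation. The toy values below are hypothetical: a non-zero metric divided by a zero reference yields inf, which is capped at fill_divbyzero, while 0/0 yields NaN, which the (currently commented-out) fillna step would map to a neutral disparity of 1.0.

import numpy as np
import pandas as pd

fill_divbyzero = 10.0
df = pd.DataFrame({'fpr': [0.2, 0.0, 0.0],
                   'fpr_ref': [0.0, 0.0, 0.1]})

df['fpr_disparity'] = df['fpr'] / df['fpr_ref']   # inf, NaN, 0.0
df = df.replace(np.inf, fill_divbyzero)           # cap x / 0 at fill_divbyzero
df = df.fillna({'fpr_disparity': 1.0})            # treat 0 / 0 as parity (1.0)
print(df)
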