Ejemplo n.º 1
0
def bias_grid(Y):
    """Draw one heatmap of per-group rates for each pair of adjacent columns.

    For every column ``stage`` after the first, the rows of ``Y`` for which
    the preceding column ``prev`` is true are selected, ``base_rate`` is
    applied to ``stage`` within each ('race', 'gender') group, and the
    unstacked result is drawn as an annotated heatmap whose color scale is
    centered on ``base_rate`` of the whole selected subset.

    Args:
        Y (pandas.DataFrame): Each column is used as a boolean row mask, and
            the index must have levels named 'race' and 'gender'.
            NOTE(review): the columns presumably represent successive
            stages of a pipeline -- confirm against callers.

    Returns:
        The matplotlib figure holding one subplot per stage transition. The
        figure is closed in pyplot before it is returned (presumably so an
        interactive backend does not render it twice -- confirm).
    """
    num_stages = Y.columns.size - 1
    # squeeze=False always yields a 2-D array of axes. With squeezing, a
    # single stage would return a bare Axes object, which cannot be zipped.
    fig, axes = plt.subplots(1, num_stages, figsize=(2 + 4*num_stages, 12),
                             squeeze=False, sharey=True)
    # zip() stops at the shortest input, so ``prev`` never reaches the last
    # column and every (prev, stage) pair is a pair of adjacent columns.
    for ax, stage, prev in zip(axes[0], Y.columns[1:], Y.columns):
        # Outcomes at ``stage`` restricted to rows that passed ``prev``.
        passed = Y[Y[prev]][stage]
        rates = passed.groupby(level=['race', 'gender']).apply(base_rate)
        sns.heatmap(rates.unstack(), annot=True, fmt='.1%', cmap='RdBu',
                    center=base_rate(passed), robust=True,
                    cbar=False, square=True, ax=ax)
        ax.set_title(f'{prev} -> {stage}')
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)
    return fig
Ejemplo n.º 2
0
 def _weighted_cost(self, y_true, probas_pred, pos_label=1,
                    sample_weight=None):
     """Return the cost selected by ``self.cost_constraint``.

     The generalized false positive rate, the generalized false negative
     rate and the base rate are always computed from the given arguments.
     Which value is returned depends on ``self.cost_constraint``:

     * ``'fpr'``: the generalized false positive rate.
     * ``'fnr'``: the generalized false negative rate.
     * ``'weighted'``: ``fpr * (1 - base_rate) + fnr * base_rate``.

     Raises:
         ValueError: If ``self.cost_constraint`` is none of the above.
     """
     metric_args = (y_true, probas_pred, pos_label, sample_weight)
     false_pos = generalized_fpr(*metric_args)
     false_neg = generalized_fnr(*metric_args)
     prevalence = base_rate(*metric_args)

     constraint = self.cost_constraint
     if constraint == 'fpr':
         return false_pos
     if constraint == 'fnr':
         return false_neg
     if constraint != 'weighted':
         raise ValueError("`cost_constraint` must be one of: 'fpr', 'fnr', "
                          "or 'weighted'")
     # Each error rate is weighted by the share of the class it applies to.
     return false_pos * (1 - prevalence) + false_neg * prevalence
Ejemplo n.º 3
0
    def fit(self, X, y, labels=None, pos_label=1, sample_weight=None):
        """Compute the mixing rates required to satisfy the cost constraint.

        Args:
            X (array-like): Probability estimates of the targets as returned by
                a ``predict_proba()`` call or equivalent.
            y (pandas.Series): Ground-truth (correct) target values.
            labels (list, optional): The ordered set of labels values. Must
                match the order of columns in X if provided. By default,
                all labels in y are used in sorted order.
            pos_label (scalar, optional): The label of the positive class.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            self

        Raises:
            ValueError: If more than two classes are present or if
                ``pos_label`` is not one of the classes.
        """
        X, y, sample_weight = check_inputs(X, y, sample_weight)
        groups, self.prot_attr_ = check_groups(y, self.prot_attr,
                                               ensure_binary=True)
        # Always store an ndarray: comparing a plain list with ``pos_label``
        # further down would produce a single bool instead of an element-wise
        # mask, and the positive-class column lookup would fail.
        self.classes_ = (np.asarray(labels) if labels is not None
                         else np.unique(y))
        self.groups_ = np.unique(groups)
        self.pos_label_ = pos_label

        if len(self.classes_) > 2:
            raise ValueError('Only binary classification is supported.')

        if pos_label not in self.classes_:
            raise ValueError('pos_label={} is not in the set of labels. The '
                    'valid values are:\n{}'.format(pos_label, self.classes_))

        # Keep only the column of X that belongs to the positive class.
        pos_col = np.nonzero(self.classes_ == self.pos_label_)[0][0]
        X = X[:, pos_col]

        # local function to return corresponding args for metric evaluation
        def _args(grp_idx, triv=False):
            idx = (groups == self.groups_[grp_idx])
            # The trivial predictor outputs the group's base rate for every
            # sample; ``self.base_rates_`` must be set before ``triv=True``.
            pred = np.full_like(X, self.base_rates_[grp_idx]) if triv else X
            return [y[idx], pred[idx], pos_label, sample_weight[idx]]

        # Two groups are assumed throughout; ``check_groups`` is called with
        # ``ensure_binary=True`` above.
        self.base_rates_ = [base_rate(*_args(i)) for i in range(2)]

        costs = [self._weighted_cost(*_args(i)) for i in range(2)]
        # For each group: (other group's cost - own cost) divided by
        # (trivial predictor's cost - own cost).
        self.mix_rates_ = [(costs[1] - costs[0])
                         / (self._weighted_cost(*_args(0, True)) - costs[0]),
                           (costs[0] - costs[1])
                         / (self._weighted_cost(*_args(1, True)) - costs[1])]
        # The group that already has the higher cost is not mixed at all.
        self.mix_rates_[np.argmax(costs)] = 0

        return self
Ejemplo n.º 4
0
def test_base_rate():
    """Check that the functional ``base_rate`` equals ``cm.base_rate()``.

    The comparison is an exact equality on purpose: the new implementation
    is expected to reproduce the old value with no tolerance.
    """
    new_value = base_rate(y, y_pred, sample_weight=sample_weight)
    old_value = cm.base_rate()
    assert new_value == old_value