Example #1
0
    def predict(self, dataset):
        """Perturb the predicted labels to obtain new labels that satisfy
        equalized odds constraints.

        Labels are flipped at random within each protected group, using the
        probabilities stored in ``self.model_params``. Instances that belong
        to neither group receive a label of 0 in the returned dataset.

        Args:
            dataset (BinaryLabelDataset): Dataset containing labels that needs
                to be transformed.

        Returns:
            dataset (BinaryLabelDataset): Transformed dataset.
        """
        if self.seed is not None:
            np.random.seed(self.seed)

        # Solver output, unpacked as (positive stays positive, negative
        # becomes positive) for the privileged group, then the same pair for
        # the unprivileged group.
        sp2p, sn2p, op2p, on2p = self.model_params.x

        # Boolean membership masks for the two protected groups
        priv_mask = utils.compute_boolean_conditioning_vector(
            dataset.protected_attributes, dataset.protected_attribute_names,
            self.privileged_groups)
        unpriv_mask = utils.compute_boolean_conditioning_vector(
            dataset.protected_attributes, dataset.protected_attribute_names,
            self.unprivileged_groups)

        favorable = dataset.favorable_label
        unfavorable = dataset.unfavorable_label

        def flipped_labels(group_mask, p2p, n2p):
            # Return one group's labels with a random subset of them flipped.
            group_labels = dataset.labels[group_mask]
            fair_pred = group_labels.copy()
            pos_rows, _ = np.nonzero(group_labels == favorable)
            neg_rows, _ = np.nonzero(group_labels == unfavorable)
            # Shuffling makes the prefix taken below a random subset.
            np.random.shuffle(pos_rows)
            np.random.shuffle(neg_rows)

            neg_to_pos = neg_rows[:int(len(neg_rows) * n2p)]
            fair_pred[neg_to_pos] = favorable
            pos_to_neg = pos_rows[:int(len(pos_rows) * (1 - p2p))]
            fair_pred[pos_to_neg] = unfavorable
            return fair_pred

        # Privileged group first, so the random stream is consumed in a
        # fixed order.
        priv_fair_pred = flipped_labels(priv_mask, sp2p, sn2p)
        unpriv_fair_pred = flipped_labels(unpriv_mask, op2p, on2p)

        # Copy of the input carrying the new labels
        result = dataset.copy()

        relabeled = np.zeros_like(dataset.labels, dtype=np.float64)
        relabeled[priv_mask] = priv_fair_pred
        relabeled[unpriv_mask] = unpriv_fair_pred
        result.labels = relabeled

        return result
    def __init__(self,
                 dataset,
                 unprivileged_groups=None,
                 privileged_groups=None):
        """
        Args:
            dataset (StructuredDataset): A StructuredDataset.
            privileged_groups (list(dict)): Privileged groups. Format is a list
                of `dicts` where the keys are `protected_attribute_names` and
                the values are values in `protected_attributes`. Each `dict`
                element describes a single group. See examples for more details.
            unprivileged_groups (list(dict)): Unprivileged groups in the same
                format as `privileged_groups`.

        Raises:
            TypeError: `dataset` must be a
                :obj:`~aiflearn.datasets.StructuredDataset` type.
            ValueError: `privileged_groups` and `unprivileged_groups` must be
                disjoint.

        A warning is emitted (no exception) when some instances fall in
        neither group. Both checks are skipped when either group list is
        empty or ``None``.

        Examples:
            >>> from aiflearn.datasets import GermanDataset
            >>> german = GermanDataset()
            >>> u = [{'sex': 1, 'age': 1}, {'sex': 0}]
            >>> p = [{'sex': 1, 'age': 0}]
            >>> dm = DatasetMetric(german, unprivileged_groups=u, privileged_groups=p)
        """
        if not isinstance(dataset, StructuredDataset):
            raise TypeError("'dataset' should be a StructuredDataset")

        # sets self.dataset
        super(DatasetMetric, self).__init__(dataset)

        # TODO: should this deepcopy?
        self.privileged_groups = privileged_groups
        self.unprivileged_groups = unprivileged_groups

        # Nothing to validate unless both group lists were supplied.
        if not (self.privileged_groups and self.unprivileged_groups):
            return

        attributes = self.dataset.protected_attributes
        attribute_names = self.dataset.protected_attribute_names
        in_priv = utils.compute_boolean_conditioning_vector(
            attributes, attribute_names, self.privileged_groups)
        in_unpriv = utils.compute_boolean_conditioning_vector(
            attributes, attribute_names, self.unprivileged_groups)

        # An instance matched by both masks means the groups overlap.
        if np.logical_and(in_priv, in_unpriv).any():
            raise ValueError(
                "'privileged_groups' and 'unprivileged_groups' must be "
                "disjoint.")
        # An instance matched by neither mask is allowed but suspicious.
        if not np.logical_or(in_priv, in_unpriv).all():
            warning("There are some instances in the dataset which are not "
                    "designated as either privileged or unprivileged. Are "
                    "you sure this is right?")
    def _between_group_generalized_entropy_index(self, groups, alpha=2):
        r"""Between-group generalized entropy index is proposed as a group
        fairness measure in [2]_ and is one of two terms that the generalized
        entropy index decomposes to.

        Every member of a group is assigned that group's mean benefit, where
        an individual benefit is ``1 + (prediction is favorable) - (true label
        is favorable)``. The index is then computed over this per-instance
        benefit vector. Groups with no members in the dataset are skipped and
        their entries stay at 0.

        Args:
            groups (list): A list of groups over which to calculate this metric.
                Groups should be disjoint.
            alpha (int): See :meth:`generalized_entropy_index`.

        References:
            .. [2] T. Speicher, H. Heidari, N. Grgic-Hlaca, K. P. Gummadi, A. Singla, A. Weller, and M. B. Zafar,
               "A Unified Approach to Quantifying Algorithmic Unfairness: Measuring Individual and Group Unfairness via Inequality Indices,"
               ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 2018.
        """
        true_ds = self.dataset
        pred_ds = self.classified_dataset
        benefits = np.zeros(true_ds.labels.size, dtype=np.float64)

        for group in groups:
            pred_mask = utils.compute_boolean_conditioning_vector(
                pred_ds.protected_attributes,
                pred_ds.protected_attribute_names,
                condition=group)
            true_mask = utils.compute_boolean_conditioning_vector(
                true_ds.protected_attributes,
                true_ds.protected_attribute_names,
                condition=group)
            # a group without members contributes nothing
            if not np.any(true_mask):
                continue

            pred_is_fav = (pred_ds.labels[pred_mask].ravel() ==
                           pred_ds.favorable_label).astype(np.float64)
            true_is_fav = (true_ds.labels[true_mask].ravel() ==
                           true_ds.favorable_label).astype(np.float64)
            # the scalar group mean is broadcast to every member
            benefits[true_mask] = np.mean(1 + pred_is_fav - true_is_fav)

        mean_benefit = np.mean(benefits)
        relative = benefits / mean_benefit

        if alpha == 1:
            return np.mean(np.log(relative**benefits) / mean_benefit)
        if alpha == 0:
            return -np.mean(np.log(relative) / mean_benefit)
        return np.mean(relative**alpha - 1) / (alpha * (alpha - 1))
    def predict(self, dataset):
        """Obtain fair predictions using the ROC method.

        Scores are first thresholded at ``self.classification_threshold``.
        Instances whose score lies within ``self.ROC_margin`` of the
        threshold are then relabeled: privileged instances get the unfavorable
        label and unprivileged instances get the favorable label.

        Args:
            dataset (BinaryLabelDataset): Dataset containing scores that will
                be used to compute predicted labels.

        Returns:
            dataset_pred (BinaryLabelDataset): Output dataset with potentially
            fair predictions obtained using the ROC method.
        """
        result = dataset.copy(deepcopy=False)

        scores = dataset.scores
        threshold = self.classification_threshold
        margin = self.ROC_margin

        # Plain thresholded predictions: unfavorable unless the score is
        # strictly above the threshold.
        base_labels = np.full(scores.shape, dataset.unfavorable_label,
                              dtype=np.float64)
        base_labels[scores > threshold] = dataset.favorable_label

        # Critical region: (threshold - margin, threshold + margin]
        in_critical_region = np.logical_and(scores <= threshold + margin,
                                            scores > threshold - margin)

        # Membership masks for privileged and unprivileged groups, reshaped
        # to columns so they combine with the critical-region mask.
        priv_column = utils.compute_boolean_conditioning_vector(
            dataset.protected_attributes, dataset.protected_attribute_names,
            self.privileged_groups).reshape(-1, 1)
        unpriv_column = utils.compute_boolean_conditioning_vector(
            dataset.protected_attributes, dataset.protected_attribute_names,
            self.unprivileged_groups).reshape(-1, 1)

        # New, fairer labels. The privileged override is applied first, so an
        # instance matched by both masks ends up favorable.
        result.labels = base_labels
        demote = np.logical_and(in_critical_region, priv_column)
        result.labels[demote] = dataset.unfavorable_label
        promote = np.logical_and(in_critical_region, unpriv_column)
        result.labels[promote] = dataset.favorable_label

        return result
Example #5
0
    def predict(self, dataset):
        """Perturb the predicted scores to obtain new labels that satisfy
        equalized odds constraints, while preserving calibration.

        Within each protected group, every instance independently has its
        score replaced by the group's base rate with probability given by the
        group's mix rate. Labels are then derived by comparing the new scores
        with ``self.threshold``. Instances that belong to neither group get a
        score of 0.

        Args:
            dataset (BinaryLabelDataset): Dataset containing `scores` that needs
                to be transformed.

        Returns:
            dataset (BinaryLabelDataset): transformed dataset.
        """
        if self.seed is not None:
            np.random.seed(self.seed)

        cond_vec_priv = utils.compute_boolean_conditioning_vector(
            dataset.protected_attributes, dataset.protected_attribute_names,
            self.privileged_groups)
        cond_vec_unpriv = utils.compute_boolean_conditioning_vector(
            dataset.protected_attributes, dataset.protected_attribute_names,
            self.unprivileged_groups)

        # np.count_nonzero counts the True entries in native code; the builtin
        # sum() used previously walked the mask element by element.
        n_priv = np.count_nonzero(cond_vec_priv)
        n_unpriv = np.count_nonzero(cond_vec_unpriv)

        # One uniform draw per privileged instance; draws at or below the mix
        # rate select the instances whose score becomes the base rate.
        priv_indices = np.random.random(n_priv) <= self.priv_mix_rate
        priv_new_pred = dataset.scores[cond_vec_priv].copy()
        priv_new_pred[priv_indices] = self.base_rate_priv

        # Same procedure for the unprivileged group, drawn second so the
        # random stream is consumed in a fixed order.
        unpriv_indices = np.random.random(n_unpriv) <= self.unpriv_mix_rate
        unpriv_new_pred = dataset.scores[cond_vec_unpriv].copy()
        unpriv_new_pred[unpriv_indices] = self.base_rate_unpriv

        dataset_new = dataset.copy(deepcopy=True)

        dataset_new.scores = np.zeros_like(dataset.scores, dtype=np.float64)
        dataset_new.scores[cond_vec_priv] = priv_new_pred
        dataset_new.scores[cond_vec_unpriv] = unpriv_new_pred

        # Create labels from scores using a default threshold
        dataset_new.labels = np.where(dataset_new.scores >= self.threshold,
                                      dataset_new.favorable_label,
                                      dataset_new.unfavorable_label)
        return dataset_new
Example #6
0
    def _obtain_conditionings(self, dataset):
        """Build the boolean masks needed to compute instance level weights.

        Returns:
            tuple: Eight boolean vectors, in this order: privileged,
            unprivileged, favorable label, unfavorable label,
            privileged & favorable, privileged & unfavorable,
            unprivileged & favorable, unprivileged & unfavorable.
        """
        attributes = dataset.protected_attributes
        attribute_names = dataset.protected_attribute_names

        # group membership
        is_priv = utils.compute_boolean_conditioning_vector(
            attributes, attribute_names, condition=self.privileged_groups)
        is_unpriv = utils.compute_boolean_conditioning_vector(
            attributes, attribute_names, condition=self.unprivileged_groups)

        # label value, on the flattened label vector
        flat_labels = dataset.labels.ravel()
        is_fav = flat_labels == dataset.favorable_label
        is_unfav = flat_labels == dataset.unfavorable_label

        # every (group, label) combination: group varies slowest, so the
        # order is p/fav, p/unfav, up/fav, up/unfav
        joint = [np.logical_and(label_mask, group_mask)
                 for group_mask in (is_priv, is_unpriv)
                 for label_mask in (is_fav, is_unfav)]

        return (is_priv, is_unpriv, is_fav, is_unfav) + tuple(joint)
Example #7
0
    def fit(self, dataset_true, dataset_pred):
        """Compute parameters for equalizing odds using true and predicted
        labels.

        A linear program is solved for four probabilities of outputting the
        favorable label (one per combination of predicted label and protected
        group), subject to the equalized odds equality constraints. The solver
        result is stored in ``self.model_params``.

        Args:
            dataset_true (BinaryLabelDataset): Dataset containing true labels.
            dataset_pred (BinaryLabelDataset): Dataset containing predicted
                labels.

        Returns:
            EqOddsPostprocessing: Returns self.
        """
        metric = ClassificationMetric(
            dataset_true,
            dataset_pred,
            unprivileged_groups=self.unprivileged_groups,
            privileged_groups=self.privileged_groups)

        # Base rate of the favorable (true) label within each group. These
        # are the divisors that turn the joint frequencies in A_eq / b_eq
        # into rates conditional on the true label: dividing by sbr gives a
        # true-positive-style rate, dividing by (1 - sbr) a false-positive-
        # style rate. The group's share of the dataset, used previously, does
        # not have that meaning.
        sbr = metric.base_rate(privileged=True)
        obr = metric.base_rate(privileged=False)

        # Suffix 0 is the privileged group, suffix 1 the unprivileged group.
        fpr0 = metric.false_positive_rate(privileged=True)
        fpr1 = metric.false_positive_rate(privileged=False)
        fnr0 = metric.false_negative_rate(privileged=True)
        fnr1 = metric.false_negative_rate(privileged=False)
        tpr0 = metric.true_positive_rate(privileged=True)
        tpr1 = metric.true_positive_rate(privileged=False)
        tnr0 = metric.true_negative_rate(privileged=True)
        tnr1 = metric.true_negative_rate(privileged=False)

        # linear program has 4 decision variables:
        # [Pr[label_tilde = 1 | label_hat = 1, protected_attributes = 0];
        #  Pr[label_tilde = 1 | label_hat = 0, protected_attributes = 0];
        #  Pr[label_tilde = 1 | label_hat = 1, protected_attributes = 1];
        #  Pr[label_tilde = 1 | label_hat = 0, protected_attributes = 1]]
        # Coefficients of the linear objective function to be minimized.
        c = np.array([fpr0 - tpr0, tnr0 - fnr0, fpr1 - tpr1, tnr1 - fnr1])

        # A_ub - 2-D array which, when matrix-multiplied by x, gives the values
        # of the upper-bound inequality constraints at x
        # b_ub - 1-D array of values representing the upper-bound of each
        # inequality constraint (row) in A_ub.
        # Just to keep these between zero and one
        A_ub = np.array(
            [[1, 0, 0, 0], [-1, 0, 0, 0], [0, 1, 0, 0], [0, -1, 0, 0],
             [0, 0, 1, 0], [0, 0, -1, 0], [0, 0, 0, 1], [0, 0, 0, -1]],
            dtype=np.float64)
        b_ub = np.array([1, 0, 1, 0, 1, 0, 1, 0], dtype=np.float64)

        # Create boolean conditioning vectors for protected groups
        cond_vec_priv = utils.compute_boolean_conditioning_vector(
            dataset_pred.protected_attributes,
            dataset_pred.protected_attribute_names, self.privileged_groups)
        cond_vec_unpriv = utils.compute_boolean_conditioning_vector(
            dataset_pred.protected_attributes,
            dataset_pred.protected_attribute_names, self.unprivileged_groups)

        # Per-group masks on the predicted label: "const" marks a favorable
        # prediction, "flip" an unfavorable one. The "s" prefix is the
        # privileged group and "o" the unprivileged group.
        sconst = np.ravel(
            dataset_pred.labels[cond_vec_priv] == dataset_pred.favorable_label)
        sflip = np.ravel(dataset_pred.labels[cond_vec_priv] ==
                         dataset_pred.unfavorable_label)
        oconst = np.ravel(dataset_pred.labels[cond_vec_unpriv] ==
                          dataset_pred.favorable_label)
        oflip = np.ravel(dataset_pred.labels[cond_vec_unpriv] ==
                         dataset_pred.unfavorable_label)

        y_true = dataset_true.labels.ravel()
        priv_true = y_true[cond_vec_priv]
        unpriv_true = y_true[cond_vec_unpriv]

        # Confusion-matrix membership masks per group (prediction AND truth).
        # The masks are boolean; no dtype is passed to logical_and because its
        # result is boolean regardless.
        sm_tn = np.logical_and(
            sflip, priv_true == dataset_true.unfavorable_label)
        sm_fn = np.logical_and(
            sflip, priv_true == dataset_true.favorable_label)
        sm_fp = np.logical_and(
            sconst, priv_true == dataset_true.unfavorable_label)
        sm_tp = np.logical_and(
            sconst, priv_true == dataset_true.favorable_label)

        om_tn = np.logical_and(
            oflip, unpriv_true == dataset_true.unfavorable_label)
        om_fn = np.logical_and(
            oflip, unpriv_true == dataset_true.favorable_label)
        om_fp = np.logical_and(
            oconst, unpriv_true == dataset_true.unfavorable_label)
        om_tp = np.logical_and(
            oconst, unpriv_true == dataset_true.favorable_label)

        # A_eq - 2-D array which, when matrix-multiplied by x,
        # gives the values of the equality constraints at x
        # b_eq - 1-D array of values representing the RHS of each equality
        # constraint (row) in A_eq.
        # Used to impose equality of odds constraint
        # Row 0 is normalized by the favorable base rates, row 1 by the
        # unfavorable base rates.
        A_eq = [
            [(np.mean(sconst * sm_tp) - np.mean(sflip * sm_tp)) / sbr,
             (np.mean(sflip * sm_fn) - np.mean(sconst * sm_fn)) / sbr,
             (np.mean(oflip * om_tp) - np.mean(oconst * om_tp)) / obr,
             (np.mean(oconst * om_fn) - np.mean(oflip * om_fn)) / obr],
            [(np.mean(sconst * sm_fp) - np.mean(sflip * sm_fp)) / (1 - sbr),
             (np.mean(sflip * sm_tn) - np.mean(sconst * sm_tn)) / (1 - sbr),
             (np.mean(oflip * om_fp) - np.mean(oconst * om_fp)) / (1 - obr),
             (np.mean(oconst * om_tn) - np.mean(oflip * om_tn)) / (1 - obr)]
        ]

        b_eq = [
            (np.mean(oflip * om_tp) + np.mean(oconst * om_fn)) / obr -
            (np.mean(sflip * sm_tp) + np.mean(sconst * sm_fn)) / sbr,
            (np.mean(oflip * om_fp) + np.mean(oconst * om_tn)) / (1 - obr) -
            (np.mean(sflip * sm_fp) + np.mean(sconst * sm_tn)) / (1 - sbr)
        ]

        # Linear program
        self.model_params = linprog(c,
                                    A_ub=A_ub,
                                    b_ub=b_ub,
                                    A_eq=A_eq,
                                    b_eq=b_eq)

        return self
Example #8
0
    def fit(self, dataset_true, dataset_pred):
        """Compute parameters for equalizing generalized odds using true and
        predicted scores, while preserving calibration.

        Stores the per-group base rates and the per-group mix rates on the
        instance. Only the group with the lower cost receives a non-zero mix
        rate; the other group's mix rate is 0.

        Args:
            dataset_true (BinaryLabelDataset): Dataset containing true `labels`.
            dataset_pred (BinaryLabelDataset): Dataset containing predicted
                `scores`.

        Returns:
            CalibratedEqOddsPostprocessing: Returns self.
        """
        # Boolean membership masks for the protected groups
        priv_mask = utils.compute_boolean_conditioning_vector(
            dataset_pred.protected_attributes,
            dataset_pred.protected_attribute_names, self.privileged_groups)
        unpriv_mask = utils.compute_boolean_conditioning_vector(
            dataset_pred.protected_attributes,
            dataset_pred.protected_attribute_names, self.unprivileged_groups)

        group_kwargs = dict(unprivileged_groups=self.unprivileged_groups,
                            privileged_groups=self.privileged_groups)

        cm = ClassificationMetric(dataset_true, dataset_pred, **group_kwargs)
        self.base_rate_priv = cm.base_rate(privileged=True)
        self.base_rate_unpriv = cm.base_rate(privileged=False)

        # "Trivial" predictor: every score is its group's base rate
        dataset_trivial = dataset_pred.copy(deepcopy=True)
        dataset_trivial.scores[priv_mask] = cm.base_rate(privileged=True)
        dataset_trivial.scores[unpriv_mask] = cm.base_rate(privileged=False)
        cm_triv = ClassificationMetric(dataset_true, dataset_trivial,
                                       **group_kwargs)

        # Choose the cost measure once; it is then evaluated for both metrics
        # and both groups.
        if self.fn_rate == 0:
            def cost(metric, privileged):
                return metric.generalized_false_positive_rate(
                    privileged=privileged)
        elif self.fp_rate == 0:
            def cost(metric, privileged):
                return metric.generalized_false_negative_rate(
                    privileged=privileged)
        else:
            def cost(metric, privileged):
                return weighted_cost(self.fp_rate,
                                     self.fn_rate,
                                     metric,
                                     privileged=privileged)

        priv_cost = cost(cm, True)
        unpriv_cost = cost(cm, False)
        priv_trivial_cost = cost(cm_triv, True)
        unpriv_trivial_cost = cost(cm_triv, False)

        # The cheaper group is mixed towards its trivial predictor.
        if unpriv_cost > priv_cost:
            self.priv_mix_rate = (unpriv_cost - priv_cost) / (
                priv_trivial_cost - priv_cost)
            self.unpriv_mix_rate = 0
        else:
            self.priv_mix_rate = 0
            self.unpriv_mix_rate = (priv_cost - unpriv_cost) / (
                unpriv_trivial_cost - unpriv_cost)

        return self