Example #1
 def fit(self, x, y, **kwargs):
     _, y_train, sensitive_features = _validate_and_reformat_input(
         x, y, enforce_binary_labels=False, **kwargs)
     if self.loss == "square":  # squared loss reweighting
         X, A, Y, W = augment.augment_data_sq(x, sensitive_features,
                                              y_train, self.Theta)
     elif self.loss == "absolute":  # absolute loss reweighting (uniform)
         X, A, Y, W = augment.augment_data_ab(x, sensitive_features,
                                              y_train, self.Theta)
     elif self.loss == "logistic":  # logisitic reweighting
         X, A, Y, W = augment.augment_data_logistic(x, sensitive_features,
                                                    y_train, self.Theta)
     else:
         raise Exception('Loss not supported: ' + str(self.loss))
     if self.constraints == "DP":  # DP constraint
         self.constraints = DemographicParity_Theta()
         self.expgrad = ExponentiatedGradient(self.estimator,
                                              self.constraints,
                                              self.eps,
                                              error_weights=W)
         self.expgrad.fit(X, Y, sensitive_features=A)
         self.weights_ = self.expgrad.weights_
         self.best_classifier = lambda X: _mean_pred(
             X, self.expgrad._hs, self.expgrad.weights_)
         self._hs = self.expgrad._hs
         self.predictors_ = self.expgrad.predictors_
         self.best_gap_ = self.expgrad.best_gap_
         self.last_iter_ = self.expgrad.last_iter_
         self.best_iter_ = self.expgrad.best_iter_
         self.n_oracle_calls_ = self.expgrad.n_oracle_calls_
         self.n_classifiers = len(self._hs)
     else:  # exception
         raise Exception('Constraint not supported: ' + str(self.constraints))
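
The best_classifier built above is a weighted average over the classifiers returned by the exponentiated-gradient run (assuming _mean_pred computes a weights_-weighted mean of the individual predictions, which is how it is used here). A toy numeric sketch of that ensemble step, with made-up predictions and weights:

# Toy sketch of the weighted ensemble prediction performed by best_classifier.
import numpy as np

preds_per_classifier = np.array([[0.0, 1.0, 1.0],    # predictions of classifier h0 on 3 samples
                                 [1.0, 1.0, 0.0]])   # predictions of classifier h1
weights = np.array([0.25, 0.75])                     # weights_ from the exponentiated gradient
ensemble_pred = weights @ preds_per_classifier       # -> [0.75, 1.0, 0.25]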
Example #2
    def _pmf_predict(self, X, *, sensitive_features):
        """Probabilistic mass function.

        :param X: Feature matrix
        :type X: numpy.ndarray or pandas.DataFrame
        :param sensitive_features: Sensitive features to identify groups by, currently allows
            only a single column
        :type sensitive_features: Currently 1D array as numpy.ndarray, list, pandas.DataFrame,
            or pandas.Series
        :return: array of tuples with probabilities for predicting 0 or 1, respectively. The two
            probabilities in each tuple sum to 1.
        :rtype: numpy.ndarray
        """
        check_is_fitted(self)
        base_predictions = np.array(self.estimator_.predict(X))
        _, base_predictions_vector, sensitive_feature_vector = _validate_and_reformat_input(
            X,
            y=base_predictions,
            sensitive_features=sensitive_features,
            expect_y=True,
            enforce_binary_labels=False)

        positive_probs = 0.0 * base_predictions_vector
        for a, interpolation in self.interpolation_dict.items():
            interpolated_predictions = \
                interpolation.p0 * interpolation.operation0(base_predictions_vector) + \
                interpolation.p1 * interpolation.operation1(base_predictions_vector)
            if 'p_ignore' in interpolation:
                interpolated_predictions = \
                    interpolation.p_ignore * interpolation.prediction_constant + \
                    (1 - interpolation.p_ignore) * interpolated_predictions
            positive_probs[sensitive_feature_vector == a] = \
                interpolated_predictions[sensitive_feature_vector == a]
        return np.array([1.0 - positive_probs, positive_probs]).transpose()
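
The per-group interpolation mixes two thresholding operations with weights p0 and p1, and optionally blends in a constant prediction with weight p_ignore. A toy numeric sketch (all numbers made up):

# Toy illustration of the interpolation arithmetic in _pmf_predict (numbers are made up).
import numpy as np

base = np.array([0.2, 0.7, 0.9])          # stand-in for base_predictions_vector of one group
p0, p1 = 0.3, 0.7                         # interpolation weights
op0 = lambda s: (s >= 0.5).astype(float)  # e.g. "predict 1 if score >= 0.5"
op1 = lambda s: (s >= 0.8).astype(float)  # e.g. "predict 1 if score >= 0.8"

interp = p0 * op0(base) + p1 * op1(base)                            # -> [0.0, 0.3, 1.0]
p_ignore, prediction_constant = 0.1, 0.5
blended = p_ignore * prediction_constant + (1 - p_ignore) * interp  # -> [0.05, 0.32, 0.95]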
Example #3
    def load_data(self, X, y, *, sensitive_features):
        """Load data into the moment object."""
        X_train, y_train, sf_train, _ = \
            _validate_and_reformat_input(X, y,
                                         enforce_binary_labels=False,
                                         sensitive_features=sensitive_features)
        if self.no_groups:
            sf_train = y_train.apply(lambda v: _ALL)

        # The following uses X and not X_train so that the estimators get X untouched
        super().load_data(X, y_train, sensitive_features=sf_train)
        self.prob_attr = self.tags.groupby(_GROUP_ID).size() / self.total_samples
        self.index = self.prob_attr.index
        self.default_objective_lambda_vec = self.prob_attr

        # fill in the information about the basis
        attr_vals = self.tags[_GROUP_ID].unique()
        self.pos_basis = pd.DataFrame()
        self.neg_basis = pd.DataFrame()
        self.neg_basis_present = pd.Series(dtype='float64')
        zero_vec = pd.Series(0.0, self.index)
        i = 0
        for attr in attr_vals:
            self.pos_basis[i] = 0 + zero_vec
            self.neg_basis[i] = 0 + zero_vec
            self.pos_basis[i][attr] = 1
            self.neg_basis_present.at[i] = False
            i += 1
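
The loop builds one indicator column per sensitive-feature value, and prob_attr holds the empirical group frequencies. A rough sketch of the same computation on a toy group-id column (the column name is a stand-in for _GROUP_ID):

# Rough sketch of the prob_attr / basis construction on toy data.
import pandas as pd

tags = pd.DataFrame({"group_id": ["a", "a", "b", "a", "b"]})
total_samples = len(tags)
prob_attr = tags.groupby("group_id").size() / total_samples   # a -> 0.6, b -> 0.4

pos_basis = pd.DataFrame()
zero_vec = pd.Series(0.0, index=prob_attr.index)
for i, attr in enumerate(tags["group_id"].unique()):
    pos_basis[i] = 0 + zero_vec
    pos_basis.loc[attr, i] = 1      # same effect as pos_basis[i][attr] = 1 above
print(pos_basis)                    # one indicator column per group value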
Example #4
    def predict(self, X, *, sensitive_features, random_state=None):
        """Predict label for each sample in X while taking into account sensitive features.

        :param X: feature matrix
        :type X: numpy.ndarray or pandas.DataFrame
        :param sensitive_features: sensitive features to identify groups by, currently allows
            only a single column
        :type sensitive_features: currently 1D array as numpy.ndarray, list, pandas.DataFrame,
            or pandas.Series
        :param random_state: set to a constant for reproducibility
        :type random_state: int
        :return: predictions in numpy.ndarray
        """
        if random_state is not None:
            # seed NumPy's generator, since the randomized rounding below uses np.random.rand
            np.random.seed(random_state)

        self._validate_post_processed_predictor_is_fitted()
        _, _, sensitive_feature_vector = _validate_and_reformat_input(
            X,
            y=None,
            sensitive_features=sensitive_features,
            expect_y=False,
            enforce_binary_labels=True)
        unconstrained_predictions = self._unconstrained_predictor.predict(X)

        positive_probs = _vectorized_prediction(
            self._post_processed_predictor_by_sensitive_feature,
            sensitive_feature_vector, unconstrained_predictions)
        return (positive_probs >= np.random.rand(len(positive_probs))) * 1
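
The last line converts the group-specific positive probabilities into hard 0/1 labels by randomized rounding: sample i is predicted 1 with probability positive_probs[i]. A small sketch with made-up probabilities:

# Toy illustration of the randomized rounding in the return statement above.
import numpy as np

np.random.seed(0)                            # makes the draw reproducible
positive_probs = np.array([0.1, 0.5, 0.9])   # per-sample probability of predicting 1
labels = (positive_probs >= np.random.rand(len(positive_probs))) * 1
print(labels)                                # -> [0 0 1] for this seed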
Example #5
    def fit(self, X, y, *, sensitive_features, **kwargs):
        """Fit the model.

        The fit is based on training features and labels, sensitive features,
        as well as the fairness-unaware predictor or estimator. If an estimator was passed
        in the constructor this fit method will call `fit(X, y, **kwargs)` on said estimator.

        :param X: The feature matrix
        :type X: numpy.ndarray or pandas.DataFrame
        :param y: The label vector
        :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
        :param sensitive_features: sensitive features to identify groups by, currently allows
            only a single column
        :type sensitive_features: currently 1D array as numpy.ndarray, list, pandas.DataFrame,
            or pandas.Series
        """
        if self.estimator is None:
            raise ValueError(ESTIMATOR_ERROR_MESSAGE)

        if self.constraints not in _SUPPORTED_CONSTRAINTS:
            raise ValueError(NOT_SUPPORTED_CONSTRAINTS_ERROR_MESSAGE)

        _, _, sensitive_feature_vector = _validate_and_reformat_input(
            X,
            y,
            sensitive_features=sensitive_features,
            enforce_binary_labels=True)

        # postprocessing can't handle 0/1 as floating point numbers, so this converts it to int
        if type(y) in [np.ndarray, pd.DataFrame, pd.Series]:
            y = y.astype(int)
        else:
            y = [int(y_val) for y_val in y]

        if not self.prefit:
            self.estimator_ = clone(self.estimator).fit(X, y, **kwargs)
        else:
            try:
                check_is_fitted(self.estimator)
                self.estimator_ = self.estimator
            except NotFittedError:
                self.estimator_ = clone(self.estimator).fit(X, y, **kwargs)

        scores = self.estimator_.predict(X)
        threshold_optimization_method = None
        if self.constraints == DEMOGRAPHIC_PARITY:
            threshold_optimization_method = \
                self._threshold_optimization_demographic_parity
        elif self.constraints == EQUALIZED_ODDS:
            threshold_optimization_method = \
                self._threshold_optimization_equalized_odds
        else:
            raise ValueError(NOT_SUPPORTED_CONSTRAINTS_ERROR_MESSAGE)

        self._post_processed_predictor_by_sensitive_feature = threshold_optimization_method(
            sensitive_feature_vector, y, scores, self.grid_size, self.flip)
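
For context, here is a minimal end-to-end sketch of how a postprocessor with this kind of fit is typically driven. It mirrors the fairlearn ThresholdOptimizer API of this era; if your version differs, treat the constructor arguments as assumptions.

# Minimal usage sketch (toy data); argument names may differ across fairlearn versions.
import numpy as np
from sklearn.linear_model import LogisticRegression
from fairlearn.postprocessing import ThresholdOptimizer

X = np.random.rand(200, 4)
y = np.random.randint(0, 2, size=200)              # fit() enforces binary labels
sensitive = np.random.choice(["a", "b"], size=200)

postprocessor = ThresholdOptimizer(estimator=LogisticRegression(),
                                   constraints="demographic_parity",
                                   prefit=False)
postprocessor.fit(X, y, sensitive_features=sensitive)
y_adjusted = postprocessor.predict(X, sensitive_features=sensitive, random_state=0)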
Example #6
 def load_data(self, X, y, *, sensitive_features, control_features=None):
     """Load the specified data into the object."""
     _, y_train, sf_train, cf_train = \
         _validate_and_reformat_input(X, y,
                                      enforce_binary_labels=True,
                                      sensitive_features=sensitive_features,
                                      control_features=control_features)
     # The following uses X (not the reformatted version) so that the estimators get X untouched
     super().load_data(X, y_train, sensitive_features=sf_train)
     self.index = [_ALL]
Example #7
    def load_data(self, X, y, *, sensitive_features, control_features=None):
        """Load the specified data into the object."""
        _, y_train, sf_train, cf_train = \
            _validate_and_reformat_input(X, y,
                                         enforce_binary_labels=True,
                                         sensitive_features=sensitive_features,
                                         control_features=control_features)

        base_event = pd.Series(data=_ALL, index=y_train.index)
        event = _merge_event_and_control_columns(base_event, cf_train)
        super().load_data(X, y_train, event=event, sensitive_features=sf_train)
Example #8
    def load_data(self, X, y, *, sensitive_features, control_features=None):
        """Load the specified data into the object."""
        _, y_train, sf_train, cf_train = \
            _validate_and_reformat_input(X, y,
                                         enforce_binary_labels=True,
                                         sensitive_features=sensitive_features,
                                         control_features=control_features)

        base_event = y_train.apply(lambda v: _LABEL + "=" + str(v))
        event = _merge_event_and_control_columns(base_event, cf_train)
        super().load_data(X, y_train, event=event, sensitive_features=sf_train)
Example #9
    def load_data(self, X, y, *, sensitive_features, control_features=None):
        """Load the specified data into the object."""
        _, y_train, sf_train, cf_train = \
            _validate_and_reformat_input(X, y,
                                         enforce_binary_labels=True,
                                         sensitive_features=sensitive_features,
                                         control_features=control_features)

        # The `where` clause puts `np.nan` in every position where `y_train != 0`.
        base_event = y_train.apply(lambda v: _LABEL + "=" + str(v)).where(
            y_train == 0)
        event = _merge_event_and_control_columns(base_event, cf_train)
        super().load_data(X, y_train, event=event, sensitive_features=sf_train)
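
A toy illustration of what the apply + where combination produces ("label=" stands in for the _LABEL constant; values are made up):

# Toy illustration of the base_event construction above.
import pandas as pd

y_train = pd.Series([0, 1, 0, 1])
base_event = y_train.apply(lambda v: "label=" + str(v)).where(y_train == 0)
print(base_event)   # -> ["label=0", NaN, "label=0", NaN]; rows with y != 0 are masked out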
Example #10
 def fit(self, x, y, **kwargs):
     _, y_train, sensitive_features = _validate_and_reformat_input(
         x, y, enforce_binary_labels=False, **kwargs)
     if self.loss == "square":  # squared loss reweighting
         X, A, Y, W = augment.augment_data_sq(x, sensitive_features,
                                              y_train, self.Theta)
     elif self.loss == "absolute":  # absolute loss reweighting (uniform)
         X, A, Y, W = augment.augment_data_ab(x, sensitive_features,
                                              y_train, self.Theta)
     elif self.loss == "logistic":  # logisitic reweighting
         X, A, Y, W = augment.augment_data_logistic(x, sensitive_features,
                                                    y_train, self.Theta)
     else:
         raise Exception('Loss not supported: ' + str(self.loss))
     if self.constraints == "DP":  # DP constraint
         self.constraints = DemographicParity_Theta()
         self.grid_search = GridSearch(self.estimator, self.constraints,
                                       self.selection_rule,
                                       self.constraint_weight,
                                       self.grid_size, self.grid_limit,
                                       self.grid_offset, self.grid, W)
         self.grid_search.fit(X, Y, sensitive_features=A)
     else:  # exception
         raise Exception('Constraint not supported: ' + str(self.constraints))
Example #11
    def _pmf_predict(self, X, *, sensitive_features):
        """Probabilistic mass function.

        :param X: Feature matrix
        :type X: numpy.ndarray or pandas.DataFrame
        :param sensitive_features: Sensitive features to identify groups by, currently allows
            only a single column
        :type sensitive_features: Currently 1D array as numpy.ndarray, list, pandas.DataFrame,
            or pandas.Series
        :return: array of tuples with probabilities for predicting 0 or 1, respectively. The two
            probabilities in each tuple sum to 1.
        :rtype: numpy.ndarray
        """
        self._validate_post_processed_predictor_is_fitted()
        _, _, sensitive_feature_vector = _validate_and_reformat_input(
            X,
            y=None,
            sensitive_features=sensitive_features,
            expect_y=False,
            enforce_binary_labels=True)
        positive_probs = _vectorized_prediction(
            self._post_processed_predictor_by_sensitive_feature,
            sensitive_feature_vector, self._unconstrained_predictor.predict(X))
        return np.array([[1.0 - p, p] for p in positive_probs])
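
The returned array has one row per sample with the probabilities of predicting 0 and 1. Deterministic labels can be obtained by taking the per-row argmax, as in this small sketch:

# Small sketch: turning the pmf output into deterministic labels (values are made up).
import numpy as np

pmf = np.array([[0.9, 0.1],
                [0.4, 0.6]])
labels = pmf.argmax(axis=1)   # -> [0, 1]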
Example #12
    def fit(self, X, y, **kwargs):
        """Run the grid search.

        This will result in multiple copies of the
        estimator being made, and the :code:`fit(X)` method
        of each one called.

        :param X: The feature matrix
        :type X: numpy.ndarray or pandas.DataFrame

        :param y: The label vector
        :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list

        :param sensitive_features: A (currently) required keyword argument listing the
            feature used by the constraints object
        :type sensitive_features: numpy.ndarray, pandas.DataFrame, pandas.Series, or list (for now)
        """
        self.predictors_ = []
        self.lambda_vecs_ = pd.DataFrame(dtype=np.float64)
        self.objectives_ = []
        self.gammas_ = pd.DataFrame(dtype=np.float64)
        self.oracle_execution_times_ = []

        if isinstance(self.constraints, ClassificationMoment):
            logger.debug("Classification problem detected")
            is_classification_reduction = True
        else:
            logger.debug("Regression problem detected")
            is_classification_reduction = False

        _, y_train, sensitive_features_train = _validate_and_reformat_input(
            X, y, enforce_binary_labels=is_classification_reduction, **kwargs)

        kwargs[_KW_SENSITIVE_FEATURES] = sensitive_features_train

        # Prep the parity constraints and objective
        logger.debug("Preparing constraints and objective")
        self.constraints.load_data(X, y_train, **kwargs)
        objective = self.constraints.default_objective()
        objective.load_data(X, y_train, **kwargs)

        # Basis information
        pos_basis = self.constraints.pos_basis
        neg_basis = self.constraints.neg_basis
        neg_allowed = self.constraints.neg_basis_present
        objective_in_the_span = (self.constraints.default_objective_lambda_vec
                                 is not None)

        if self.grid is None:
            logger.debug("Creating grid of size %i", self.grid_size)
            grid = _GridGenerator(self.grid_size, self.grid_limit, pos_basis,
                                  neg_basis, neg_allowed,
                                  objective_in_the_span, self.grid_offset).grid
        else:
            logger.debug("Using supplied grid")
            grid = self.grid

        # Fit the estimates
        logger.debug("Setup complete. Starting grid search")
        for i in grid.columns:
            lambda_vec = grid[i]
            logger.debug("Obtaining weights")
            weights = self.constraints.signed_weights(lambda_vec)
            if not objective_in_the_span:
                weights = weights + objective.signed_weights()

            if is_classification_reduction:
                logger.debug("Applying relabelling for classification problem")
                y_reduction = 1 * (weights > 0)
                weights = weights.abs()
            else:
                y_reduction = y_train

            y_reduction_unique = np.unique(y_reduction)
            if len(y_reduction_unique) == 1:
                logger.debug(
                    "y_reduction had single value. Using DummyClassifier")
                current_estimator = DummyClassifier(
                    strategy='constant', constant=y_reduction_unique[0])
            else:
                logger.debug("Using underlying estimator")
                current_estimator = copy.deepcopy(self.estimator)

            oracle_call_start_time = time()
            current_estimator.fit(X, y_reduction, sample_weight=weights)
            oracle_call_execution_time = time() - oracle_call_start_time
            logger.debug("Call to estimator complete")

            def predict_fct(X):
                return current_estimator.predict(X)

            self.predictors_.append(current_estimator)
            self.lambda_vecs_[i] = lambda_vec
            self.objectives_.append(objective.gamma(predict_fct)[0])
            self.gammas_[i] = self.constraints.gamma(predict_fct)
            self.oracle_execution_times_.append(oracle_call_execution_time)

        logger.debug("Selecting best_result")
        if self.selection_rule == TRADEOFF_OPTIMIZATION:

            def loss_fct(i):
                return self.objective_weight * self.objectives_[i] + \
                    self.constraint_weight * self.gammas_[i].max()

            losses = [loss_fct(i) for i in range(len(self.objectives_))]
            self.best_idx_ = losses.index(min(losses))
        else:
            raise RuntimeError("Unsupported selection rule")

        return
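
The relabelling step for classification reductions turns the signed weights into 0/1 targets plus nonnegative sample weights before each oracle call. A toy numeric sketch:

# Toy numeric sketch of the relabelling step (weights are made up).
import pandas as pd

weights = pd.Series([0.8, -0.3, 0.1, -1.2])   # signed weights from constraints + objective
y_reduction = 1 * (weights > 0)               # -> [1, 0, 1, 0]; the sign becomes the label
weights = weights.abs()                       # -> [0.8, 0.3, 0.1, 1.2]; magnitude becomes sample_weight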
Example #13
    def fit(self, X, y, *, sensitive_features, **kwargs):
        """Fit the model.

        The fit is based on training features and labels, sensitive features,
        as well as the fairness-unaware predictor or estimator. If an estimator was passed
        in the constructor this fit method will call `fit(X, y, **kwargs)` on said estimator.

        :param X: The feature matrix
        :type X: numpy.ndarray or pandas.DataFrame
        :param y: The label vector
        :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
        :param sensitive_features: sensitive features to identify groups by, currently allows
            only a single column
        :type sensitive_features: currently 1D array as numpy.ndarray, list, pandas.DataFrame,
            or pandas.Series
        """
        if self.estimator is None:
            raise ValueError(BASE_ESTIMATOR_NONE_ERROR_MESSAGE)

        if self.constraints in SIMPLE_CONSTRAINTS:
            if self.objective not in OBJECTIVES_FOR_SIMPLE_CONSTRAINTS:
                raise ValueError(
                    NOT_SUPPORTED_OBJECTIVES_FOR_SIMPLE_CONSTRAINTS_ERROR_MESSAGE
                    .format(self.constraints))
        elif self.constraints == "equalized_odds":
            if self.objective not in OBJECTIVES_FOR_EQUALIZED_ODDS:
                raise ValueError(
                    NOT_SUPPORTED_OBJECTIVES_FOR_EQUALIZED_ODDS_ERROR_MESSAGE)
        else:
            raise ValueError(NOT_SUPPORTED_CONSTRAINTS_ERROR_MESSAGE)

        _, _, sensitive_feature_vector = _validate_and_reformat_input(
            X,
            y,
            sensitive_features=sensitive_features,
            enforce_binary_labels=True)

        # postprocessing can't handle 0/1 as floating point numbers, so this converts it to int
        if type(y) in [np.ndarray, pd.DataFrame, pd.Series]:
            y = y.astype(int)
        else:
            y = [int(y_val) for y_val in y]

        if not self.prefit:
            # Following is on two lines due to issue when estimator comes from TensorFlow
            self.estimator_ = clone(self.estimator)
            self.estimator_.fit(X, y, **kwargs)
        else:
            try:
                check_is_fitted(self.estimator)
            except NotFittedError:
                warn(
                    BASE_ESTIMATOR_NOT_FITTED_WARNING.format(
                        type(self).__name__))
            self.estimator_ = self.estimator

        scores = self.estimator_.predict(X)
        if self.constraints == "equalized_odds":
            self.x_metric_ = "false_positive_rate"
            self.y_metric_ = "true_positive_rate"
            threshold_optimization_method = self._threshold_optimization_for_equalized_odds
        else:
            self.x_metric_ = SIMPLE_CONSTRAINTS[self.constraints]
            self.y_metric_ = self.objective
            threshold_optimization_method = self._threshold_optimization_for_simple_constraints

        self.interpolated_thresholder_ = threshold_optimization_method(
            sensitive_feature_vector, y, scores)
        return self
Example #14
    def fit(self, X, y, **kwargs):
        """Return a fair classifier under specified fairness constraints.

        :param X: The feature matrix
        :type X: numpy.ndarray or pandas.DataFrame

        :param y: The label vector
        :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
        """
        self.lambda_vecs_EG_ = pd.DataFrame()
        self.lambda_vecs_LP_ = pd.DataFrame()
        self.lambda_vecs_ = pd.DataFrame()

        if isinstance(self.constraints, ClassificationMoment):
            logger.debug("Classification problem detected")
            is_classification_reduction = True
        else:
            logger.debug("Regression problem detected")
            is_classification_reduction = False

        _, y_train, sensitive_features = _validate_and_reformat_input(
            X, y, enforce_binary_labels=is_classification_reduction, **kwargs)

        n = y_train.shape[0]

        logger.debug("...Exponentiated Gradient STARTING")

        B = 1 / self.eps
        lagrangian = _Lagrangian(X, sensitive_features, y_train,
                                 self.estimator, self.constraints, self.eps, B)

        theta = pd.Series(0, lagrangian.constraints.index)
        Qsum = pd.Series(dtype="float64")
        gaps_EG = []
        gaps = []
        Qs = []

        last_regret_checked = _REGRET_CHECK_START_T
        last_gap = np.PINF
        for t in range(0, self.max_iter):
            logger.debug("...iter=%03d", t)

            # set lambdas for every constraint
            lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
            self.lambda_vecs_EG_[t] = lambda_vec
            lambda_EG = self.lambda_vecs_EG_.mean(axis=1)

            # select classifier according to best_h method
            h, h_idx = lagrangian.best_h(lambda_vec)

            if t == 0:
                if self.nu is None:
                    self.nu = _ACCURACY_MUL * (
                        h(X) - y_train).abs().std() / np.sqrt(n)
                eta_min = self.nu / (2 * B)
                eta = self.eta0 / B
                logger.debug(
                    "...eps=%.3f, B=%.1f, nu=%.6f, max_iter=%d, eta_min=%.6f",
                    self.eps, B, self.nu, self.max_iter, eta_min)

            if h_idx not in Qsum.index:
                Qsum.at[h_idx] = 0.0
            Qsum[h_idx] += 1.0
            gamma = lagrangian.gammas[h_idx]
            Q_EG = Qsum / Qsum.sum()
            result_EG = lagrangian.eval_gap(Q_EG, lambda_EG, self.nu)
            gap_EG = result_EG.gap()
            gaps_EG.append(gap_EG)

            if t == 0 or not self.run_linprog_step:
                gap_LP = np.PINF
            else:
                # saddle point optimization over the convex hull of
                # classifiers returned so far
                Q_LP, self.lambda_vecs_LP_[
                    t], result_LP = lagrangian.solve_linprog(self.nu)
                gap_LP = result_LP.gap()

            # keep values from exponentiated gradient or linear programming
            if gap_EG < gap_LP:
                Qs.append(Q_EG)
                gaps.append(gap_EG)
            else:
                Qs.append(Q_LP)
                gaps.append(gap_LP)

            logger.debug(
                "%seta=%.6f, L_low=%.3f, L=%.3f, L_high=%.3f, gap=%.6f, disp=%.3f, "
                "err=%.3f, gap_LP=%.6f", _INDENTATION, eta,
                result_EG.L_low, result_EG.L, result_EG.L_high, gap_EG,
                result_EG.gamma.max(), result_EG.error, gap_LP)

            if (gaps[t] < self.nu) and (t >= _MIN_ITER):
                # solution found
                break

            # update regret
            if t >= last_regret_checked * _REGRET_CHECK_INCREASE_T:
                best_gap = min(gaps_EG)

                if best_gap > last_gap * _SHRINK_REGRET:
                    eta *= _SHRINK_ETA
                last_regret_checked = t
                last_gap = best_gap

            # update theta based on learning rate
            theta += eta * (gamma - self.eps)

        # retain relevant result data
        gaps_series = pd.Series(gaps)
        gaps_best = gaps_series[gaps_series <= gaps_series.min() + _PRECISION]
        self.best_iter_ = gaps_best.index[-1]
        self.best_gap_ = gaps[self.best_iter_]
        self.weights_ = Qs[self.best_iter_]
        self._hs = lagrangian.hs
        for h_idx in self._hs.index:
            if h_idx not in self.weights_.index:
                self.weights_.at[h_idx] = 0.0

        self.last_iter_ = len(Qs) - 1
        self.predictors_ = lagrangian.predictors
        self.n_oracle_calls_ = lagrangian.n_oracle_calls
        self.n_oracle_calls_dummy_returned_ = lagrangian.n_oracle_calls_dummy_returned
        self.oracle_execution_times_ = lagrangian.oracle_execution_times
        self.lambda_vecs_ = lagrangian.lambdas

        logger.debug("...eps=%.3f, B=%.1f, nu=%.6f, max_iter=%d, eta_min=%.6f",
                     self.eps, B, self.nu, self.max_iter, eta_min)
        logger.debug(
            "...last_iter=%d, best_iter=%d, best_gap=%.6f, n_oracle_calls=%d, n_hs=%d",
            self.last_iter_, self.best_iter_, self.best_gap_,
            lagrangian.n_oracle_calls, len(lagrangian.predictors))
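
The multiplicative-weights character of the algorithm sits in two lines: lambda_vec is a softmax-like transform of theta scaled by B, and theta moves in the direction of the measured constraint violation minus eps. A toy numeric sketch of one such update (values are made up):

# Toy numeric sketch of one exponentiated-gradient update.
import numpy as np
import pandas as pd

B, eps, eta = 100.0, 0.01, 2.0
theta = pd.Series([0.0, 0.0], index=["g0", "g1"])
lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())    # -> [33.33, 33.33]
gamma = pd.Series([0.05, -0.02], index=["g0", "g1"])          # measured constraint violations
theta += eta * (gamma - eps)                                  # -> [0.08, -0.06]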
Example #15
    def fit(self, X, y, **kwargs):
        """Return a fair classifier under specified fairness constraints.

        :param X: The feature matrix
        :type X: numpy.ndarray or pandas.DataFrame

        :param y: The label vector
        :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
        """
        _, y_train, A = _validate_and_reformat_input(X, y, **kwargs)

        n = y_train.shape[0]

        logger.debug("...Exponentiated Gradient STARTING")

        B = 1 / self._eps
        lagrangian = _Lagrangian(X, A, y_train, self._estimator,
                                 self._constraints, self._eps, B)

        theta = pd.Series(0, lagrangian.constraints.index)
        Qsum = pd.Series()
        lambdas = pd.DataFrame()
        gaps_EG = []
        gaps = []
        Qs = []

        last_regret_checked = _REGRET_CHECK_START_T
        last_gap = np.PINF
        for t in range(0, self._T):
            logger.debug("...iter=%03d", t)

            # set lambdas for every constraint
            lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
            lambdas[t] = lambda_vec
            lambda_EG = lambdas.mean(axis=1)

            # select classifier according to best_h method
            h, h_idx = lagrangian.best_h(lambda_vec)
            pred_h = h(X)

            if t == 0:
                if self._nu is None:
                    self._nu = _ACCURACY_MUL * (
                        pred_h - y_train).abs().std() / np.sqrt(n)
                eta_min = self._nu / (2 * B)
                eta = self._eta_mul / B
                logger.debug(
                    "...eps=%.3f, B=%.1f, nu=%.6f, T=%d, eta_min=%.6f",
                    self._eps, B, self._nu, self._T, eta_min)

            if h_idx not in Qsum.index:
                Qsum.at[h_idx] = 0.0
            Qsum[h_idx] += 1.0
            gamma = lagrangian.gammas[h_idx]
            Q_EG = Qsum / Qsum.sum()
            result_EG = lagrangian.eval_gap(Q_EG, lambda_EG, self._nu)
            gap_EG = result_EG.gap()
            gaps_EG.append(gap_EG)

            if t == 0 or not _RUN_LP_STEP:
                gap_LP = np.PINF
            else:
                # saddle point optimization over the convex hull of
                # classifiers returned so far
                Q_LP, _, result_LP = lagrangian.solve_linprog(self._nu)
                gap_LP = result_LP.gap()

            # keep values from exponentiated gradient or linear programming
            if gap_EG < gap_LP:
                Qs.append(Q_EG)
                gaps.append(gap_EG)
            else:
                Qs.append(Q_LP)
                gaps.append(gap_LP)

            logger.debug(
                "%seta=%.6f, L_low=%.3f, L=%.3f, L_high=%.3f"
                ", gap=%.6f, disp=%.3f, err=%.3f, gap_LP=%.6f", _INDENTATION,
                eta, result_EG.L_low, result_EG.L, result_EG.L_high, gap_EG,
                result_EG.gamma.max(), result_EG.error, gap_LP)

            if (gaps[t] < self._nu) and (t >= _MIN_T):
                # solution found
                break

            # update regret
            if t >= last_regret_checked * _REGRET_CHECK_INCREASE_T:
                best_gap = min(gaps_EG)

                if best_gap > last_gap * _SHRINK_REGRET:
                    eta *= _SHRINK_ETA
                last_regret_checked = t
                last_gap = best_gap

            # update theta based on learning rate
            theta += eta * (gamma - self._eps)

        # retain relevant result data
        gaps_series = pd.Series(gaps)
        gaps_best = gaps_series[gaps_series <= gaps_series.min() + _PRECISION]
        self._best_t = gaps_best.index[-1]
        self._best_gap = gaps[self._best_t]
        self._weights = Qs[self._best_t]
        hs = lagrangian.hs
        for h_idx in hs.index:
            if h_idx not in self._weights.index:
                self._weights.at[h_idx] = 0.0

        self._last_t = len(Qs) - 1
        self._best_classifier = lambda X: _mean_pred(X, hs, self._weights)
        self._classifiers = lagrangian.classifiers
        self._n_oracle_calls = lagrangian.n_oracle_calls
        self._oracle_calls_execution_time = lagrangian.oracle_calls_execution_time

        logger.debug("...eps=%.3f, B=%.1f, nu=%.6f, T=%d, eta_min=%.6f",
                     self._eps, B, self._nu, self._T, eta_min)
        logger.debug(
            "...last_t=%d, best_t=%d, best_gap=%.6f, n_oracle_calls=%d, n_hs=%d",
            self._last_t, self._best_t, self._best_gap,
            lagrangian.n_oracle_calls, len(lagrangian.classifiers))
Example #16
    def fit(self, X, y, **kwargs):
        """Return a fair classifier under specified fairness constraints.

        :param X: The feature matrix
        :type X: numpy.ndarray or pandas.DataFrame

        :param y: The label vector
        :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
        """
        self.lambda_vecs_EG_ = pd.DataFrame()
        self.lambda_vecs_LP_ = pd.DataFrame()
        self.lambda_vecs_ = pd.DataFrame()
        # EG stands for exponentiated gradient, LP for linear programming

        if isinstance(self.constraints, ClassificationMoment):
            logger.debug("Classification problem detected")
            is_classification_reduction = True
        else:
            logger.debug("Regression problem detected")
            is_classification_reduction = False

        _, y_train, sensitive_features = _validate_and_reformat_input(
            X, y, enforce_binary_labels=is_classification_reduction, **kwargs)
        # print("X:",X)
        # print("y_train:",y_train)
        # print("sensitive_features",sensitive_features)
        # #ote that certain estimators rely on metadata encoded in X which may be stripped during the reformatting
        #process, so mitigation methods should ideally use the input X instead of the returned X
        #for training estimators and leave potential reformatting of X to the estimator.
        n = y_train.shape[0]
        if self.error_weights is None:
            self.error_weights = pd.Series(1, y_train.index)
        else:
            self.error_weights = (n * self.error_weights /
                                  self.error_weights.sum())

        logger.debug("...Exponentiated Gradient STARTING")

        B = 1 / self.eps  # according to the error analysis, B is proportional to the reciprocal of eps
        lagrangian = _Lagrangian(X,
                                 sensitive_features,
                                 y_train,
                                 self.estimator,
                                 self.constraints,
                                 self.eps,
                                 B,
                                 error_weights=self.error_weights)

        theta = pd.Series(0, lagrangian.constraints.index)  # theta starts at 0 for every constraint
        Qsum = pd.Series(dtype="float64")
        gaps_EG = []
        gaps = []
        Qs = []

        last_regret_checked = _REGRET_CHECK_START_T  # default value is 5
        last_gap = np.PINF
        for t in range(0, self.max_iter):
            logger.debug("...iter=%03d", t)

            # set lambdas for every constraint
            lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
            #print("t:",t)
            #print("lambda_vec:",lambda_vec)
            self.lambda_vecs_EG_[t] = lambda_vec
            lambda_EG = self.lambda_vecs_EG_.mean(axis=1)
            #lambda_hat, get the mean of lambdas from start to now.

            # select classifier according to best_h method
            h, h_idx = lagrangian.best_h(lambda_vec)
            #print("new best h index:",h_idx)
            #why do we set nu and learning rate eta to this value?
            if t == 0:
                if self.nu is None:
                    # unweighted version: _ACCURACY_MUL * (h(X) - y_train).abs().std() / np.sqrt(n)
                    self.nu = _ACCURACY_MUL * (
                        self.error_weights *
                        (h(X) - y_train)).abs().std() / np.sqrt(n)
                eta_min = self.nu / (2 * B)
                eta = self.eta0 / B
                logger.debug(
                    "...eps=%.3f, B=%.1f, nu=%.6f, max_iter=%d, eta_min=%.6f",
                    self.eps, B, self.nu, self.max_iter, eta_min)

            if h_idx not in Qsum.index:
                Qsum.at[h_idx] = 0.0
            # the best response is often a classifier that was already returned before
            Qsum[h_idx] += 1.0
            gamma = lagrangian.gammas[h_idx]
            Q_EG = Qsum / Qsum.sum()  # Q_hat: empirical distribution over the returned classifiers

            # eval_gap computes L_high and L_low internally via best_h and best_lambda
            result_EG = lagrangian.eval_gap(Q_EG, lambda_EG, self.nu)
            gap_EG = result_EG.gap()
            gaps_EG.append(gap_EG)

            if t == 0 or not self.run_linprog_step:
                gap_LP = np.PINF
            else:
                # saddle point optimization over the convex hull of
                # classifiers returned so far
                Q_LP, self.lambda_vecs_LP_[
                    t], result_LP = lagrangian.solve_linprog(self.nu)
                gap_LP = result_LP.gap()

            # keep values from exponentiated gradient or linear programming
            if gap_EG < gap_LP:
                #print("EG better!!!")
                Qs.append(Q_EG)
                gaps.append(gap_EG)
            else:
                #print("LP better!!!")
                Qs.append(Q_LP)
                gaps.append(gap_LP)

            logger.debug(
                "%seta=%.6f, L_low=%.3f, L=%.3f, L_high=%.3f, gap=%.6f, disp=%.3f, "
                "err=%.3f, gap_LP=%.6f", _INDENTATION, eta,
                result_EG.L_low, result_EG.L, result_EG.L_high, gap_EG,
                result_EG.gamma.max(), result_EG.error, gap_LP)

            if (gaps[t] < self.nu) and (t >= _MIN_ITER):
                # solution found
                break

            # update regret
            if t >= last_regret_checked * _REGRET_CHECK_INCREASE_T:
                best_gap = min(gaps_EG)

                if best_gap > last_gap * _SHRINK_REGRET:
                    eta *= _SHRINK_ETA
                last_regret_checked = t
                last_gap = best_gap

            # update theta based on learning rate
            theta += eta * (gamma - self.eps)

        # retain relevant result data
        #print("_PRECISION",_PRECISION)
        #print("Qs",Qs)
        gaps_series = pd.Series(gaps)
        #print("gaps_series",gaps_series)
        gaps_best = gaps_series[gaps_series <= gaps_series.min() + _PRECISION]
        #print("gaps_best",gaps_best)
        self.best_iter_ = gaps_best.index[-1]
        #print("best_iter_",self.best_iter_)
        self.best_gap_ = gaps[self.best_iter_]
        #print("best_gap",self.best_gap_)
        self.weights_ = Qs[self.best_iter_]  ##best_Q
        #print("self.weights_",self.weights_)
        self._hs = lagrangian.hs
        for h_idx in self._hs.index:
            if h_idx not in self.weights_.index:
                self.weights_.at[h_idx] = 0.0

        self.last_iter_ = len(Qs) - 1
        self.predictors_ = lagrangian.predictors
        self.n_oracle_calls_ = lagrangian.n_oracle_calls
        self.n_oracle_calls_dummy_returned_ = lagrangian.n_oracle_calls_dummy_returned
        self.oracle_execution_times_ = lagrangian.oracle_execution_times
        self.lambda_vecs_ = lagrangian.lambdas

        logger.debug("...eps=%.3f, B=%.1f, nu=%.6f, max_iter=%d, eta_min=%.6f",
                     self.eps, B, self.nu, self.max_iter, eta_min)
        logger.debug(
            "...last_iter=%d, best_iter=%d, best_gap=%.6f, n_oracle_calls=%d, n_hs=%d",
            self.last_iter_, self.best_iter_, self.best_gap_,
            lagrangian.n_oracle_calls, len(lagrangian.predictors))
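
The error weights are rescaled so that they sum to n, which keeps the weighted error on the same scale as the unweighted case. A toy sketch of that normalization:

# Toy sketch of the error-weight normalization used in the fit() above.
import pandas as pd

error_weights = pd.Series([1.0, 2.0, 1.0, 4.0])
n = len(error_weights)
error_weights = n * error_weights / error_weights.sum()   # -> [0.5, 1.0, 0.5, 2.0]; sums to n = 4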
Example #17
    def fit(self, X, y, **kwargs):
        """Run the grid search.

        This will result in multiple copies of the
        estimator being made, and the :code:`fit(X)` method
        of each one called.

        :param X: The feature matrix
        :type X: numpy.ndarray or pandas.DataFrame

        :param y: The label vector
        :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list

        :param sensitive_features: A (currently) required keyword argument listing the
            feature used by the constraints object
        :type sensitive_features: numpy.ndarray, pandas.DataFrame, pandas.Series, or list (for now)
        """
        if isinstance(self.constraints, ClassificationMoment):
            logger.debug("Classification problem detected")
            is_classification_reduction = True
        else:
            logger.debug("Regression problem detected")
            is_classification_reduction = False

        X_train, y_train, sensitive_features_train = _validate_and_reformat_input(
            X,
            y,
            enforce_binary_sensitive_feature=True,
            enforce_binary_labels=is_classification_reduction,
            **kwargs)

        kwargs[_KW_SENSITIVE_FEATURES] = sensitive_features_train

        # Prep the parity constraints and objective
        logger.debug("Preparing constraints and objective")
        self.constraints.load_data(X_train, y_train, **kwargs)
        objective = self.constraints.default_objective()
        objective.load_data(X_train, y_train, **kwargs)

        # Basis information
        pos_basis = self.constraints.pos_basis
        neg_basis = self.constraints.neg_basis
        neg_allowed = self.constraints.neg_basis_present
        objective_in_the_span = (self.constraints.default_objective_lambda_vec
                                 is not None)

        if self.grid is None:
            logger.debug("Creating grid of size %i", self.grid_size)
            grid = _GridGenerator(self.grid_size, self.grid_limit, pos_basis,
                                  neg_basis, neg_allowed,
                                  objective_in_the_span).grid
        else:
            logger.debug("Using supplied grid")
            grid = self.grid

        # Fit the estimates
        logger.debug("Setup complete. Starting grid search")
        self._all_results = []
        for i in grid.columns:
            lambda_vec = grid[i]
            logger.debug("Obtaining weights")
            weights = self.constraints.signed_weights(lambda_vec)
            if not objective_in_the_span:
                weights = weights + objective.signed_weights()

            if is_classification_reduction:
                logger.debug("Applying relabelling for classification problem")
                y_reduction = 1 * (weights > 0)
                weights = weights.abs()
            else:
                y_reduction = y_train

            current_estimator = copy.deepcopy(self.estimator)
            logger.debug("Calling underlying estimator")
            oracle_call_start_time = time()
            current_estimator.fit(X, y_reduction, sample_weight=weights)
            oracle_call_execution_time = time() - oracle_call_start_time
            logger.debug("Call to underlying estimator complete")

            def predict_fct(X):
                return current_estimator.predict(X)

            nxt = GridSearchResult(current_estimator, lambda_vec,
                                   objective.gamma(predict_fct)[0],
                                   self.constraints.gamma(predict_fct),
                                   oracle_call_execution_time)
            self._all_results.append(nxt)

        logger.debug("Selecting best_result")
        if self.selection_rule == TRADEOFF_OPTIMIZATION:

            def loss_fct(x):
                return (self.objective_weight * x.objective +
                        self.constraint_weight * x.gamma.max())

            self._best_result = min(self._all_results, key=loss_fct)
        else:
            raise RuntimeError("Unsupported selection rule")

        return
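
The TRADEOFF_OPTIMIZATION rule scores each grid point by a weighted sum of its objective value and its worst constraint violation, then keeps the minimizer. A toy sketch with made-up numbers:

# Toy sketch of the tradeoff selection rule (numbers are made up).
objective_weight, constraint_weight = 0.5, 0.5
results = [
    {"objective": 0.20, "max_gamma": 0.10},   # loss = 0.150
    {"objective": 0.15, "max_gamma": 0.25},   # loss = 0.200
    {"objective": 0.30, "max_gamma": 0.02},   # loss = 0.160
]
losses = [objective_weight * r["objective"] + constraint_weight * r["max_gamma"] for r in results]
best = losses.index(min(losses))              # -> 0, the first grid point wins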