Example no. 1
    def run_smoke_test(self, data):
        expgrad = ExponentiatedGradient(self.learner, constraints=data["cons_class"](),
                                        eps=data["eps"])
        expgrad.fit(self.X, self.y, sensitive_features=self.A)

        res = expgrad._expgrad_result._as_dict()
        Q = res["best_classifier"]
        res["n_classifiers"] = len(res["classifiers"])

        disp = data["cons_class"]()
        disp.load_data(self.X, self.y, sensitive_features=self.A)
        error = ErrorRate()
        error.load_data(self.X, self.y, sensitive_features=self.A)
        res["disp"] = disp.gamma(Q).max()
        res["error"] = error.gamma(Q)[0]

        assert res["best_gap"] == pytest.approx(
            data["best_gap"], abs=self._PRECISION)
        assert res["last_t"] == data["last_t"]
        assert res["best_t"] == data["best_t"]
        assert res["disp"] == pytest.approx(data["disp"], abs=self._PRECISION)
        assert res["error"] == pytest.approx(
            data["error"], abs=self._PRECISION)
        assert res["n_oracle_calls"] == data["n_oracle_calls"]
        assert res["n_classifiers"] == data["n_classifiers"]

    def run_smoke_test(self, data, flipped=False):
        ratio = data.get("ratio", 1.0)
        expgrad = ExponentiatedGradient(
            self.learner,
            constraints=data["cons_class"](ratio=ratio),
            eps=data["eps"])
        expgrad.fit(self.X, (self.flipped_y if flipped else self.y),
                    sensitive_features=self.A)

        def Q(X):
            return expgrad._pmf_predict(X)[:, 1]

        n_predictors = len(expgrad._predictors)

        disparity_moment = data["cons_class"](ratio=ratio)
        disparity_moment.load_data(self.X,
                                   (self.flipped_y if flipped else self.y),
                                   sensitive_features=self.A)
        error = ErrorRate()
        error.load_data(self.X, (self.flipped_y if flipped else self.y),
                        sensitive_features=self.A)
        disparity = disparity_moment.gamma(Q).max()
        error = error.gamma(Q)[0]

        assert expgrad._best_gap == pytest.approx(data["best_gap"],
                                                  abs=self._PRECISION)
        assert expgrad._last_t == data["last_t"]
        assert expgrad._best_t == data["best_t"]
        assert disparity == pytest.approx(data["disp"], abs=self._PRECISION)
        assert error == pytest.approx(data["error"], abs=self._PRECISION)
        assert expgrad._n_oracle_calls == data["n_oracle_calls"]
        assert n_predictors == data["n_predictors"]

    def test_argument_types(self, transformX, transformY, transformA,
                            A_two_dim):
        # This is an expanded-out version of one of the smoke tests
        X, y, A = _get_data(A_two_dim)
        merged_A = _map_into_single_column(A)

        expgrad = ExponentiatedGradient(LeastSquaresBinaryClassifierLearner(),
                                        constraints=DemographicParity(),
                                        eps=0.1)
        expgrad.fit(transformX(X),
                    transformY(y),
                    sensitive_features=transformA(A))

        Q = expgrad._best_classifier
        n_classifiers = len(expgrad._classifiers)

        disparity_moment = DemographicParity()
        disparity_moment.load_data(X, y, sensitive_features=merged_A)
        error = ErrorRate()
        error.load_data(X, y, sensitive_features=merged_A)
        disparity = disparity_moment.gamma(Q).max()
        error = error.gamma(Q)[0]

        assert expgrad._best_gap == pytest.approx(0.0000, abs=_PRECISION)
        assert expgrad._last_t == 5
        assert expgrad._best_t == 5
        assert disparity == pytest.approx(0.1, abs=_PRECISION)
        assert error == pytest.approx(0.25, abs=_PRECISION)
        assert expgrad._n_oracle_calls == 32
        assert n_classifiers == 3

    def run_smoke_test_binary_classification(self, data, flipped=False):
        learner = LeastSquaresBinaryClassifierLearner()
        if "ratio" in data:
            disparity_moment = data["constraint_class"](
                ratio_bound_slack=data["eps"], ratio_bound=data["ratio"])
        else:
            disparity_moment = data["constraint_class"](
                difference_bound=data["eps"])

        # Create Exponentiated Gradient object with a copy of the constraint.
        # The original disparity_moment object is used for validation, so the
        # assumption is that the moment logic is correct in these tests.
        expgrad = ExponentiatedGradient(learner,
                                        constraints=deepcopy(disparity_moment),
                                        eps=data["eps"])

        X, y, A = _get_data(A_two_dim=False, flip_y=flipped)

        expgrad.fit(X, y, sensitive_features=A)

        self._assert_expgrad_state(expgrad, data)

        # select probability of predicting 1
        def Q(X):
            return expgrad._pmf_predict(X)[:, 1]

        default_objective = ErrorRate()
        disparity_moment.load_data(X, y, sensitive_features=A)
        default_objective.load_data(X, y, sensitive_features=A)
        disparity = disparity_moment.gamma(Q).max()
        error = default_objective.gamma(Q)[0]
        assert disparity == pytest.approx(data["disp"], abs=_PRECISION)
        assert error == pytest.approx(data["error"], abs=_PRECISION)
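
As a standalone illustration of the pattern described in the comments above (fitting the mitigator on a deepcopy of the constraint, so the original moment stays free for validation), here is a minimal sketch using the same fairlearn API as these tests; the toy data and the LogisticRegression learner are illustrative assumptions, not part of the test suite.

from copy import deepcopy

import numpy as np
from sklearn.linear_model import LogisticRegression

from fairlearn.reductions import (DemographicParity, ErrorRate,
                                  ExponentiatedGradient)

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))       # illustrative features
y = (X[:, 0] > 0).astype(int)       # illustrative binary labels
A = rng.integers(0, 2, size=100)    # illustrative sensitive feature

disparity_moment = DemographicParity(difference_bound=0.01)

# Fit on a copy so the original moment can later be loaded with data
# and used to validate the mitigated predictor.
expgrad = ExponentiatedGradient(LogisticRegression(),
                                constraints=deepcopy(disparity_moment),
                                eps=0.01)
expgrad.fit(X, y, sensitive_features=A)

def Q(X):
    # probability of predicting 1, as in the tests above
    return expgrad._pmf_predict(X)[:, 1]

disparity_moment.load_data(X, y, sensitive_features=A)
objective = ErrorRate()
objective.load_data(X, y, sensitive_features=A)
print("max disparity:", disparity_moment.gamma(Q).max())
print("error:", objective.gamma(Q)[0])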
Example no. 5
    def test_argument_types(self, transformX, transformY, transformA):
        # This is an expanded-out version of one of the smoke tests
        expgrad = ExponentiatedGradient(self.learner,
                                        constraints=DemographicParity(),
                                        eps=0.1)
        expgrad.fit(transformX(self.X),
                    transformY(self.y),
                    sensitive_features=transformA(self.A))

        res = expgrad._expgrad_result._as_dict()
        Q = res["best_classifier"]
        res["n_classifiers"] = len(res["classifiers"])

        disp = DemographicParity()
        disp.load_data(self.X, self.y, sensitive_features=self.A)
        error = ErrorRate()
        error.load_data(self.X, self.y, sensitive_features=self.A)
        res["disp"] = disp.gamma(Q).max()
        res["error"] = error.gamma(Q)[0]

        assert res["best_gap"] == pytest.approx(0.0000, abs=self._PRECISION)
        assert res["last_t"] == 5
        assert res["best_t"] == 5
        assert res["disp"] == pytest.approx(0.1, abs=self._PRECISION)
        assert res["error"] == pytest.approx(0.25, abs=self._PRECISION)
        assert res["n_oracle_calls"] == 32
        assert res["n_classifiers"] == 3
Example no. 6
    def test_argument_types(self, transformX, transformY, transformA,
                            A_two_dim):
        # This is an expanded-out version of one of the smoke tests
        X, y, A = _get_data(A_two_dim)
        merged_A = _map_into_single_column(A)

        expgrad = ExponentiatedGradient(LeastSquaresBinaryClassifierLearner(),
                                        constraints=DemographicParity(),
                                        eps=0.1)
        expgrad.fit(transformX(X),
                    transformY(y),
                    sensitive_features=transformA(A))

        res = expgrad._expgrad_result._as_dict()
        Q = res["best_classifier"]
        res["n_classifiers"] = len(res["classifiers"])

        disp = DemographicParity()
        disp.load_data(X, y, sensitive_features=merged_A)
        error = ErrorRate()
        error.load_data(X, y, sensitive_features=merged_A)
        res["disp"] = disp.gamma(Q).max()
        res["error"] = error.gamma(Q)[0]

        assert res["best_gap"] == pytest.approx(0.0000, abs=_PRECISION)
        assert res["last_t"] == 5
        assert res["best_t"] == 5
        assert res["disp"] == pytest.approx(0.1, abs=_PRECISION)
        assert res["error"] == pytest.approx(0.25, abs=_PRECISION)
        assert res["n_oracle_calls"] == 32
        assert res["n_classifiers"] == 3
Example no. 7
def gridSearch(model, X_train, Y_train, A_train, grid_size):
    """
    Generates a sequence of relabellings and reweightings, and trains a
    predictor for each. Only applicable to a binary sensitive feature.

    Parameters:
    model: the unmitigated algorithmic model
    X_train: input data for training the model
    Y_train: list of ground-truth labels
    A_train: the binary sensitive feature to mitigate against
    grid_size: number of predictors to train in the sweep

    Returns the list of predictors that are non-dominated in the
    error-disparity space.
    """
    sweep = GridSearch(model,
                       constraints=DemographicParity(),
                       grid_size=grid_size)

    sweep.fit(X_train, Y_train, sensitive_features=A_train)

    # Extract the full set of predictors from the `GridSearch` object
    predictors = sweep._predictors

    # Remove the predictors which are dominated in the error-disparity space
    # by others from the sweep (note that the disparity will only be
    # calculated for the protected attribute; other potentially protected
    # attributes will not be mitigated).
    #
    # In general, one might not want to do this, since there may be other
    # considerations beyond the strict optimisation of error and disparity
    # (of the given protected attribute).
    errors, disparities = [], []
    for m in predictors:
        def classifier(X):
            return m.predict(X)

        error = ErrorRate()
        error.load_data(X_train,
                        pd.Series(Y_train),
                        sensitive_features=A_train)
        disparity = DemographicParity()
        disparity.load_data(X_train,
                            pd.Series(Y_train),
                            sensitive_features=A_train)

        errors.append(error.gamma(classifier)[0])
        disparities.append(disparity.gamma(classifier).max())

    all_results = pd.DataFrame({
        "predictor": predictors,
        "error": errors,
        "disparity": disparities
    })

    non_dominated = []
    for row in all_results.itertuples():
        errors_for_lower_or_eq_disparity = all_results["error"][
            all_results["disparity"] <= row.disparity]
        if row.error <= errors_for_lower_or_eq_disparity.min():
            non_dominated.append(row.predictor)

    return non_dominated
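
A hypothetical invocation of gridSearch, assuming the imports its body relies on (GridSearch, DemographicParity, ErrorRate, pandas as pd) are already in scope; the data and the LogisticRegression learner are illustrative stand-ins for a real training split.

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X_train = pd.DataFrame(rng.normal(size=(200, 4)))
Y_train = (X_train[0] > 0).astype(int)      # illustrative binary labels
A_train = pd.Series(rng.integers(0, 2, size=200), name="group")

non_dominated = gridSearch(LogisticRegression(), X_train, Y_train, A_train,
                           grid_size=10)
print(f"{len(non_dominated)} predictors survive the dominance filter")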

    def run_smoke_test(self, data, flipped=False):
        if flipped:
            y = self.flipped_y
        else:
            y = self.y

        if "ratio" in data:
            expgrad = ExponentiatedGradient(self.learner,
                                            constraints=data["cons_class"](
                                                ratio_bound_slack=data["eps"],
                                                ratio_bound=data["ratio"]),
                                            eps=data["eps"])
        else:
            expgrad = ExponentiatedGradient(
                self.learner,
                constraints=data["cons_class"](difference_bound=data["eps"]),
                eps=data["eps"])

        expgrad.fit(self.X, y, sensitive_features=self.A)

        def Q(X):
            return expgrad._pmf_predict(X)[:, 1]

        n_predictors = len(expgrad.predictors_)

        if "ratio" in data:
            disparity_moment = data["cons_class"](
                ratio_bound_slack=data["eps"], ratio_bound=data["ratio"])
        else:
            disparity_moment = data["cons_class"](difference_bound=data["eps"])

        disparity_moment.load_data(self.X, y, sensitive_features=self.A)
        error = ErrorRate()
        error.load_data(self.X, y, sensitive_features=self.A)
        disparity = disparity_moment.gamma(Q).max()
        error = error.gamma(Q)[0]

        assert expgrad.best_gap_ == pytest.approx(data["best_gap"],
                                                  abs=self._PRECISION)
        assert expgrad.last_iter_ == data["last_iter"]
        assert expgrad.best_iter_ == data["best_iter"]
        assert expgrad.last_iter_ >= _MIN_ITER
        assert disparity == pytest.approx(data["disp"], abs=self._PRECISION)
        assert error == pytest.approx(data["error"], abs=self._PRECISION)
        assert expgrad.n_oracle_calls_ == data["n_oracle_calls"]
        assert expgrad.n_oracle_calls_dummy_returned_ == data[
            "n_oracle_calls_dummy_returned"]
        assert n_predictors == data["n_predictors"]
        assert len(expgrad.oracle_execution_times_) == expgrad.n_oracle_calls_

    def __remove_predictors_dominated_error_disparity_by_sweep(
            self, predictors, X_train, Y_train, A_train):
        # Build a table of error and disparity for each predictor in the
        # sweep, evaluated against the `diabetic` sensitive feature.
        errors, disparities = [], []
        for m in predictors:
            def classifier(X):
                return m.predict(X)

            error = ErrorRate()
            error.load_data(X_train, pd.Series(Y_train),
                            sensitive_features=A_train.diabetic)
            disparity = DemographicParity()
            disparity.load_data(X_train, pd.Series(Y_train),
                                sensitive_features=A_train.diabetic)

            errors.append(error.gamma(classifier)[0])
            disparities.append(disparity.gamma(classifier).max())

        return pd.DataFrame({
            "predictor": predictors,
            "error": errors,
            "disparity": disparities
        })

    def test_argument_types_ratio_bound(self, transformX, transformY,
                                        transformA, A_two_dim):
        # This is an expanded-out version of one of the smoke tests
        X, y, A = _get_data(A_two_dim)
        merged_A = _map_into_single_column(A)

        transformed_X = transformX(X)
        transformed_y = transformY(y)
        transformed_A = transformA(A)
        eps = 0.1
        ratio = 1.0

        expgrad = ExponentiatedGradient(
            LeastSquaresBinaryClassifierLearner(),
            constraints=DemographicParity(ratio_bound_slack=eps,
                                          ratio_bound=ratio),
            eps=eps,
        )
        expgrad.fit(transformed_X,
                    transformed_y,
                    sensitive_features=transformed_A)

        def Q(X):
            return expgrad._pmf_predict(X)[:, 1]

        n_predictors = len(expgrad.predictors_)

        disparity_moment = DemographicParity(ratio_bound_slack=eps,
                                             ratio_bound=ratio)
        disparity_moment.load_data(X, y, sensitive_features=merged_A)
        error = ErrorRate()
        error.load_data(X, y, sensitive_features=merged_A)
        disp = disparity_moment.gamma(Q)
        disparity = disp.max()
        disp_eps = disp - disparity_moment.bound()
        error = error.gamma(Q)[0]

        assert expgrad.best_gap_ == pytest.approx(0.0000, abs=_PRECISION)
        assert expgrad.last_iter_ == 5
        assert expgrad.best_iter_ == 5
        assert disparity == pytest.approx(0.1, abs=_PRECISION)
        assert np.all(np.isclose(disp - eps, disp_eps))
        assert error == pytest.approx(0.25, abs=_PRECISION)
        assert expgrad.n_oracle_calls_ == 32
        assert n_predictors == 3

    def run_smoke_test(self, data):
        expgrad = ExponentiatedGradient(self.learner,
                                        constraints=data["cons_class"](),
                                        eps=data["eps"])
        expgrad.fit(self.X, self.y, sensitive_features=self.A)

        Q = expgrad._best_classifier
        n_classifiers = len(expgrad._classifiers)

        disparity_moment = data["cons_class"]()
        disparity_moment.load_data(self.X, self.y, sensitive_features=self.A)
        error = ErrorRate()
        error.load_data(self.X, self.y, sensitive_features=self.A)
        disparity = disparity_moment.gamma(Q).max()
        error = error.gamma(Q)[0]

        assert expgrad._best_gap == pytest.approx(data["best_gap"],
                                                  abs=self._PRECISION)
        assert expgrad._last_t == data["last_t"]
        assert expgrad._best_t == data["best_t"]
        assert disparity == pytest.approx(data["disp"], abs=self._PRECISION)
        assert error == pytest.approx(data["error"], abs=self._PRECISION)
        assert expgrad._n_oracle_calls == data["n_oracle_calls"]
        assert n_classifiers == data["n_classifiers"]
Example no. 12
# Remove the predictors which are dominated in the error-disparity space by
# others from the sweep. In general, one might not want to do this, since
# there may be other considerations beyond the strict optimization of error
# and disparity (of the given sensitive feature).

errors, disparities = [], []
for m in predictors:

    def classifier(X):
        return m.predict(X)

    error = ErrorRate()
    error.load_data(X_train, pd.Series(Y_train), sensitive_features=A_train)
    disparity = DemographicParity()
    disparity.load_data(X_train,
                        pd.Series(Y_train),
                        sensitive_features=A_train)

    errors.append(error.gamma(classifier)[0])
    disparities.append(disparity.gamma(classifier).max())

all_results = pd.DataFrame({
    "predictor": predictors,
    "error": errors,
    "disparity": disparities
})

non_dominated = []
for row in all_results.itertuples():
    errors_for_lower_or_eq_disparity = all_results["error"][
        all_results["disparity"] <= row.disparity]
    if row.error <= errors_for_lower_or_eq_disparity.min():
        non_dominated.append(row.predictor)
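
To make the filter concrete, here is a toy run of the same loop on hypothetical error/disparity numbers (the predictor labels and values are purely illustrative): a predictor is kept only if no other predictor achieves lower error at the same or lower disparity.

import pandas as pd

# Toy illustration of the dominance filter above; "b" is dominated because
# "a" reaches lower error at the same disparity, while "c" survives as the
# lowest-error predictor overall.
all_results = pd.DataFrame({
    "predictor": ["a", "b", "c"],
    "error": [0.10, 0.12, 0.08],
    "disparity": [0.05, 0.05, 0.20],
})

non_dominated = []
for row in all_results.itertuples():
    errors_for_lower_or_eq_disparity = all_results["error"][
        all_results["disparity"] <= row.disparity]
    if row.error <= errors_for_lower_or_eq_disparity.min():
        non_dominated.append(row.predictor)

print(non_dominated)  # ['a', 'c']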