def test_X_A_different_rows(self, transformX, transformY, transformA):
    """Check that ``fit`` rejects sensitive features with one row too many.

    The sensitive feature vector is generated with ``len(Y) + 1`` entries,
    and the resulting ``RuntimeError`` must carry the expected message as
    its first argument.
    """
    search = GridSearch(self.estimator, self.disparity_criterion)
    features, labels, _ = self._quick_data()
    # One extra entry relative to the labels creates the row mismatch.
    oversized_A = np.random.randint(2, size=len(labels) + 1)
    expected_message = "X and the sensitive features must have same number of rows"

    with pytest.raises(RuntimeError) as exc_info:
        search.fit(
            transformX(features),
            transformY(labels),
            sensitive_features=transformA(oversized_A),
        )

    assert exc_info.value.args[0] == expected_message
def test_Y_df_bad_columns(self, transformX, transformA):
    """Check that a two-column DataFrame passed as ``y`` is rejected.

    ``fit`` must raise ``RuntimeError`` with the expected message as its
    first argument.
    """
    search = GridSearch(self.estimator, self.disparity_criterion, self.quality_metric)
    features, labels, sensitive = self._quick_data()
    # Duplicate the label column so that y has two columns.
    two_column_y = pd.DataFrame({"a": labels, "b": labels})
    expected_message = "y is a DataFrame with more than one column"

    with pytest.raises(RuntimeError) as exc_info:
        search.fit(
            transformX(features),
            two_column_y,
            sensitive_features=transformA(sensitive),
        )

    assert exc_info.value.args[0] == expected_message
def test_Y_ternary(self, transformX, transformY, transformA, A_two_dim):
    """Check that labels containing the values 0, 1 and 2 are rejected.

    ``fit`` must raise ``ValueError`` whose first argument equals
    ``_LABELS_NOT_0_1_ERROR_MESSAGE``.
    """
    search = GridSearch(self.estimator, self.disparity_criterion)
    features, labels, sensitive = _quick_data(A_two_dim)
    # Write 0, 1, 2 into the first three positions to obtain three label values.
    for position in range(3):
        labels[position] = position

    with pytest.raises(ValueError) as exc_info:
        search.fit(
            transformX(features),
            transformY(labels),
            sensitive_features=transformA(sensitive),
        )

    assert exc_info.value.args[0] == _LABELS_NOT_0_1_ERROR_MESSAGE
def test_X_Y_different_rows(self, transformX, transformY, transformA, A_two_dim):
    """Check that ``fit`` rejects labels with one row more than the other inputs.

    ``Y`` is generated with ``len(A) + 1`` entries, and the resulting
    ``ValueError`` must mention inconsistent numbers of samples.
    """
    gs = GridSearch(self.estimator, self.disparity_criterion)
    # Forward the A_two_dim fixture: previously it was ignored, so both
    # parametrizations of this test exercised the same sensitive-feature layout.
    X, _, A = _quick_data(A_two_dim)
    # One extra label relative to the sensitive features creates the mismatch.
    Y = np.random.randint(2, size=len(A) + 1)

    with pytest.raises(ValueError) as execInfo:
        gs.fit(transformX(X), transformY(Y), sensitive_features=transformA(A))

    expected_exception_message = "Found input variables with inconsistent numbers of samples"
    assert expected_exception_message in execInfo.value.args[0]
def test_Y_not_0_1(self, transformX, transformY, transformA, A_two_dim):
    """Check that labels shifted up by one are rejected.

    ``fit`` must raise ``ValueError`` whose first argument equals
    ``_LABELS_NOT_0_1_ERROR_MESSAGE``.
    """
    search = GridSearch(
        self.estimator,
        self.disparity_criterion,
        sample_weight_name=self.sample_weight_name,
    )
    features, labels, sensitive = _quick_data(A_two_dim)
    shifted_labels = labels + 1

    with pytest.raises(ValueError) as exc_info:
        search.fit(
            transformX(features),
            transformY(shifted_labels),
            sensitive_features=transformA(sensitive),
        )

    assert exc_info.value.args[0] == _LABELS_NOT_0_1_ERROR_MESSAGE
def test_A_df_bad_columns(self, transformX, transformY):
    """Check that a two-column DataFrame of sensitive features is rejected.

    ``fit`` must raise ``RuntimeError`` with the expected message as its
    first argument.
    """
    search = GridSearch(self.estimator, self.disparity_criterion)
    features, labels, sensitive = self._quick_data()
    # Duplicate the sensitive feature so that the DataFrame has two columns.
    two_column_A = pd.DataFrame({"a": sensitive, "b": sensitive})
    expected_message = "sensitive_features is a DataFrame with more than one column"

    with pytest.raises(RuntimeError) as exc_info:
        search.fit(
            transformX(features),
            transformY(labels),
            sensitive_features=two_column_A,
        )

    assert exc_info.value.args[0] == expected_message
def test_A_ndarray_bad_columns(self, transformX, transformY):
    """Check that a two-column ndarray of sensitive features is rejected.

    ``fit`` must raise ``RuntimeError`` with the expected message as its
    first argument.
    """
    search = GridSearch(self.estimator, self.disparity_criterion)
    features, labels, sensitive = self._quick_data()
    # Stack the sensitive feature with itself along a new last axis.
    two_column_A = np.stack((sensitive, sensitive), axis=-1)
    expected_message = "sensitive_features is an ndarray with more than one column"

    with pytest.raises(RuntimeError) as exc_info:
        search.fit(
            transformX(features),
            transformY(labels),
            sensitive_features=two_column_A,
        )

    assert exc_info.value.args[0] == expected_message
def test_Y_ndarray_bad_columns(self, transformX, transformA):
    """Check that a two-column ndarray passed as ``y`` is rejected.

    ``fit`` must raise ``RuntimeError`` with the expected message as its
    first argument.
    """
    search = GridSearch(self.estimator, self.disparity_criterion, self.quality_metric)
    features, labels, sensitive = self._quick_data()
    # Stack the labels with themselves along a new last axis.
    two_column_y = np.stack((labels, labels), axis=-1)
    expected_message = "y is an ndarray with more than one column"

    with pytest.raises(RuntimeError) as exc_info:
        search.fit(
            transformX(features),
            two_column_y,
            sensitive_features=transformA(sensitive),
        )

    assert exc_info.value.args[0] == expected_message
def test_many_sensitive_feature_groups_warning(self, transformX, transformY, transformA,
                                               A_two_dim, caplog):
    """Check the warnings logged when many sensitive feature groups are present.

    The purpose of this test case is to create enough groups to trigger
    certain expected warnings. The scenario should still work and succeed:
    ``fit`` is expected to complete and to log exactly two records, the grid
    dimension warning followed by the grid size warning.
    """
    grid_size = 10
    gs = GridSearch(self.estimator, self.disparity_criterion, grid_size=grid_size)
    X, Y, A = _quick_data(A_two_dim)

    # Overwrite the first six rows so that the group labels 0..5 all occur.
    for group in range(6):
        if A_two_dim:
            A[group][0] = group
            A[group][1] = group
        else:
            A[group] = group

    caplog.set_level(logging.WARNING)
    gs.fit(transformX(X), transformY(Y), sensitive_features=transformA(A))

    log_records = caplog.get_records('call')
    # Expect both the dimension warning and the grid size warning. Check the
    # count before unpacking so that a missing warning is reported as an
    # assertion failure rather than an IndexError.
    assert len(log_records) == 2
    dimension_log_record, size_log_record = log_records

    if isinstance(self.disparity_criterion, EqualizedOdds):
        # not every label occurs with every group
        grid_dimensions = 10
    else:
        # 6 groups total, but one is not part of the basis, so 5 dimensions
        grid_dimensions = 5

    assert GRID_DIMENSION_WARN_TEMPLATE \
        .format(grid_dimensions, GRID_DIMENSION_WARN_THRESHOLD) \
        in dimension_log_record.msg.format(*dimension_log_record.args)
    assert GRID_SIZE_WARN_TEMPLATE.format(grid_size, 2**grid_dimensions) \
        in size_log_record.msg.format(*size_log_record.args)
def test_Y_ternary(self, transformX, transformY, transformA):
    """Check that labels containing the values 0, 1 and 2 are rejected.

    ``fit`` must raise ``RuntimeError`` with the expected message as its
    first argument.
    """
    search = GridSearch(self.estimator, self.disparity_criterion)
    features, labels, sensitive = self._quick_data()
    # Write 0, 1, 2 into the first three positions to obtain three label values.
    for position in range(3):
        labels[position] = position
    expected_message = "Supplied y labels are not 0 or 1"

    with pytest.raises(RuntimeError) as exc_info:
        search.fit(
            transformX(features),
            transformY(labels),
            sensitive_features=transformA(sensitive),
        )

    assert exc_info.value.args[0] == expected_message
def test_sensitive_feature_non_binary(self, transformX, transformY, transformA):
    """Check that sensitive features with three distinct values are rejected.

    ``fit`` must raise ``RuntimeError`` with the expected message as its
    first argument.
    """
    search = GridSearch(self.estimator, self.disparity_criterion)
    features, labels, sensitive = self._quick_data()
    # Write 0, 1, 2 into the first three positions to obtain three groups.
    for position in range(3):
        sensitive[position] = position
    expected_message = "Sensitive features contain more than two unique values"

    with pytest.raises(RuntimeError) as exc_info:
        search.fit(
            transformX(features),
            transformY(labels),
            sensitive_features=transformA(sensitive),
        )

    assert exc_info.value.args[0] == expected_message
def test_Y_not_0_1(self, transformX, transformY, transformA):
    """Check that labels shifted up by one are rejected.

    ``fit`` must raise ``RuntimeError`` with the expected message as its
    first argument.
    """
    search = GridSearch(self.estimator, self.disparity_criterion, self.quality_metric)
    features, labels, sensitive = self._quick_data()
    shifted_labels = labels + 1
    expected_message = "Supplied y labels are not 0 or 1"

    with pytest.raises(RuntimeError) as exc_info:
        search.fit(
            transformX(features),
            transformY(shifted_labels),
            sensitive_features=transformA(sensitive),
            number_of_lagrange_multipliers=3,
        )

    assert exc_info.value.args[0] == expected_message
def test_demographicparity_fair_uneven_populations(A_two_dim):
    """Fit a DemographicParity grid search on threshold data and check predictions.

    Variant of test_demographicparity_already_fair, which has unequal
    populations in the two classes. The score threshold is kept in a local
    variable so that it can be adjusted.
    """
    grid_size = 11
    threshold = 0.625
    label_a0, label_a1 = 17, 37

    X, Y, A = _simple_threshold_data(
        4, 4, threshold, threshold, label_a0, label_a1, A_two_dim
    )

    base_estimator = LogisticRegression(solver="liblinear", fit_intercept=True)
    grid_search = GridSearch(
        base_estimator, constraints=DemographicParity(), grid_size=grid_size
    )
    grid_search.fit(X, Y, sensitive_features=A)
    assert_n_grid_search_results(grid_size, grid_search)

    test_X = pd.DataFrame(
        {
            "actual_feature": [0.2, 0.7],
            "sensitive_features": [label_a0, label_a1],
            "constant_ones_feature": [1, 1],
        }
    )

    # Predictions and probabilities of the grid search object itself.
    predicted = grid_search.predict(test_X)
    assert np.array_equal(predicted, [0, 1])

    probabilities = grid_search.predict_proba(test_X)
    expected_probabilities = [[0.53748641, 0.46251359], [0.46688736, 0.53311264]]
    assert np.allclose(probabilities, expected_probabilities)

    # Predictions of the first predictor in the grid.
    first_predictor_output = grid_search.predictors_[0].predict(test_X)
    assert np.array_equal(first_predictor_output, [1, 0])
def test_bgl_unfair(A_two_dim):
    """Fit a BoundedGroupLoss grid search on regression data and check predictions.

    Checks the number of grid results, the output of ``predict`` and the
    output of every entry in ``predictors_`` on two test rows.
    """
    grid_size = 7
    label_a0, label_a1 = 2, 3

    X, Y, A = _simple_regression_data(5, 7, 1, 16, label_a0, label_a1, A_two_dim)

    constraint = BoundedGroupLoss(SquareLoss(-np.inf, np.inf))
    grid_search = GridSearch(LinearRegression(), constraints=constraint, grid_size=grid_size)
    grid_search.fit(X, Y, sensitive_features=A)
    assert_n_grid_search_results(grid_size, grid_search)

    test_X = pd.DataFrame(
        {
            "actual_feature": [0.2, 0.7],
            "sensitive_features": [label_a0, label_a1],
            "constant_ones_feature": [1, 1],
        }
    )

    best_predict = grid_search.predict(test_X)
    assert np.allclose([-1.91764706, 9.61176471], best_predict)

    all_predict = []
    for predictor in grid_search.predictors_:
        all_predict.append(predictor.predict(test_X))

    # TODO: investigate where the different outcomes for the first grid point are from, likely
    # due to some ignored data points at the edge resulting in another solution with the same
    # least squares loss (i.e. both solutions acceptable).
    # Reflects https://github.com/fairlearn/fairlearn/issues/265
    first_point = [all_predict[0]]
    assert (
        logging_all_close([[3.2, 11.2]], first_point)
        or logging_all_close([[3.03010885, 11.2]], first_point)
    )

    expected_remaining = [
        [-3.47346939, 10.64897959],
        [-2.68, 10.12],
        [-1.91764706, 9.61176471],
        [-1.18461538, 9.12307692],
        [-0.47924528, 8.65283019],
        [0.2, 0.7],
    ]
    assert logging_all_close(expected_remaining, all_predict[1:])
def test_demographicparity_fair_uneven_populations_with_grid_offset(A_two_dim, offset):
    """Fit a DemographicParity grid search whose multiplier grid has an offset.

    The grid of Lagrangian multipliers is given an initial offset, built as a
    Series holding ``offset`` over the (sign, event, group_id) index.
    """
    grid_size = 11
    threshold = 0.625
    label_a0, label_a1 = 17, 37

    multiplier_index = pd.MultiIndex.from_product(
        [['+', '-'], ['all'], [label_a0, label_a1]],
        names=['sign', 'event', 'group_id'],
    )
    grid_offset = pd.Series(offset, index=multiplier_index)

    X, Y, A = _simple_threshold_data(
        4, 4, threshold, threshold, label_a0, label_a1, A_two_dim
    )

    base_estimator = LogisticRegression(solver='liblinear', fit_intercept=True)
    grid_search = GridSearch(
        base_estimator,
        constraints=DemographicParity(),
        grid_size=grid_size,
        grid_offset=grid_offset,
    )
    grid_search.fit(X, Y, sensitive_features=A)
    assert_n_grid_search_results(grid_size, grid_search)

    test_X = pd.DataFrame(
        {
            "actual_feature": [0.2, 0.7],
            "sensitive_features": [label_a0, label_a1],
            "constant_ones_feature": [1, 1],
        }
    )

    # Predictions and probabilities of the grid search object itself.
    predicted = grid_search.predict(test_X)
    assert np.array_equal(predicted, [0, 1])

    probabilities = grid_search.predict_proba(test_X)
    expected_probabilities = [[0.55069845, 0.44930155], [0.41546008, 0.58453992]]
    assert np.allclose(probabilities, expected_probabilities)

    # Predictions of the first predictor in the grid.
    first_predictor_output = grid_search.predictors_[0].predict(test_X)
    assert np.array_equal(first_predictor_output, [1, 0])
def test_X_A_different_rows(self, transformX, transformY, transformA, A_two_dim):
    """Check that ``fit`` rejects sensitive features with one row too many.

    The sensitive features are generated with ``len(Y) + 1`` rows (stacked
    into two columns when ``A_two_dim`` is set), and the resulting
    ``ValueError`` must mention inconsistent numbers of samples.
    """
    search = GridSearch(
        self.estimator,
        self.disparity_criterion,
        sample_weight_name=self.sample_weight_name,
    )
    features, labels, _ = _quick_data(A_two_dim)

    # One extra row relative to the labels creates the mismatch.
    sensitive = np.random.randint(2, size=len(labels) + 1)
    if A_two_dim:
        sensitive = np.stack((sensitive, sensitive), axis=-1)

    with pytest.raises(ValueError) as exc_info:
        search.fit(
            transformX(features),
            transformY(labels),
            sensitive_features=transformA(sensitive),
        )

    expected_fragment = "Found input variables with inconsistent numbers of samples"
    assert expected_fragment in exc_info.value.args[0]
def test_lambda_vec_zero_unchanged_model(A_two_dim):
    """Check that an all-zero multiplier grid reproduces the unmitigated model.

    A copy of the estimator is fitted directly; the grid search is then run
    with a single all-zero grid column, and the coefficients of its best
    predictor must equal those of the directly fitted copy.
    """
    threshold = 0.6
    label_a0, label_a1 = 7, 22

    X, y, A = _simple_threshold_data(
        64, 24, threshold, threshold, label_a0, label_a1, A_two_dim
    )

    estimator = LogisticRegression(solver="liblinear", fit_intercept=True, random_state=97)

    # Train an unmitigated estimator on a deep copy of the base estimator.
    baseline = copy.deepcopy(estimator)
    baseline.fit(X, y)

    # Build a one-column grid in which every Lagrange multiplier is zero.
    multiplier_index = pd.MultiIndex.from_product(
        [["+", "-"], ["all"], [label_a0, label_a1]],
        names=["sign", "event", "group_id"],
    )
    zero_grid = pd.DataFrame(pd.Series(np.zeros(4), index=multiplier_index))

    grid_search = GridSearch(estimator, constraints=DemographicParity(), grid=zero_grid)
    grid_search.fit(X, y, sensitive_features=A)
    assert_n_grid_search_results(1, grid_search)

    # Check coefficients
    best_predictor = grid_search.predictors_[grid_search.best_idx_]
    assert np.array_equal(best_predictor.coef_, baseline.coef_)
def test_demographicparity_fair_uneven_populations():
    """Fit a DemographicParity grid search on threshold data and check its outputs.

    Variant of test_demographicparity_already_fair, which has unequal
    populations in the two classes. The score threshold is kept in a local
    variable so that it can be adjusted.
    """
    score_threshold = 0.625
    number_a0 = 4
    number_a1 = 4
    a0_label = 17
    a1_label = 37
    grid_size = 11

    X, Y, A = _simple_threshold_data(number_a0, number_a1,
                                     score_threshold, score_threshold,
                                     a0_label, a1_label)

    target = GridSearch(LogisticRegression(solver='liblinear', fit_intercept=True),
                        disparity_metric=moments.DemographicParity(),
                        quality_metric=SimpleClassificationQualityMetric(),
                        grid_size=grid_size)
    target.fit(X, Y, sensitive_features=A)
    assert len(target.all_results) == grid_size

    test_X = pd.DataFrame({
        "actual_feature": [0.2, 0.7],
        "sensitive_features": [a0_label, a1_label],
        "constant_ones_feature": [1, 1]
    })

    # The prediction of the selected model was previously computed but never
    # checked. The expected labels follow from the probabilities asserted
    # just below (the more probable class per row).
    sample_results = target.predict(test_X)
    assert np.array_equal(sample_results, [0, 1])

    sample_proba = target.predict_proba(test_X)
    assert np.allclose(sample_proba, [[0.53748641, 0.46251359], [0.46688736, 0.53311264]])

    # Predictions of the first model in the grid.
    first_model_results = target.all_results[0].model.predict(test_X)
    assert np.array_equal(first_model_results, [1, 0])

    # One entry per grid point is expected from the posterior_* calls.
    all_results = target.posterior_predict(test_X)
    assert len(all_results) == grid_size

    all_proba = target.posterior_predict_proba(test_X)
    assert len(all_proba) == grid_size
def lagrangian(constraint, model, constraint_weight, grid_size,
               X_train, Y_train, A_train, X_test):
    """Train a fairness-constrained ``GridSearch`` model and predict on ``X_test``.

    The base estimator is looked up as ``models[model]`` and used as the
    black-box estimator inside ``GridSearch``.

    Parameters
    ----------
    constraint : str
        ``'DP'`` selects ``DemographicParity``; ``'EO'`` selects
        ``EqualizedOdds``.
    model
        Key into the module-level ``models`` mapping.
    constraint_weight
        Forwarded to ``GridSearch`` as ``constraint_weight``.
    grid_size
        Forwarded to ``GridSearch`` as ``grid_size``.
    X_train, Y_train, A_train
        Training features, labels and sensitive features passed to ``fit``.
    X_test
        Features passed to ``predict``.

    Returns
    -------
    tuple
        ``(Y_pred, elapsed)`` where ``elapsed`` is the result of
        ``time_diff_in_microseconds`` applied to the wall-clock duration of
        model construction, fitting and prediction.

    Raises
    ------
    ValueError
        If ``constraint`` is neither ``'DP'`` nor ``'EO'``. Previously this
        case failed later with an ``UnboundLocalError``.
    """
    start_time = datetime.now()

    if constraint == 'DP':
        constraints = DemographicParity()
    elif constraint == 'EO':
        constraints = EqualizedOdds()
    else:
        raise ValueError(
            "Unknown constraint {!r}; expected 'DP' or 'EO'".format(constraint))

    clf = GridSearch(models[model],
                     constraints=constraints,
                     constraint_weight=constraint_weight,
                     grid_size=grid_size)
    clf.fit(X_train, Y_train, sensitive_features=A_train)
    Y_pred = clf.predict(X_test)

    end_time = datetime.now()
    return Y_pred, time_diff_in_microseconds(end_time - start_time)
def run_gridsearch_classification(estimator, moment):
    """Run classification test with GridSearch.

    Fits an unmitigated copy of ``estimator`` and a ``GridSearch`` using
    ``moment`` on the data returned by ``fetch_adult``. It then asserts that,
    for every index of the constraint violation computed on the test split,
    the mitigated magnitude does not exceed the unmitigated one.
    """
    num_predictors = 11
    X_train, Y_train, A_train, X_test, Y_test, A_test = fetch_adult()

    # Independent copies: one moment for verification, one estimator as baseline.
    verification_moment = copy.deepcopy(moment)
    unmitigated = copy.deepcopy(estimator)
    unmitigated.fit(X_train, Y_train)

    gs = GridSearch(estimator, constraints=moment, grid_size=num_predictors)
    gs.fit(X_train, Y_train, sensitive_features=A_train)
    assert len(gs.predictors_) == num_predictors

    verification_moment.load_data(X_test, Y_test, sensitive_features=A_test)
    gamma_unmitigated = verification_moment.gamma(unmitigated.predict)
    gamma_mitigated = verification_moment.gamma(gs.predict)

    for idx in gamma_mitigated.index:
        mitigated_size = abs(gamma_mitigated[idx])
        unmitigated_size = abs(gamma_unmitigated[idx])
        assert mitigated_size <= unmitigated_size, "Checking {0}".format(idx)
def evaluate(weight, X_train, y_train, X_test, y_test, sex_train, sex_test, index):
    """Fit a DemographicParity grid search and record its test error and disparity.

    A ``GridSearch`` over ``GradientBoostingClassifier`` is fitted with
    ``constraint_weight=weight``. The overall test error and the value
    returned by ``demographic`` are appended to the module-level
    ``errorlist[index]`` and ``dplist[index]`` and printed.
    """
    solver = GridSearch(
        GradientBoostingClassifier(),
        DemographicParity(),
        grid_size=10,
        constraint_weight=weight,
    )
    solver.fit(X_train, y_train, sensitive_features=sex_train)
    predictions = solver.predict(X_test)

    accuracy_by_group = group_summary(
        accuracy_score, y_test, predictions, sensitive_features=sex_test
    )
    selection_rates = selection_rate_group_summary(
        y_test, predictions, sensitive_features=sex_test
    )

    # Error is the complement of the overall accuracy.
    error = 1 - accuracy_by_group["overall"]
    dp = demographic(selection_rates)

    errorlist[index].append(error)
    dplist[index].append(dp)
    print("error:%f,dp:%f" % (error, dp))
def test_lagrange_multiplier_zero_unchanged_model():
    """Check that an all-zero multiplier grid reproduces the unmitigated model.

    A copy of the estimator is fitted directly; the grid search is then run
    with a single all-zero grid column, and the coefficients of its best
    result must equal those of the directly fitted copy.
    """
    threshold = 0.6
    label_a0, label_a1 = 7, 22

    X, y, A = _simple_threshold_data(64, 24, threshold, threshold, label_a0, label_a1)

    estimator = LogisticRegression(solver='liblinear', fit_intercept=True, random_state=97)

    # Train an unmitigated estimator on a deep copy of the base estimator.
    baseline = copy.deepcopy(estimator)
    baseline.fit(X, y)

    # Build a one-column grid in which every Lagrange multiplier is zero.
    multiplier_index = pd.MultiIndex.from_product(
        [['+', '-'], ['all'], [label_a0, label_a1]],
        names=['sign', 'event', 'group_id'],
    )
    zero_grid = pd.DataFrame(pd.Series(np.zeros(4), index=multiplier_index))

    target = GridSearch(
        estimator,
        disparity_metric=moments.DemographicParity(),
        quality_metric=SimpleClassificationQualityMetric(),
        grid=zero_grid,
    )
    target.fit(X, y, sensitive_features=A)
    assert len(target.all_results) == 1

    # Check coefficients
    assert np.array_equal(target.best_result.model.coef_, baseline.coef_)
def test_bgl_unfair():
    """Fit a GroupLossMoment grid search on regression data and check predictions.

    Checks the number of grid results, the output of ``predict`` and the
    output of ``posterior_predict`` on two test rows.
    """
    label_a0, label_a1 = 2, 3

    X, Y, A = _simple_regression_data(5, 7, 1, 16, label_a0, label_a1)

    disparity = moments.GroupLossMoment(moments.ZeroOneLoss())
    target = GridSearch(
        LinearRegression(),
        disparity_metric=disparity,
        quality_metric=SimpleRegressionQualityMetric(),
        grid_size=7,
    )
    target.fit(X, Y, sensitive_features=A)
    assert len(target.all_results) == 7

    test_X = pd.DataFrame(
        {
            "actual_feature": [0.2, 0.7],
            "sensitive_features": [label_a0, label_a1],
            "constant_ones_feature": [1, 1],
        }
    )

    best_predict = target.predict(test_X)
    assert np.allclose([-1.91764706, 9.61176471], best_predict)

    # One row of expected predictions per grid point.
    expected_all = [
        [3.2, 11.2],
        [-3.47346939, 10.64897959],
        [-2.68, 10.12],
        [-1.91764706, 9.61176471],
        [-1.18461538, 9.12307692],
        [-0.47924528, 8.65283019],
        [0.2, 0.7],
    ]
    all_predict = target.posterior_predict(test_X)
    assert np.allclose(expected_all, all_predict)
def test_bgl_unfair(A_two_dim):
    """Fit a square-loss GroupLossMoment grid search and check predictions.

    Checks the number of grid results, the output of ``predict`` and the
    output of every result's predictor on two test rows.
    """
    label_a0, label_a1 = 2, 3

    X, Y, A = _simple_regression_data(5, 7, 1, 16, label_a0, label_a1, A_two_dim)

    constraint = GroupLossMoment(SquareLoss(-np.inf, np.inf))
    target = GridSearch(LinearRegression(), constraints=constraint, grid_size=7)
    target.fit(X, Y, sensitive_features=A)
    assert len(target.all_results) == 7

    test_X = pd.DataFrame(
        {
            "actual_feature": [0.2, 0.7],
            "sensitive_features": [label_a0, label_a1],
            "constant_ones_feature": [1, 1],
        }
    )

    best_predict = target.predict(test_X)
    assert np.allclose([-1.91764706, 9.61176471], best_predict)

    all_predict = []
    for result in target.all_results:
        all_predict.append(result.predictor.predict(test_X))

    # One row of expected predictions per grid point.
    expected_all = [
        [3.2, 11.2],
        [-3.47346939, 10.64897959],
        [-2.68, 10.12],
        [-1.91764706, 9.61176471],
        [-1.18461538, 9.12307692],
        [-0.47924528, 8.65283019],
        [0.2, 0.7],
    ]
    assert logging_all_close(expected_all, all_predict)
def test_custom_grid(self, transformX, transformY, transformA):
    """Check that ``GridSearch`` trains one predictor per column of a custom grid.

    A full grid is generated with ``_GridGenerator`` from an
    ``EqualizedOdds`` moment, three of its columns are selected, and
    ``fit`` is run on that reduced grid.
    """
    grid_size = 10
    grid_limit = 2.0
    grid_offset = 0.1

    X, y, A = _quick_data(False)
    moment = EqualizedOdds()
    moment.load_data(X, y, sensitive_features=A)

    # Generate the full grid from the moment's basis.
    generator = _GridGenerator(
        grid_size,
        grid_limit,
        moment.pos_basis,
        moment.neg_basis,
        moment.neg_basis_present,
        False,
        grid_offset,
    )
    full_grid = generator.grid

    # Keep only a few columns of the grid to form the custom grid.
    selected_columns = [7, 3, 4]
    custom_grid = full_grid.iloc[:, selected_columns]

    gs = GridSearch(
        estimator=LogisticRegression(solver="liblinear"),
        constraints=EqualizedOdds(),
        grid=custom_grid,
    )

    # fit must run successfully with the custom grid...
    gs.fit(transformX(X), transformY(y), sensitive_features=transformA(A))

    # ...and train exactly one predictor per grid column.
    assert len(gs.predictors_) == len(custom_grid.columns)
def test_sensitive_feature_non_binary(self, transformX, transformY, transformA, A_two_dim):
    """Check that sensitive features with three distinct values are rejected.

    ``fit`` must raise ``ValueError`` whose first argument equals
    ``_SENSITIVE_FEATURES_NON_BINARY_ERROR_MESSAGE``.
    """
    search = GridSearch(self.estimator, self.disparity_criterion)
    features, labels, sensitive = self._quick_data(A_two_dim)

    # Write the values 0, 1, 2 into the first three rows (both columns when 2-D).
    for row in range(3):
        if A_two_dim:
            sensitive[row][0] = row
            sensitive[row][1] = row
        else:
            sensitive[row] = row

    with pytest.raises(ValueError) as exc_info:
        search.fit(
            transformX(features),
            transformY(labels),
            sensitive_features=transformA(sensitive),
        )

    assert exc_info.value.args[0] == _SENSITIVE_FEATURES_NON_BINARY_ERROR_MESSAGE
def test_grid_size_warning_up_to_5_sensitive_feature_group(
    self, transformX, transformY, transformA, A_two_dim, n_groups, caplog
):
    """Check when the grid size warning is logged for ``n_groups`` groups.

    The dimension warning is never expected here. The grid size warning is
    expected exactly when ``2 ** (n_groups - 1)`` exceeds the grid size;
    otherwise no record may be logged. Skipped for ``EqualizedOdds``.
    """
    if isinstance(self.disparity_criterion, EqualizedOdds):
        pytest.skip(
            "With EqualizedOdds there would be multiple warnings due to higher "
            "grid dimensionality."
        )

    grid_size = 10
    search = GridSearch(
        self.estimator,
        self.disparity_criterion,
        grid_size=grid_size,
        sample_weight_name=self.sample_weight_name,
    )
    features, labels, sensitive = _quick_data(A_two_dim, n_groups=n_groups)

    caplog.set_level(logging.WARNING)
    search.fit(
        transformX(features),
        transformY(labels),
        sensitive_features=transformA(sensitive),
    )
    log_records = caplog.get_records("call")

    # One of the n_groups groups is not part of the basis, leaving
    # n_groups - 1 grid dimensions.
    grid_dimensions = n_groups - 1

    if 2**grid_dimensions <= grid_size:
        # Small enough: no warning at all.
        assert len(log_records) == 0
        return

    assert len(log_records) == 1
    size_log_record = log_records[0]
    expected_warning = GRID_SIZE_WARN_TEMPLATE.format(grid_size, 2**grid_dimensions)
    assert expected_warning in size_log_record.msg.format(*size_log_record.args)
def test_valid_inputs(self, transformX, transformY, transformA, A_two_dim):
    """Check that ``fit`` on valid inputs yields one result per grid point.

    With ``grid_size=2`` the fitted object must expose two entries in
    ``all_results``.
    """
    expected_results = 2
    search = GridSearch(
        self.estimator, self.disparity_criterion, grid_size=expected_results
    )
    features, labels, sensitive = self._quick_data(A_two_dim)

    search.fit(
        transformX(features),
        transformY(labels),
        sensitive_features=transformA(sensitive),
    )

    assert len(search.all_results) == expected_results
# metric will not be obvious. sweep = GridSearch(LogisticRegression(solver='liblinear', fit_intercept=True), constraints=DemographicParity(), grid_size=71) # %% # Our algorithms provide :code:`fit()` and :code:`predict()` methods, so they behave in a similar manner # to other ML packages in Python. # We do however have to specify two extra arguments to :code:`fit()` - the column of sensitive # feature labels, and also the number of predictors to generate in our sweep. # # After :code:`fit()` completes, we extract the full set of predictors from the # :class:`fairlearn.reductions.GridSearch` object. sweep.fit(X_train, Y_train, sensitive_features=A_train) predictors = sweep.predictors_ # %% # We could load these predictors into the Fairness dashboard now. # However, the plot would be somewhat confusing due to their number. # In this case, we are going to remove the predictors which are dominated in the # error-disparity space by others from the sweep (note that the disparity will only be # calculated for the sensitive feature; other potentially sensitive features will # not be mitigated). # In general, one might not want to do this, since there may be other considerations # beyond the strict optimization of error and disparity (of the given sensitive feature). errors, disparities = [], [] for m in predictors:
import sys
from train_models import train_fairlearn_model
import json
# EqualizedOdds is used as the GridSearch constraint below; it was missing
# from this import, which made the script fail with a NameError.
from fairlearn.reductions import EqualizedOdds, GridSearch, GroupLossMoment, ZeroOneLoss
from sklearn.linear_model import LogisticRegression

# Positional command-line arguments.
dataset = sys.argv[1]
# Base name of the dataset path: last path component, text before the first dot.
dataset_name = dataset.split('/')[-1].split('.')[0]
attributes = sys.argv[2]
seed = int(sys.argv[3])
rdir = sys.argv[4]

# set up gridsearch model
model = LogisticRegression()
sweep = GridSearch(model,
                   constraints=EqualizedOdds(),
                   grid_size=100,
                   grid_limit=2)

# NOTE(review): df_train_balanced, Y_train_balanced and A_train_balanced are
# not defined anywhere in the visible part of this script - confirm where the
# balanced training data is supposed to come from before running.
sweep.fit(df_train_balanced, Y_train_balanced, sensitive_features=A_train_balanced)

# train
# perf, hv = train_gerryfair_model(fair_model, 'gerryfair', dataset,
#                                  attributes, seed, rdir)