def test_custom_grid(self, transformX, transformY, transformA):
    # Create a standard grid with the default parameters
    grid_size = 10
    grid_limit = 2.0
    grid_offset = 0.1

    disparity_moment = EqualizedOdds()
    X, y, A = _quick_data(False)
    disparity_moment.load_data(X, y, sensitive_features=A)
    grid = _GridGenerator(grid_size, grid_limit,
                          disparity_moment.pos_basis,
                          disparity_moment.neg_basis,
                          disparity_moment.neg_basis_present,
                          False, grid_offset).grid

    # Create a custom grid by selecting only a few columns from the grid
    indices = [7, 3, 4]
    grid = grid.iloc[:, indices]

    gs = GridSearch(
        estimator=LogisticRegression(solver='liblinear'),
        constraints=EqualizedOdds(),
        grid=grid,
    )

    # Check that fit runs successfully with the custom grid
    gs.fit(transformX(X), transformY(y), sensitive_features=transformA(A))

    # Check that it trained the correct number of predictors
    assert len(gs.predictors_) == len(grid.columns)

def test_signed_weights():
    eqo = EqualizedOdds()
    assert eqo.short_name == "EqualizedOdds"

    num_samples_a0 = 10
    num_samples_a1 = 30
    num_samples = num_samples_a0 + num_samples_a1

    a0_threshold = 0.2
    a1_threshold = 0.7

    a0_label = 0xDEAD
    a1_label = 0xBEEF

    X, Y, A = simple_binary_threshold_data(num_samples_a0, num_samples_a1,
                                           a0_threshold, a1_threshold,
                                           a0_label, a1_label)

    # Load up the (rigged) data
    eqo.load_data(X, Y, sensitive_features=A)

    events = ['label=False', 'label=True']
    signs = ["+", "-"]
    labels = [a0_label, a1_label]
    midx = pd.MultiIndex.from_product([signs, events, labels],
                                      names=[_SIGN, _EVENT, _GROUP_ID])

    lambda_vec = pd.Series([2000, 1000, 4000, 5000, 500, 100, 700, 900],
                           index=midx, name=0)
    lambda_a0_F = 2000 - 500
    lambda_a0_T = 4000 - 700
    num_a0_F = int(a0_threshold * num_samples_a0)
    num_a0_T = num_samples_a0 - num_a0_F
    lambda_a1_F = 1000 - 100
    lambda_a1_T = 5000 - 900
    num_a1_F = int(a1_threshold * num_samples_a1)
    num_a1_T = num_samples_a1 - num_a1_F

    sw_a0_F = (lambda_a0_F + lambda_a1_F) / (1 - sum(Y) / len(Y)) - \
        lambda_a0_F * (num_samples / num_a0_F)
    sw_a1_F = (lambda_a0_F + lambda_a1_F) / (1 - sum(Y) / len(Y)) - \
        lambda_a1_F * (num_samples / num_a1_F)
    sw_a0_T = (lambda_a0_T + lambda_a1_T) / (sum(Y) / len(Y)) - \
        lambda_a0_T * (num_samples / num_a0_T)
    sw_a1_T = (lambda_a0_T + lambda_a1_T) / (sum(Y) / len(Y)) - \
        lambda_a1_T * (num_samples / num_a1_T)

    w_a0_F = np.full(num_a0_F, sw_a0_F)
    w_a0_T = np.full(num_a0_T, sw_a0_T)
    w_a1_F = np.full(num_a1_F, sw_a1_F)
    w_a1_T = np.full(num_a1_T, sw_a1_T)
    expected = np.concatenate((w_a0_F, w_a0_T, w_a1_F, w_a1_T), axis=None)

    signed_weights = eqo.signed_weights(lambda_vec)

    # Be bold and test for equality
    assert np.array_equal(expected, signed_weights)

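# For reference: the expected values above encode the signed-weight formula
# (my reading of fairlearn's signed_weights for event-conditioned moments,
# hedged rather than a documented guarantee). With
# lambda_a(e) = lambda_a^+(e) - lambda_a^-(e), a sample in event e
# (label=True/False) and group a receives the weight
#
#     w = sum_g lambda_g(e) / P(event=e)  -  lambda_a(e) / P(event=e, group=a)
#
# P(event=e) is sum(Y)/len(Y) for label=True and its complement for
# label=False, and P(event=e, group=a) = num_a_e / num_samples, which is why
# the second term appears above as lambda_a_e * (num_samples / num_a_e).
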
def test_equalized_odds():
    # Have to do this one longhand, since it combines tpr and fpr
    X, y = loan_scenario_generator(n, f, sfs, ibs, seed=632753)
    X_dummy = pd.get_dummies(X)

    metrics = {"tpr": true_positive_rate, "fpr": false_positive_rate}

    unmitigated = LogisticRegression()
    unmitigated.fit(X_dummy, y)
    y_pred = unmitigated.predict(X_dummy)
    mf_unmitigated = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    expgrad_basic = ExponentiatedGradient(
        LogisticRegression(),
        constraints=EqualizedOdds(difference_bound=0.01),
        eps=0.01)
    expgrad_basic.fit(X_dummy, y, sensitive_features=X["sens"])
    y_pred_basic = expgrad_basic.predict(X_dummy, random_state=9235)
    mf_basic = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred_basic,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    expgrad_control = ExponentiatedGradient(
        LogisticRegression(),
        constraints=EqualizedOdds(difference_bound=0.01),
        eps=0.01)
    expgrad_control.fit(X_dummy, y,
                        sensitive_features=X["sens"],
                        control_features=X["ctrl"])
    y_pred_control = expgrad_control.predict(X_dummy, random_state=8152)
    mf_control = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred_control,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    compare_unmitigated = mf_control.difference(method="to_overall") <= \
        mf_unmitigated.difference(method="to_overall")
    print(compare_unmitigated)

    compare_basic = mf_control.difference(method="to_overall") <= \
        mf_basic.difference(method="to_overall")
    print(compare_basic)

    assert compare_basic.values.reshape(6).all()
    assert compare_unmitigated.values.reshape(6).all()

def test_error_rate_consistency(self, eps, ratio, pos_copies):
    learner = LeastSquaresBinaryClassifierLearner()
    if ratio is None:
        constraints_moment = EqualizedOdds(difference_bound=eps)
    else:
        constraints_moment = EqualizedOdds(ratio_bound=ratio,
                                           ratio_bound_slack=eps)

    results = {}
    for method in ["costs", "sampling"]:
        X, y, A = _get_data()

        if method == "sampling":
            select = y == 1
            X = pd.concat((X,) + (X.loc[select, :],) * pos_copies).values
            y = pd.concat((y,) + (y[select],) * pos_copies).values
            A = pd.concat((A,) + (A[select],) * pos_copies).values
            objective_moment = ErrorRate()
        else:
            objective_moment = ErrorRate(costs={"fn": 1.0 + pos_copies,
                                                "fp": 1.0})

        expgrad = ExponentiatedGradient(
            learner,
            constraints=deepcopy(constraints_moment),
            objective=deepcopy(objective_moment),
            eps=eps,
            nu=1e-3,
        )
        expgrad.fit(X, y, sensitive_features=A)

        # select probability of predicting 1
        def Q(X):
            return expgrad._pmf_predict(X)[:, 1]

        constraints_eval = deepcopy(constraints_moment)
        constraints_eval.load_data(X, y, sensitive_features=A)
        disparity = constraints_eval.gamma(Q).max()

        objective_eval = deepcopy(objective_moment)
        objective_eval.load_data(X, y, sensitive_features=A)
        total_error = objective_eval.gamma(Q)[0] * len(y)
        results[method] = {
            "error": objective_eval.gamma(Q)[0],
            "total_error": total_error,
            "disp": disparity,
            "n_predictors": len(expgrad.predictors_),
            "best_gap": expgrad.best_gap_,
            "last_iter": expgrad.last_iter_,
            "best_iter": expgrad.best_iter_,
            "n_oracle_calls": expgrad.n_oracle_calls_,
            "n_oracle_calls_dummy_returned": expgrad.n_oracle_calls_dummy_returned_,
        }

    self._assert_expgrad_two_states(results["costs"], results["sampling"])

def run_estimation(fairness_constraints, proxy=False, lnl=False):
    print(
        f"Start running experiment with Proxy: {proxy}, "
        f"Learning with Noisy Labels: {lnl}."
    )
    all_results_train, all_results_test = [], []
    for eps in fairness_constraints:
        begin = time.time()
        if proxy and lnl:
            clf = ExponentiatedGradient(
                LogisticRegression(solver='liblinear', fit_intercept=True),
                constraints=ProxyEqualizedOdds2(delta=delta),
                eps=eps)
            sweep = LearningWithNoisyLabels(clf=clf)
        elif proxy:
            sweep = ExponentiatedGradient(
                LogisticRegression(solver='liblinear', fit_intercept=True),
                constraints=ProxyEqualizedOdds2(delta=delta),
                eps=eps)
        elif lnl:
            clf = ExponentiatedGradient(
                LogisticRegression(solver='liblinear', fit_intercept=True),
                constraints=EqualizedOdds(),
                eps=eps)
            sweep = LearningWithNoisyLabels(clf=clf)
        else:
            sweep = ExponentiatedGradient(
                LogisticRegression(solver='liblinear', fit_intercept=True),
                constraints=EqualizedOdds(),
                eps=eps)

        sweep.fit(X_train, Y_noised, sensitive_features=A_train)
        prediction_train = sweep.predict(X_train)
        prediction_test = sweep.predict(X_test)
        accuracy_train = accuracy(prediction_train, Y_train)
        accuracy_test = accuracy(prediction_test, Y_test)
        all_results_train.append(accuracy_train)
        all_results_test.append(accuracy_test)
        print(
            f"Running fairness constraint: {eps}, "
            f"Training Accuracy: {accuracy_train}, "
            f"Test Accuracy: {accuracy_test}, "
            f"Training Violation: {violation(prediction_train, Y_train, A_train)}, "
            f"Test Violation: {violation(prediction_test, Y_test, A_test)}, "
            f"Time cost: {time.time() - begin}"
        )
    return all_results_train, all_results_test

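# The experiment scripts above and below rely on accuracy() and violation()
# helpers that are not shown in this section. A minimal sketch of plausible
# implementations, assuming binary 0/1 predictions and an equalized-odds style
# violation (the largest gap between a group's positive rate and the overall
# positive rate within each true-label slice); the grp= keyword restricting
# the measurement to a single group is likewise an assumption:
import numpy as np


def accuracy(pred, y):
    # Fraction of predictions that match the labels
    return np.mean(np.asarray(pred) == np.asarray(y))


def violation(pred, y, a, grp=None):
    # Max over true labels y0 of |P(h=1 | A=g, Y=y0) - P(h=1 | Y=y0)|,
    # optionally restricted to a single group g = grp
    pred, y, a = np.asarray(pred), np.asarray(y), np.asarray(a)
    gaps = []
    for y0 in np.unique(y):
        mask = y == y0
        overall = pred[mask].mean()
        groups = [grp] if grp is not None else np.unique(a)
        for g in groups:
            gmask = mask & (a == g)
            if gmask.any():
                gaps.append(abs(pred[gmask].mean() - overall))
    return max(gaps)
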
def test_grid_generator_equalized_odds_basic(grid_limit):
    # Equalized odds has four rows with potential non-zero values in the grid.
    # grid_size = 5 ensures that each of the groups has its own column.
    grid_size = 5

    disparity_moment = EqualizedOdds()
    label0 = 'label=0'
    label1 = 'label=1'
    events = [label0, label1]

    grid = calculate_grid(grid_limit, grid_size, disparity_moment)

    expected_index = pd.MultiIndex.from_product(
        [['+', '-'], events, [0, 1]],
        names=[_SIGN, _EVENT, _GROUP_ID])
    assert (expected_index == grid.index).all()

    expected_grid = pd.DataFrame()
    for i in range(grid_size):
        expected_grid[i] = pd.Series(0.0, index=expected_index)
    expected_grid[0]['-', label0, 1] = grid_limit
    expected_grid[1]['-', label1, 1] = grid_limit
    expected_grid[3]['+', label1, 1] = grid_limit
    expected_grid[4]['+', label0, 1] = grid_limit

    assert np.isclose(expected_grid.values, grid.values).all()

def test_grid_generator_equalized_odds(grid_limit, grid_size):
    # Equalized odds has four rows with potential non-zero values in the grid.
    # With grid_size = 13 we get exactly one column with the grid_limit value
    # per row, one column with half the grid_limit value per row, and
    # combinations of rows.
    disparity_moment = EqualizedOdds()
    label0 = 'label=0'
    label1 = 'label=1'
    events = [label0, label1]

    grid = calculate_grid(grid_limit, grid_size, disparity_moment)

    expected_index = pd.MultiIndex.from_product(
        [['+', '-'], events, [0, 1]],
        names=[_SIGN, _EVENT, _GROUP_ID])
    assert (expected_index == grid.index).all()

    expected_grid = pd.DataFrame()
    for i in range(grid_size):
        expected_grid[i] = pd.Series(0.0, index=expected_index)

    gl = grid_limit  # abbreviation for readability
    expected_grid.loc['+', label0, 1] = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, gl / 2, gl / 2, gl / 2, gl]
    expected_grid.loc['+', label1, 1] = [
        0, 0, 0, gl / 2, 0, 0, 0, gl / 2, gl, 0, 0, gl / 2, 0]
    expected_grid.loc['-', label0, 1] = [
        gl, gl / 2, gl / 2, gl / 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    expected_grid.loc['-', label1, 1] = [
        0, gl / 2, 0, 0, gl, gl / 2, 0, 0, 0, gl / 2, 0, 0, 0]

    assert np.isclose(expected_grid.values, grid.values).all()

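# calculate_grid is a test helper not shown in this section. A minimal sketch
# of what the two grid-generator tests above assume, built from the
# _GridGenerator call in test_custom_grid; using a grid_offset of 0 here is an
# assumption, not a documented default:
def calculate_grid(grid_limit, grid_size, disparity_moment):
    X, y, A = _quick_data(False)
    disparity_moment.load_data(X, y, sensitive_features=A)
    return _GridGenerator(grid_size, grid_limit,
                          disparity_moment.pos_basis,
                          disparity_moment.neg_basis,
                          disparity_moment.neg_basis_present,
                          False, 0).grid
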
def run_corrupt(fairness_constraints):
    all_results = {}
    all_results['eps'] = fairness_constraints
    all_results['accuracy'] = {'train': [], 'test': []}
    all_results['violation'] = {'train': [], 'test': []}
    all_results['violation_male'] = {'train': [], 'test': []}
    all_results['violation_female'] = {'train': [], 'test': []}

    for eps in fairness_constraints:
        begin = time.time()
        print("[INFO][RUN] Corrupt")
        sweep = ExponentiatedGradient(
            LogisticRegression(solver='liblinear', fit_intercept=True),
            constraints=EqualizedOdds(),
            eps=eps)
        try:
            sweep.fit(X_train, Y_noised, sensitive_features=A_train)
            prediction_train = sweep.predict(X_train)
            prediction_test = sweep.predict(X_test)
        except Exception:
            # Skip this constraint entirely; falling through here would reuse
            # stale predictions from the previous iteration (or raise a
            # NameError on the first one)
            print(f"Fairlearn can't fit at fairness constraint {eps}")
            continue

        all_results['accuracy']['train'].append(accuracy(prediction_train, Y_train))
        all_results['accuracy']['test'].append(accuracy(prediction_test, Y_test))
        all_results['violation']['train'].append(violation(prediction_train, Y_train, A_train))
        all_results['violation']['test'].append(violation(prediction_test, Y_test, A_test))
        all_results['violation_male']['train'].append(violation(prediction_train, Y_train, A_train, grp=1))
        all_results['violation_male']['test'].append(violation(prediction_test, Y_test, A_test, grp=1))
        all_results['violation_female']['train'].append(violation(prediction_train, Y_train, A_train, grp=0))
        all_results['violation_female']['test'].append(violation(prediction_test, Y_test, A_test, grp=0))

        print(f"Running fairness constraint: {eps}, "
              f"Training Accuracy: {all_results['accuracy']['train'][-1]}, "
              f"Test Accuracy: {all_results['accuracy']['test'][-1]}, "
              f"Training Violation: {all_results['violation']['train'][-1]}, "
              f"Test Violation: {all_results['violation']['test'][-1]}, "
              f"Time cost: {time.time() - begin}")

    acc = np.array(all_results['accuracy']['test'])
    v = np.array(all_results['violation']['test'])
    all_results['accuracy']['mean'] = acc.mean()
    all_results['accuracy']['std'] = acc.std()
    all_results['violation']['mean'] = v.mean()
    all_results['violation']['std'] = v.std()
    return all_results

def run(fairness_constraints, use_proxy=False):
    print(f"Start running experiment with Proxy: {use_proxy}.")
    all_results = {}
    all_results['eps'] = fairness_constraints
    all_results['accuracy'] = {'train': [], 'test': []}
    all_results['violation'] = {'train': [], 'test': []}
    all_results['violation_male'] = {'train': [], 'test': []}
    all_results['violation_female'] = {'train': [], 'test': []}

    for eps in fairness_constraints:
        begin = time.time()
        if use_proxy:
            sweep = ExponentiatedGradient(
                LogisticRegression(solver='liblinear', fit_intercept=True),
                constraints=ProxyEqualizedOdds(error_rate=error_rate),
                eps=eps)
        else:
            sweep = ExponentiatedGradient(
                LogisticRegression(solver='liblinear', fit_intercept=True),
                constraints=EqualizedOdds(),
                eps=eps)
        try:
            sweep.fit(X_train, Y_noised, sensitive_features=A_train)
            prediction_train = sweep.predict(X_train)
            prediction_test = sweep.predict(X_test)
        except Exception:
            # Skip this constraint rather than fall through with stale (or
            # undefined) predictions
            print(f"Fairlearn can't fit at fairness constraint {eps}")
            continue

        all_results['accuracy']['train'].append(
            accuracy(prediction_train, Y_train))
        all_results['accuracy']['test'].append(
            accuracy(prediction_test, Y_test))
        all_results['violation']['train'].append(
            violation(prediction_train, Y_train, A_train))
        all_results['violation']['test'].append(
            violation(prediction_test, Y_test, A_test))
        all_results['violation_male']['train'].append(
            violation(prediction_train, Y_train, A_train, grp=1))
        all_results['violation_male']['test'].append(
            violation(prediction_test, Y_test, A_test, grp=1))
        all_results['violation_female']['train'].append(
            violation(prediction_train, Y_train, A_train, grp=0))
        all_results['violation_female']['test'].append(
            violation(prediction_test, Y_test, A_test, grp=0))

        # Report only the current iteration's numbers, not the whole history
        print(f"Running fairness constraint: {eps}, "
              f"Training Accuracy: {all_results['accuracy']['train'][-1]}, "
              f"Test Accuracy: {all_results['accuracy']['test'][-1]}, "
              f"Training Violation: {all_results['violation']['train'][-1]}, "
              f"Test Violation: {all_results['violation']['test'][-1]}, "
              f"Time cost: {time.time() - begin}")
    return all_results

def test_project_lambda_smoke_negatives():
    eqo = EqualizedOdds()

    events = ['label=False', 'label=True']
    signs = ['+', '-']
    labels = ['a', 'b']
    midx = pd.MultiIndex.from_product([signs, events, labels],
                                      names=[_SIGN, _EVENT, _GROUP_ID])

    # Note that the '-' entries are larger
    df = pd.Series([1, 2, 11, 19, 1001, 1110, 1230, 1350], index=midx)

    ls = eqo.project_lambda(df)

    expected = pd.Series([0, 0, 0, 0, 1000, 1108, 1219, 1331], index=midx)
    assert expected.equals(ls)

def test_project_lambda_smoke_positives():
    # This is a repeat of the _negatives test but with the '+' entries larger
    eqo = EqualizedOdds()

    events = ['label=False', 'label=True']
    signs = ['+', '-']
    labels = ['a', 'b']
    midx = pd.MultiIndex.from_product([signs, events, labels],
                                      names=[_SIGN, _EVENT, _GROUP_ID])

    # Note that the '-' entries are now smaller
    df = pd.Series([200, 300, 100, 600, 4, 5, 6, 7], index=midx)

    ls = eqo.project_lambda(df)

    expected = pd.Series([196, 295, 94, 593, 0, 0, 0, 0], index=midx)
    assert expected.equals(ls)

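# Both smoke tests above are consistent with project_lambda acting as a
# componentwise projection onto the set where, for each (event, group) pair,
# at most one of the paired '+'/'-' entries is non-zero:
#
#     lambda_plus  -> max(lambda_plus - lambda_minus, 0)
#     lambda_minus -> max(lambda_minus - lambda_plus, 0)
#
# For example, in the _negatives test the ('-', 'label=False', 'a') entry
# becomes 1001 - 1 = 1000 while its '+' partner is clipped to 0.
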
def run_clean(fairness_constraints):
    print("Start running experiment with clean data.")
    unmitigated_predictor = LogisticRegression(solver='liblinear',
                                               fit_intercept=True)
    unmitigated_predictor.fit(X_train, Y_train)

    sweep = GridSearch(
        LogisticRegression(solver='liblinear', fit_intercept=True),
        constraints=EqualizedOdds(),
        grid_size=71)
    sweep.fit(X_train, Y_train, sensitive_features=A_train)

    predictors = [unmitigated_predictor] + \
        [z.predictor for z in sweep.all_results]

    all_results_train, all_results_test = [], []
    for predictor in predictors:
        prediction_train = predictor.predict(X_train)
        prediction_test = predictor.predict(X_test)
        all_results_train.append({
            'accuracy': accuracy(prediction_train, Y_train),
            'violation': violation(prediction_train, Y_train, A_train)
        })
        all_results_test.append({
            'accuracy': accuracy(prediction_test, Y_test),
            'violation': violation(prediction_test, Y_test, A_test)
        })

    # For each constraint level, take the best accuracy among predictors
    # whose violation stays within that constraint
    best_train, best_test = [], []
    for constraint in fairness_constraints:
        best = 0.0
        for result in all_results_train:
            if result['violation'] <= constraint and result['accuracy'] > best:
                best = result['accuracy']
        best_train.append(best)

        best = 0.0
        for result in all_results_test:
            if result['violation'] <= constraint and result['accuracy'] > best:
                best = result['accuracy']
        best_test.append(best)
    return best_train, best_test

def fit(train: DataTuple, args):
    """Fit a model."""
    try:
        from fairlearn.reductions import (
            ConditionalSelectionRate,
            DemographicParity,
            EqualizedOdds,
            ExponentiatedGradient,
        )
    except ImportError as e:
        raise RuntimeError(
            "In order to use Agarwal, install fairlearn==0.4.6.") from e

    fairness_class: ConditionalSelectionRate
    if args.fairness == "DP":
        fairness_class = DemographicParity()
    else:
        fairness_class = EqualizedOdds()

    if args.classifier == "SVM":
        model = select_svm(C=args.C, kernel=args.kernel, seed=args.seed)
    else:
        random_state = np.random.RandomState(seed=args.seed)
        model = LogisticRegression(solver="liblinear",
                                   random_state=random_state,
                                   max_iter=5000,
                                   C=args.C)

    data_x = train.x
    data_y = train.y[train.y.columns[0]]
    data_a = train.s[train.s.columns[0]]

    exponentiated_gradient = ExponentiatedGradient(
        model, constraints=fairness_class, eps=args.eps, T=args.iters)
    exponentiated_gradient.fit(data_x, data_y, sensitive_features=data_a)

    min_class_label = train.y[train.y.columns[0]].min()
    exponentiated_gradient.min_class_label = min_class_label

    return exponentiated_gradient

def lagrangian(constraint, model, constraint_weight, grid_size,
               X_train, Y_train, A_train, X_test):
    """Run the Lagrangian (grid search) reduction, using the chosen base
    classifier as the black-box estimator for training and prediction."""
    start_time = datetime.now()
    if constraint == 'DP':
        clf = GridSearch(models[model],
                         constraints=DemographicParity(),
                         constraint_weight=constraint_weight,
                         grid_size=grid_size)
    elif constraint == 'EO':
        clf = GridSearch(models[model],
                         constraints=EqualizedOdds(),
                         constraint_weight=constraint_weight,
                         grid_size=grid_size)
    clf.fit(X_train, Y_train, sensitive_features=A_train)
    Y_pred = clf.predict(X_test)
    end_time = datetime.now()
    return Y_pred, time_diff_in_microseconds(end_time - start_time)

def test_random_state_exponentiated_gradient():
    """Test that the random_state argument works as expected.

    This test case reproduces the problem reported in issue 588 if the
    random_state does not work as intended within Exponentiated Gradient.
    https://github.com/fairlearn/fairlearn/issues/588
    """
    X_train, X_test, y_train, y_test, race_train, race_test = _get_test_data()

    # Train a simple logistic regression model
    lr = LogisticRegression(max_iter=1000, random_state=0)
    lr.fit(X_train, y_train)

    # Train the exponentiated gradient reduction
    expgrad = ExponentiatedGradient(estimator=lr, constraints=EqualizedOdds())
    expgrad.fit(X_train, y_train, sensitive_features=race_train)

    # Identical random_state values must give identical predictions,
    # while a different random_state should give different ones
    y_pred_test = expgrad.predict(X_test, random_state=0)
    for _ in range(100):
        assert (y_pred_test == expgrad.predict(X_test, random_state=0)).all()
    assert (y_pred_test != expgrad.predict(X_test, random_state=1)).any()

def train_and_predict(train: DataTuple, test: TestTuple, args: AgarwalArgs):
    """Train a model (logistic regression or SVM) and compute predictions
    on the given test data."""
    random.seed(888)
    np.random.seed(888)

    fairness_class: ConditionalSelectionRate
    if args.fairness == "DP":
        fairness_class = DemographicParity()
    else:
        fairness_class = EqualizedOdds()

    if args.classifier == "SVM":
        model = select_svm(args.C, args.kernel)
    else:
        model = LogisticRegression(solver="liblinear", random_state=888,
                                   max_iter=5000, C=args.C)

    data_x = train.x
    data_y = train.y[train.y.columns[0]]
    data_a = train.s[train.s.columns[0]]

    exponentiated_gradient = ExponentiatedGradient(
        model, constraints=fairness_class, eps=args.eps, T=args.iters)
    exponentiated_gradient.fit(data_x, data_y, sensitive_features=data_a)

    randomized_predictions = exponentiated_gradient.predict(test.x)
    preds = pd.DataFrame(randomized_predictions, columns=["preds"])

    # Map the smaller predicted class back onto the smallest training label
    min_class_label = train.y[train.y.columns[0]].min()
    if preds["preds"].min() != preds["preds"].max():
        preds = preds.replace(preds["preds"].min(), min_class_label)
    return preds

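# A hypothetical invocation of the two Agarwal wrappers above. The AgarwalArgs
# fields are inferred from the attributes the code reads (fairness, classifier,
# C, kernel, seed, eps, iters), and the existence of `train`/`test` DataTuples
# is assumed; this is illustrative only:
from types import SimpleNamespace

args = SimpleNamespace(fairness="EO", classifier="LR", C=1.0, kernel="",
                       seed=888, eps=0.05, iters=50)
model = fit(train, args)                      # DataTuple with .x, .y, .s
preds = train_and_predict(train, test, args)  # DataFrame with a "preds" column
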
def setup_method(self, method):
    self.estimator = LogisticRegression(solver='liblinear')
    self.disparity_criterion = EqualizedOdds()

def run_peerloss(fairness_constraints, alpha=0.5, est=False):
    print(f"[INFO][RUN] Peer Loss with alpha = {alpha}")
    all_results = {}
    all_results['eps'] = fairness_constraints
    all_results['accuracy'] = {'train': [], 'test': []}
    all_results['violation'] = {'train': [], 'test': []}
    all_results['violation_male'] = {'train': [], 'test': []}
    all_results['violation_female'] = {'train': [], 'test': []}

    if est:
        delta = [1 - est_error_rate[i][0] - est_error_rate[i][1]
                 for i in range(len(est_error_rate))]
    else:
        delta = [1 - error_rate[i][0] - error_rate[i][1]
                 for i in range(len(error_rate))]

    for eps in fairness_constraints:
        begin = time.time()
        sweep = ExponentiatedGradient(
            PeerLoss(A_train, delta=delta, alpha=alpha),
            constraints=EqualizedOdds(),
            eps=eps)
        sweep.fit(X_train, Y_noised, sensitive_features=A_train)
        prediction_train = sweep.predict(X_train)
        prediction_test = sweep.predict(X_test)

        all_results['accuracy']['train'].append(accuracy(prediction_train, Y_train))
        all_results['accuracy']['test'].append(accuracy(prediction_test, Y_test))
        all_results['violation']['train'].append(violation(prediction_train, Y_train, A_train))
        all_results['violation']['test'].append(violation(prediction_test, Y_test, A_test))
        # Record the per-group violations, matching run() and run_corrupt()
        all_results['violation_male']['train'].append(violation(prediction_train, Y_train, A_train, grp=1))
        all_results['violation_male']['test'].append(violation(prediction_test, Y_test, A_test, grp=1))
        all_results['violation_female']['train'].append(violation(prediction_train, Y_train, A_train, grp=0))
        all_results['violation_female']['test'].append(violation(prediction_test, Y_test, A_test, grp=0))

        print(f"Running fairness constraint: {eps}, "
              f"Training Accuracy: {all_results['accuracy']['train'][-1]}, "
              f"Test Accuracy: {all_results['accuracy']['test'][-1]}, "
              f"Training Violation: {all_results['violation']['train'][-1]}, "
              f"Test Violation: {all_results['violation']['test'][-1]}, "
              f"Time cost: {time.time() - begin}")

    acc = np.array(all_results['accuracy']['test'])
    v = np.array(all_results['violation']['test'])
    all_results['accuracy']['mean'] = acc.mean()
    all_results['accuracy']['std'] = acc.std()
    all_results['violation']['mean'] = v.mean()
    all_results['violation']['std'] = v.std()
    return all_results

def setup_method(self, method):
    self.estimator = LogisticRegression(solver="liblinear")
    self.disparity_criterion = EqualizedOdds()
    self.sample_weight_name = "sample_weight"

def test_construct_and_load():
    eqo = EqualizedOdds()
    assert eqo.short_name == "EqualizedOdds"

    # Generate some rigged data
    num_samples_a0 = 10
    num_samples_a1 = 30
    num_samples = num_samples_a0 + num_samples_a1

    a0_threshold = 0.2
    a1_threshold = 0.7

    a0_label = "a0"
    a1_label = "a1"

    X, Y, A = simple_binary_threshold_data(num_samples_a0, num_samples_a1,
                                           a0_threshold, a1_threshold,
                                           a0_label, a1_label)

    # Load up the (rigged) data
    eqo.load_data(X, Y, sensitive_features=A)
    assert eqo.data_loaded
    assert eqo.n == num_samples_a0 + num_samples_a1

    # Examine the tags DF
    assert eqo.tags['label'].equals(pd.Series(Y))
    assert eqo.tags['group_id'].equals(pd.Series(A))
    expected_tags_event = ['label={0}'.format(a) for a in Y]
    assert np.array_equal(expected_tags_event, eqo.tags['event'])

    # Examine the index MultiIndex
    events = ['label=False', 'label=True']
    signs = ['+', '-']
    labels = [a0_label, a1_label]
    expected_index = pd.MultiIndex.from_product(
        [signs, events, labels],
        names=[_SIGN, _EVENT, _GROUP_ID])
    assert eqo.index.equals(expected_index)

    # Examine the prob_event DF
    # There are two events - 'True' and 'False'
    assert len(eqo.prob_event.index) == 2
    assert eqo.prob_event.loc['label=False'] == 1 - sum(Y) / len(Y)
    assert eqo.prob_event.loc['label=True'] == sum(Y) / len(Y)

    # Examine the prob_group_event DF
    # With two events and two groups there are four rows, each recording the
    # fraction of the population in that (event, group) cell
    assert len(eqo.prob_group_event.index) == 4
    # Use the fact that our data are uniformly distributed in the range [0, 1]
    # With the current values, it appears we don't need to fiddle for
    # off-by-one cases
    a0_below = a0_threshold * num_samples_a0
    a0_above = num_samples_a0 - a0_below
    assert eqo.prob_group_event.loc[('label=False', a0_label)] == a0_below / num_samples
    assert eqo.prob_group_event.loc[('label=True', a0_label)] == a0_above / num_samples
    a1_below = a1_threshold * num_samples_a1
    a1_above = num_samples_a1 - a1_below
    assert eqo.prob_group_event.loc[('label=False', a1_label)] == a1_below / num_samples
    assert eqo.prob_group_event.loc[('label=True', a1_label)] == a1_above / num_samples

    # Examine the neg_basis DF
    # This picks out the \lambda_{-} value associated with the first
    # (event, group) pair
    assert len(eqo.neg_basis.index) == 8
    assert eqo.neg_basis[0]['+', 'label=False', a0_label] == 0
    assert eqo.neg_basis[0]['+', 'label=False', a1_label] == 0
    assert eqo.neg_basis[0]['+', 'label=True', a0_label] == 0
    assert eqo.neg_basis[0]['+', 'label=True', a1_label] == 0
    assert eqo.neg_basis[0]['-', 'label=False', a0_label] == 1
    assert eqo.neg_basis[0]['-', 'label=False', a1_label] == 0
    assert eqo.neg_basis[0]['-', 'label=True', a0_label] == 0
    assert eqo.neg_basis[0]['-', 'label=True', a1_label] == 0

    # Examine the pos_basis DF
    # This is looking at the \lambda_{+} values and picking out the
    # one associated with the first label
    assert len(eqo.pos_basis.index) == 8
    assert eqo.pos_basis[0]['+', 'label=False', a0_label] == 1
    assert eqo.pos_basis[0]['+', 'label=False', a1_label] == 0
    assert eqo.pos_basis[0]['+', 'label=True', a0_label] == 0
    assert eqo.pos_basis[0]['+', 'label=True', a1_label] == 0
    assert eqo.pos_basis[0]['-', 'label=False', a0_label] == 0
    assert eqo.pos_basis[0]['-', 'label=False', a1_label] == 0
    assert eqo.pos_basis[0]['-', 'label=True', a0_label] == 0
    assert eqo.pos_basis[0]['-', 'label=True', a1_label] == 0

    # Examine the neg_basis_present DF
    assert len(eqo.neg_basis_present) == 2
    assert eqo.neg_basis_present[0]
    assert eqo.neg_basis_present[1]

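# simple_binary_threshold_data is a test helper not shown in this section. A
# minimal sketch consistent with how test_signed_weights and
# test_construct_and_load use it: features spread evenly over [0, 1] (evenly
# spaced here so the threshold counts come out exact), one threshold per
# group, and Y = True above the group's threshold. Treat the details as
# assumptions rather than the actual fairlearn helper.
import numpy as np


def simple_binary_threshold_data(num_samples_a0, num_samples_a1,
                                 a0_threshold, a1_threshold,
                                 a0_label, a1_label):
    # Scores for each group, evenly spaced over [0, 1]
    a0_scores = np.linspace(0, 1, num_samples_a0)
    a1_scores = np.linspace(0, 1, num_samples_a1)
    X = np.concatenate((a0_scores, a1_scores)).reshape(-1, 1)
    # Labels are determined by each group's threshold
    Y = np.concatenate((a0_scores > a0_threshold,
                        a1_scores > a1_threshold))
    # Sensitive feature identifies the group
    A = np.concatenate((np.full(num_samples_a0, a0_label),
                        np.full(num_samples_a1, a1_label)))
    return X, Y, A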