def test_equalized_odds():
    # Have to do this one longhand, since it combines tpr and fpr
    X, y = loan_scenario_generator(n, f, sfs, ibs, seed=632753)
    X_dummy = pd.get_dummies(X)

    metrics = {"tpr": true_positive_rate, "fpr": false_positive_rate}

    unmitigated = LogisticRegression()
    unmitigated.fit(X_dummy, y)
    y_pred = unmitigated.predict(X_dummy)
    mf_unmitigated = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    expgrad_basic = ExponentiatedGradient(
        LogisticRegression(),
        constraints=EqualizedOdds(difference_bound=0.01),
        eps=0.01)
    expgrad_basic.fit(X_dummy, y, sensitive_features=X["sens"])
    y_pred_basic = expgrad_basic.predict(X_dummy, random_state=9235)
    mf_basic = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred_basic,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    expgrad_control = ExponentiatedGradient(
        LogisticRegression(),
        constraints=EqualizedOdds(difference_bound=0.01),
        eps=0.01)
    expgrad_control.fit(X_dummy,
                        y,
                        sensitive_features=X["sens"],
                        control_features=X["ctrl"])
    y_pred_control = expgrad_control.predict(X_dummy, random_state=8152)
    mf_control = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred_control,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    compare_unmitigated = mf_control.difference(
        method="to_overall") <= mf_unmitigated.difference(method="to_overall")
    print(compare_unmitigated)

    compare_basic = mf_control.difference(
        method="to_overall") <= mf_basic.difference(method="to_overall")
    print(compare_basic)

    assert compare_basic.values.reshape(6).all()
    assert compare_unmitigated.values.reshape(6).all()
def run_comparisons(moment, metric_fn):
    X, y = loan_scenario_generator(n, f, sfs, ibs, seed=163)
    X_dummy = pd.get_dummies(X)

    mf_input = MetricFrame(metric_fn, y, y,
                           sensitive_features=X['sens'],
                           control_features=X['ctrl'])

    print("Metric for input:\n", mf_input.by_group)
    print("Input Metric differences:\n", mf_input.difference(method='to_overall'), "\n")

    unmitigated = LogisticRegression()
    unmitigated.fit(X_dummy, y)
    y_pred = unmitigated.predict(X_dummy)
    mf_unmitigated = MetricFrame(metric_fn,
                                 y, y_pred,
                                 sensitive_features=X['sens'],
                                 control_features=X['ctrl'])
    print("Unmitigated metric:\n", mf_unmitigated.by_group)
    print("Unmitigated metric differences:\n",
          mf_unmitigated.difference(method='to_overall'), "\n")

    expgrad_basic = ExponentiatedGradient(
        LogisticRegression(),
        constraints=moment(),
        eps=0.005)
    expgrad_basic.fit(X_dummy, y, sensitive_features=X['sens'])
    y_pred_basic = expgrad_basic.predict(X_dummy, random_state=8235)
    mf_basic = MetricFrame(metric_fn, y, y_pred_basic,
                           sensitive_features=X['sens'],
                           control_features=X['ctrl'])
    print("Basic expgrad metric:\n", mf_basic.by_group)
    print("Basic expgrad metric differences:\n",
          mf_basic.difference(method='to_overall'), "\n")

    expgrad_control = ExponentiatedGradient(
        LogisticRegression(),
        constraints=moment(),
        eps=0.005)
    expgrad_control.fit(X_dummy, y,
                        sensitive_features=X['sens'],
                        control_features=X['ctrl'])
    y_pred_control = expgrad_control.predict(X_dummy, random_state=852)
    mf_control = MetricFrame(metric_fn, y, y_pred_control,
                             sensitive_features=X['sens'],
                             control_features=X['ctrl'])
    print("expgrad_control metric:\n", mf_control.by_group)
    print("expgrad_control metric differences:\n",
          mf_control.difference(method='to_overall'))

    assert (mf_control.difference(method='to_overall') <=
            mf_unmitigated.difference(method='to_overall')).all()

    assert (mf_control.difference(method='to_overall') <=
            mf_basic.difference(method='to_overall')).all()
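

# Hedged usage sketch (not part of the original module): ``run_comparisons``
# is parameterised over a fairlearn reduction Moment and a matching metric
# function, so a demographic-parity comparison could be driven as below.
# The import locations are assumptions based on the public fairlearn API.
def test_demographic_parity_via_run_comparisons_sketch():
    from fairlearn.metrics import selection_rate
    from fairlearn.reductions import DemographicParity

    run_comparisons(DemographicParity, selection_rate)
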
def test_demographic_parity_difference(agg_method):
    actual = demographic_parity_difference(y_t,
                                           y_p,
                                           sensitive_features=g_1,
                                           method=agg_method)

    gm = MetricFrame(selection_rate, y_t, y_p, sensitive_features=g_1)

    assert actual == gm.difference(method=agg_method)
def test_equalized_odds_difference(agg_method):
    actual = equalized_odds_difference(y_t,
                                       y_p,
                                       sensitive_features=g_1,
                                       method=agg_method)

    metrics = {'tpr': true_positive_rate, 'fpr': false_positive_rate}
    gm = MetricFrame(metrics, y_t, y_p, sensitive_features=g_1)

    diffs = gm.difference(method=agg_method)
    assert actual == diffs.max()
def test_demographic_parity_difference_weighted(agg_method):
    actual = demographic_parity_difference(y_t,
                                           y_p,
                                           sensitive_features=g_1,
                                           sample_weight=s_w,
                                           method=agg_method)

    gm = MetricFrame(selection_rate,
                     y_t,
                     y_p,
                     sensitive_features=g_1,
                     sample_params={'sample_weight': s_w})

    assert actual == gm.difference(method=agg_method)
def test_equalized_odds_difference_weighted(agg_method):
    actual = equalized_odds_difference(y_t,
                                       y_p,
                                       sensitive_features=g_1,
                                       method=agg_method,
                                       sample_weight=s_w)

    metrics = {'tpr': true_positive_rate, 'fpr': false_positive_rate}
    sw = {'sample_weight': s_w}
    sp = {'tpr': sw, 'fpr': sw}
    gm = MetricFrame(metrics,
                     y_t,
                     y_p,
                     sensitive_features=g_1,
                     sample_params=sp)

    diffs = gm.difference(method=agg_method)
    assert actual == diffs.max()
Example #7
# The :class:`fairlearn.metrics.MetricFrame` object also provides
# several means of aggregating metrics across the subgroups, so that disparities
# can be readily quantified.
#
# The simplest of these aggregations is ``group_min()``, which reports the
# minimum value seen for a subgroup for each underlying metric (we also provide
# ``group_max()``). This is
# useful if there is a mandate that "no subgroup should have an ``fbeta_score()``
# of less than 0.6." We can evaluate the minimum values easily:
grouped_on_race.group_min()
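
# %%
# (A hedged follow-on sketch, not part of the original walkthrough.) A mandate
# such as the one above can be checked directly against the ``group_min()``
# output; the 0.6 threshold echoes the hypothetical mandate and is only
# meaningful for metrics on a 0-1 scale. ``group_max()`` gives the
# complementary view of the best-performing subgroup for each metric:
print(grouped_on_race.group_min() >= 0.6)
grouped_on_race.group_max()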

# %%
# As noted above, the selection rates vary greatly by race and by sex.
# This can be quantified in terms of a difference between the subgroup with
# the highest value of the metric, and the subgroup with the lowest value.
# For this, we provide the method ``difference(method='between_groups')``:
grouped_on_race.difference(method='between_groups')
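
# %%
# (Sketch for intuition, not part of the original.) For each metric, the
# ``between_groups`` difference is the gap between the best- and worst-scoring
# subgroups, so it can be reproduced from ``by_group`` directly:
manual_between_groups = grouped_on_race.by_group.max() - grouped_on_race.by_group.min()
print(manual_between_groups)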

# %%
# We can also evaluate the difference relative to the corresponding overall
# value of the metric. In this case we take the absolute value, so that the
# result is always positive:
grouped_on_race.difference(method='to_overall')
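
# %%
# (Sketch for intuition, not part of the original.) The ``to_overall``
# difference is the largest absolute gap between any subgroup value and the
# overall value of the corresponding metric:
manual_to_overall = (grouped_on_race.by_group - grouped_on_race.overall).abs().max()
print(manual_to_overall)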

# %%
# There are situations where knowing the ratios of the metrics evaluated on
# the subgroups is more useful. For this we have the ``ratio()`` method.
# We can take the ratios between the minimum and maximum values of each metric:
grouped_on_race.ratio(method='between_groups')
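
# %%
# (Sketch for intuition, not part of the original.) The ``between_groups``
# ratio is the smallest subgroup value divided by the largest, so it always
# lies between 0 and 1:
manual_ratio = grouped_on_race.by_group.min() / grouped_on_race.by_group.max()
print(manual_ratio)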

# %%
# We can also compute the ratios relative to the overall value for each
# metric:
grouped_on_race.ratio(method='to_overall')
Example #8
def test(args, model, device, test_loader, test_size, sensitive_idx):

    model.eval()
    criterion = nn.BCELoss()
    test_loss = 0
    correct = 0
    i = 0

    avg_recall = 0
    avg_precision = 0
    overall_results = []
    avg_eq_odds = 0
    avg_dem_par = 0
    avg_tpr = 0
    avg_tp = 0
    avg_tn = 0
    avg_fp = 0
    avg_fn = 0
    with torch.no_grad():
        for cats, conts, target in tqdm(test_loader):
            print("*********")
            i += 1
            cats, conts, target = cats.to(device), conts.to(device), target.to(device)


            output = model(cats, conts)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = (output > 0.5).float()
            correct += pred.eq(target.view_as(pred)).sum().item()

            curr_datetime = datetime.now()
            curr_hour = curr_datetime.hour
            curr_min = curr_datetime.minute

            pred_df = pd.DataFrame(pred.numpy())
            pred_df.to_csv(f"pred_results/{args.run_name}_{curr_hour}-{curr_min}.csv")

            # confusion matrix
            tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
            avg_tn+=tn
            avg_fp+=fp
            avg_fn+=fn
            avg_tp+=tp

            # position of col for sensitive values
            sensitive = [i[sensitive_idx].item() for i in cats]
            cat_len = max(sensitive)
            print(cat_len)
            #exit()
            sub_cm = []
            #print(cat_len)
            for j in range(cat_len+1):
                try:
                    idx = list(locate(sensitive, lambda x: x == j))
                    sub_tar = target[idx]
                    sub_pred = pred[idx]
                    sub_tn, sub_fp, sub_fn, sub_tp = confusion_matrix(sub_tar, sub_pred).ravel()
                except:
                    # when only one value to predict
                    temp_tar = int(sub_tar.numpy()[0])
                    temp_pred = int(sub_pred.numpy()[0])
                    #print(tar, pred)
                    if temp_tar and temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 1
                    elif temp_tar and not temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 1, 0
                    elif not temp_tar and not temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 1, 0, 0, 0
                    elif not temp_tar and temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 1, 0, 0
                    else:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 0

                total = mysum(sub_tn, sub_fp, sub_fn, sub_tp)
                sub_cm.append((sub_tn/total, sub_fp/total, sub_fn/total, sub_tp/total))

            # Fairness metrics
            group_metrics = MetricFrame({'precision': skm.precision_score, 'recall': skm.recall_score},
                                        target, pred,
                                        sensitive_features=sensitive)

            demographic_parity = flm.demographic_parity_difference(target, pred,
                                                                   sensitive_features=sensitive)

            eq_odds = flm.equalized_odds_difference(target, pred,
                                                    sensitive_features=sensitive)

            # metric_fns = {'true_positive_rate': true_positive_rate}

            tpr = MetricFrame(true_positive_rate,
                              target, pred,
                              sensitive_features=sensitive)

            # tpr = flm.true_positive_rate(target, pred,sample_weight=sensitive)
            sub_results = group_metrics.overall.to_dict()
            sub_results_by_group = group_metrics.by_group.to_dict()

            #print("\n", group_metrics.by_group, "\n")
            avg_precision += sub_results['precision']
            avg_recall += sub_results['recall']
            overall_results.append(sub_results_by_group)
            avg_eq_odds += eq_odds
            avg_dem_par += demographic_parity
            avg_tpr += tpr.difference(method='between_groups')

    print(i)
    total = mysum(avg_tn, avg_fp, avg_fn, avg_tp)
    cm = (avg_tn/total, avg_fp/total, avg_fn/total, avg_tp/total)
    test_loss /= test_size
    accuracy = correct / test_size
    avg_loss = test_loss


    return accuracy, avg_loss, avg_precision, avg_recall, avg_eq_odds, avg_tpr, avg_dem_par, cm, sub_cm, overall_results
Example #9
# parameters. Consider :func:`sklearn.metrics.fbeta_score`, which
# has a required :code:`beta=` argument (and suppose that this time
# we are most interested in the maximum difference to the overall value).
# First we evaluate this with a :class:`fairlearn.metrics.MetricFrame`:

fbeta_03 = functools.partial(skm.fbeta_score, beta=0.3)
fbeta_03.__name__ = "fbeta_score__beta_0.3"

beta_frame = MetricFrame(
    metrics=fbeta_03,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=A_test["sex"],
    sample_params={"sample_weight": random_weights},
)
beta_from_frame = beta_frame.difference(method="to_overall")

print("From frame:", beta_from_frame)

# %%
# Next, we create a function that evaluates the same quantity. Note that
# we do not need to use :func:`functools.partial` to bind the
# :code:`beta=` argument:

beta_func = make_derived_metric(metric=skm.fbeta_score, transform="difference")

beta_from_func = beta_func(
    y_test,
    y_pred,
    sensitive_features=A_test["sex"],
    beta=0.3,
    sample_weight=random_weights,
    method="to_overall",
)

print("From function:", beta_from_func)
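
# %%
# (Hedged sanity check, not part of the original snippet.) Both routes compute
# the same quantity on the same data, so the two values should agree exactly:
assert beta_from_func == beta_from_frame
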
def test_student(args, student_train_loader, student_labels,
                 student_test_loader, test_size, cat_emb_size, num_conts,
                 device, sensitive_idx):
    student_model = RandomForestClassifier(random_state=42,
                                           warm_start=True,
                                           n_estimators=100)

    print("========== Testing Student Model ==========")
    for epoch in range(args.epochs):
        train_loader = student_loader(student_train_loader, student_labels)
        for (cats, conts), labels in train_loader:
            X = torch.cat((cats, conts), 1)
            student_model = student_model.fit(X, labels)

            test_loss = 0
            correct = 0
            i = 0

            avg_recall = 0
            avg_precision = 0
            overall_results = []
            avg_eq_odds = 0
            avg_dem_par = 0
            avg_tpr = 0
            avg_tp = 0
            avg_tn = 0
            avg_fp = 0
            avg_fn = 0

            with torch.no_grad():
                for batch_idx, (cats, conts,
                                target) in enumerate(student_test_loader):
                    print("target\n", sum(target))
                    i += 1
                    X = torch.cat((cats, conts), 1)
                    output = student_model.predict(X)
                    output = torch.from_numpy(output)
                    pred = (output > 0.5).float()
                    print("pred\n", sum(pred))
                    correct += pred.eq(target.view_as(pred)).sum().item()

                    curr_datetime = datetime.now()
                    curr_hour = curr_datetime.hour
                    curr_min = curr_datetime.minute

                    pred_df = pd.DataFrame(pred.numpy())
                    pred_df.to_csv(
                        f"pred_results/{args.run_name}_{curr_hour}-{curr_min}.csv"
                    )

                    #print(pred, np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy()))
                    #correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
                    #total += cats.size(0)

                    # confusion matrix
                    tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
                    avg_tn += tn
                    avg_fp += fp
                    avg_fn += fn
                    avg_tp += tp

                    # position of col for sensitive values
                    sensitive = [i[sensitive_idx].item() for i in cats]
                    cat_len = max(sensitive)

                    #exit()
                    sub_cm = []
                    # print(cat_len)
                    for j in range(cat_len + 1):
                        try:
                            idx = list(locate(sensitive, lambda x: x == j))
                            sub_tar = target[idx]
                            sub_pred = pred[idx]
                            sub_tn, sub_fp, sub_fn, sub_tp = confusion_matrix(
                                sub_tar, sub_pred).ravel()
                        except:
                            # when only one value to predict
                            temp_tar = int(sub_tar.numpy()[0])
                            temp_pred = int(sub_pred.numpy()[0])
                            # print(tar, pred)
                            if temp_tar and temp_pred:
                                sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 1
                            elif temp_tar and not temp_pred:
                                sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 1, 0
                            elif not temp_tar and not temp_pred:
                                sub_tn, sub_fp, sub_fn, sub_tp = 1, 0, 0, 0
                            elif not temp_tar and temp_pred:
                                sub_tn, sub_fp, sub_fn, sub_tp = 0, 1, 0, 0
                            else:
                                sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 0

                        total = mysum(sub_tn, sub_fp, sub_fn, sub_tp)
                        print("??", total)
                        sub_cm.append((sub_tn / total, sub_fp / total,
                                       sub_fn / total, sub_tp / total))

                    # Fairness metrics

                    group_metrics = MetricFrame(
                        {
                            'precision': skm.precision_score,
                            'recall': skm.recall_score
                        },
                        target,
                        pred,
                        sensitive_features=sensitive)

                    print(target)
                    print(pred)
                    demographic_parity = flm.demographic_parity_difference(
                        target, pred, sensitive_features=sensitive)

                    eq_odds = flm.equalized_odds_difference(
                        target, pred, sensitive_features=sensitive)

                    # metric_fns = {'true_positive_rate': true_positive_rate}

                    tpr = MetricFrame(true_positive_rate,
                                      target,
                                      pred,
                                      sensitive_features=sensitive)

                    # tpr = flm.true_positive_rate(target, pred,sample_weight=sensitive)
                    sub_results = group_metrics.overall.to_dict()
                    sub_results_by_group = group_metrics.by_group.to_dict()

                    # print("\n", group_metrics.by_group, "\n")
                    avg_precision += sub_results['precision']
                    avg_recall += sub_results['recall']
                    print("pre_rec", sub_results)
                    overall_results.append(sub_results_by_group)
                    avg_eq_odds += eq_odds
                    print("eqo", eq_odds)
                    avg_dem_par += demographic_parity
                    print("dempar", demographic_parity)
                    avg_tpr += tpr.difference(method='between_groups')
                    print("tpr", tpr.difference(method='between_groups'))

            total = mysum(avg_tn, avg_fp, avg_fn, avg_tp)
            print("!!", total)
            cm = (avg_tn / total, avg_fp / total, avg_fn / total,
                  avg_tp / total)
            test_loss /= test_size
            accuracy = correct / test_size
            avg_loss = test_loss

            return accuracy, avg_loss, avg_precision, avg_recall, avg_eq_odds, avg_tpr, avg_dem_par, cm, sub_cm, overall_results
def test_student(args, student_train_loader, student_labels, student_test_loader, test_size, cat_emb_size, num_conts, device, sensitive_idx):
    student_model = RegressionModel(emb_szs=cat_emb_size,
                    n_cont=num_conts,
                    emb_drop=0.04,
                    out_sz=1,
                    szs=[1000, 500, 250],
                    drops=[0.001, 0.01, 0.01],
                    y_range=(0, 1)).to(device)

    criterion = nn.BCELoss()
    optimizer = optim.SGD(student_model.parameters(), lr=args.lr, momentum=0)
    steps = 0
    running_loss = 0
    correct = 0
    print("========== Testing Student Model ==========")
    for epoch in range(args.epochs):
        student_model.train()
        train_loader = student_loader(student_train_loader, student_labels)
        for (cats, conts), labels in train_loader:
        #for _batch_idx, (data, target) in enumerate(tqdm(train_loader)):
            #cats = data[0]
            #conts = data[1]
            steps += 1

            optimizer.zero_grad()
            output = student_model(cats, conts).view(-1)
            labels = labels.to(torch.float32)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        #            if steps % 50 == 0:
            student_model.eval()
            test_loss = 0
            correct = 0
            i = 0

            avg_recall = 0
            avg_precision = 0
            overall_results = []
            avg_eq_odds = 0
            avg_dem_par = 0
            avg_tpr = 0
            avg_tp = 0
            avg_tn = 0
            avg_fp = 0
            avg_fn = 0

            with torch.no_grad():
                for batch_idx, (cats, conts, target) in enumerate(student_test_loader):
                    print("target\n", sum(target))
                    i+=1
                    output = student_model(cats, conts)
                    loss += criterion(output, target).item()
                    test_loss = test_loss + ((1 / (batch_idx + 1)) * (loss.data - test_loss))
                    pred = (output > 0.5).float()
                    print("pred\n", sum(pred))
                    correct += pred.eq(target.view_as(pred)).sum().item()

                    curr_datetime = datetime.now()
                    curr_hour = curr_datetime.hour
                    curr_min = curr_datetime.minute

                    pred_df = pd.DataFrame(pred.numpy())
                    pred_df.to_csv(f"pred_results/{args.run_name}_{curr_hour}-{curr_min}.csv")

                    #print(pred, np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy()))
                    #correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
                    #total += cats.size(0)


                    # confusion matrix
                    tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
                    avg_tn += tn
                    avg_fp += fp
                    avg_fn += fn
                    avg_tp += tp

                    # position of col for sensitive values
                    sensitive = [i[sensitive_idx].item() for i in cats]
                    cat_len = max(sensitive)

                    #exit()
                    sub_cm = []
                    # print(cat_len)
                    for j in range(cat_len+1):
                        try:
                            idx = list(locate(sensitive, lambda x: x == j))
                            sub_tar = target[idx]
                            sub_pred = pred[idx]
                            sub_tn, sub_fp, sub_fn, sub_tp = confusion_matrix(sub_tar, sub_pred).ravel()
                        except:
                            # when only one value to predict
                            print("----WHAT?")
                            temp_tar = int(sub_tar.numpy()[0])
                            temp_pred = int(sub_pred.numpy()[0])
                            # print(tar, pred)
                            if temp_tar and temp_pred:
                                sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 1
                            elif temp_tar and not temp_pred:
                                sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 1, 0
                            elif not temp_tar and not temp_pred:
                                sub_tn, sub_fp, sub_fn, sub_tp = 1, 0, 0, 0
                            elif not temp_tar and temp_pred:
                                sub_tn, sub_fp, sub_fn, sub_tp = 0, 1, 0, 0
                            else:
                                sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 0

                        total = mysum(sub_tn, sub_fp, sub_fn, sub_tp)
                        print("??", total)
                        sub_cm.append((sub_tn / total, sub_fp / total, sub_fn / total, sub_tp / total))

                    # Fairness metrics

                    group_metrics = MetricFrame({'precision': skm.precision_score, 'recall': skm.recall_score},
                                                target, pred,
                                                sensitive_features=sensitive)


                    demographic_parity = flm.demographic_parity_difference(target, pred,
                                                                           sensitive_features=sensitive)

                    eq_odds = flm.equalized_odds_difference(target, pred,
                                                            sensitive_features=sensitive)

                    # metric_fns = {'true_positive_rate': true_positive_rate}

                    tpr = MetricFrame(true_positive_rate,
                                      target, pred,
                                      sensitive_features=sensitive)

                    # tpr = flm.true_positive_rate(target, pred,sample_weight=sensitive)
                    sub_results = group_metrics.overall.to_dict()
                    sub_results_by_group = group_metrics.by_group.to_dict()

                    # print("\n", group_metrics.by_group, "\n")
                    avg_precision += sub_results['precision']
                    avg_recall += sub_results['recall']
                    print("pre_rec", sub_results)
                    overall_results.append(sub_results_by_group)
                    avg_eq_odds += eq_odds
                    print("eqo", eq_odds)
                    avg_dem_par += demographic_parity
                    print("dempar", demographic_parity)
                    avg_tpr += tpr.difference(method='between_groups')
                    print("tpr", tpr.difference(method='between_groups'))

            total = mysum(avg_tn, avg_fp, avg_fn, avg_tp)
            print("!!", total)
            cm = (avg_tn / total, avg_fp / total, avg_fn / total, avg_tp / total)
            test_loss /= test_size
            accuracy = correct / test_size
            avg_loss = test_loss

            return accuracy, avg_loss, avg_precision, avg_recall, avg_eq_odds, avg_tpr, avg_dem_par, cm, sub_cm, overall_results
Example #13
            "selection_rate": selection_rate,
            "count": count,
        },
        sensitive_features=A_test,
        y_true=Y_test,
        y_pred=predictions[key],
    )

import matplotlib.pyplot as plt

x = [
    metric_frame.overall["accuracy"]
    for metric_frame in metric_frames.values()
]
y = [
    metric_frame.difference()["selection_rate"]
    for metric_frame in metric_frames.values()
]
keys = list(metric_frames.keys())
plt.scatter(x, y)
for i in range(len(x)):
    plt.annotate(keys[i], (x[i] + 0.0003, y[i]))
plt.xlabel("accuracy")
plt.ylabel("selection rate difference")

# %%
# We see a Pareto front forming - the set of predictors which represent optimal
# tradeoffs between accuracy and disparity in predictions. In the ideal case,
# we would have a predictor at (1,0) - perfectly accurate and without any
# unfairness under demographic parity (with respect to the sensitive feature
# "sex"). The Pareto front represents the closest we can come to this ideal