def test_equalized_odds():
    # Have to do this one longhand, since it combines tpr and fpr
    X, y = loan_scenario_generator(n, f, sfs, ibs, seed=632753)
    X_dummy = pd.get_dummies(X)

    metrics = {"tpr": true_positive_rate, "fpr": false_positive_rate}

    unmitigated = LogisticRegression()
    unmitigated.fit(X_dummy, y)
    y_pred = unmitigated.predict(X_dummy)
    mf_unmitigated = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    expgrad_basic = ExponentiatedGradient(
        LogisticRegression(),
        constraints=EqualizedOdds(difference_bound=0.01),
        eps=0.01,
    )
    expgrad_basic.fit(X_dummy, y, sensitive_features=X["sens"])
    y_pred_basic = expgrad_basic.predict(X_dummy, random_state=9235)
    mf_basic = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred_basic,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    expgrad_control = ExponentiatedGradient(
        LogisticRegression(),
        constraints=EqualizedOdds(difference_bound=0.01),
        eps=0.01,
    )
    expgrad_control.fit(
        X_dummy, y, sensitive_features=X["sens"], control_features=X["ctrl"]
    )
    y_pred_control = expgrad_control.predict(X_dummy, random_state=8152)
    mf_control = MetricFrame(
        metrics=metrics,
        y_true=y,
        y_pred=y_pred_control,
        sensitive_features=X["sens"],
        control_features=X["ctrl"],
    )

    compare_unmitigated = mf_control.difference(
        method="to_overall"
    ) <= mf_unmitigated.difference(method="to_overall")
    print(compare_unmitigated)

    compare_basic = mf_control.difference(
        method="to_overall"
    ) <= mf_basic.difference(method="to_overall")
    print(compare_basic)

    assert compare_basic.values.reshape(6).all()
    assert compare_unmitigated.values.reshape(6).all()
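The test above exercises MetricFrame's control_features argument. As a small, self-contained illustration (invented data, not part of the test suite): metrics are computed per (control, sensitive) cell, and difference(method="to_overall") compares groups to the overall value within each control group rather than to the single global value.

# Illustration only: invented data, not part of the test suite.
import numpy as np
from sklearn.metrics import accuracy_score
from fairlearn.metrics import MetricFrame

y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0])
y_pred = np.array([1, 0, 0, 1, 1, 1, 0, 1])
sens = np.array(["f", "m", "f", "m", "f", "m", "f", "m"])
ctrl = np.array(["x", "x", "x", "x", "y", "y", "y", "y"])

mf = MetricFrame(
    metrics=accuracy_score,
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sens,
    control_features=ctrl,
)
print(mf.by_group)                         # indexed by (ctrl, sens)
print(mf.difference(method="to_overall"))  # one entry per control-feature value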
def run_comparisons(moment, metric_fn):
    X, y = loan_scenario_generator(n, f, sfs, ibs, seed=163)
    X_dummy = pd.get_dummies(X)

    mf_input = MetricFrame(metric_fn, y, y,
                           sensitive_features=X['sens'],
                           control_features=X['ctrl'])
    print("Metric for input:\n", mf_input.by_group)
    print("Input Metric differences:\n",
          mf_input.difference(method='to_overall'), "\n")

    unmitigated = LogisticRegression()
    unmitigated.fit(X_dummy, y)
    y_pred = unmitigated.predict(X_dummy)
    mf_unmitigated = MetricFrame(metric_fn, y, y_pred,
                                 sensitive_features=X['sens'],
                                 control_features=X['ctrl'])
    print("Unmitigated metric:\n", mf_unmitigated.by_group)
    print("Unmitigated metric differences:\n",
          mf_unmitigated.difference(method='to_overall'), "\n")

    expgrad_basic = ExponentiatedGradient(
        LogisticRegression(),
        constraints=moment(),
        eps=0.005)
    expgrad_basic.fit(X_dummy, y, sensitive_features=X['sens'])
    y_pred_basic = expgrad_basic.predict(X_dummy, random_state=8235)
    mf_basic = MetricFrame(metric_fn, y, y_pred_basic,
                           sensitive_features=X['sens'],
                           control_features=X['ctrl'])
    print("Basic expgrad metric:\n", mf_basic.by_group)
    print("Basic expgrad metric differences:\n",
          mf_basic.difference(method='to_overall'), "\n")

    expgrad_control = ExponentiatedGradient(
        LogisticRegression(),
        constraints=moment(),
        eps=0.005)
    expgrad_control.fit(X_dummy, y,
                        sensitive_features=X['sens'],
                        control_features=X['ctrl'])
    y_pred_control = expgrad_control.predict(X_dummy, random_state=852)
    mf_control = MetricFrame(metric_fn, y, y_pred_control,
                             sensitive_features=X['sens'],
                             control_features=X['ctrl'])
    print("expgrad_control metric:\n", mf_control.by_group)
    print("expgrad_control metric differences:\n",
          mf_control.difference(method='to_overall'))

    assert (mf_control.difference(method='to_overall') <=
            mf_unmitigated.difference(method='to_overall')).all()
    assert (mf_control.difference(method='to_overall') <=
            mf_basic.difference(method='to_overall')).all()
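run_comparisons is parameterized by a reductions Moment class and a metric function. A usage sketch, assuming this test module imports DemographicParity, TruePositiveRateParity, selection_rate, and true_positive_rate, might look like:

# Sketch only: assumes the imports noted above are available in this module.
def test_demographic_parity():
    run_comparisons(DemographicParity, selection_rate)


def test_true_positive_rate_parity():
    run_comparisons(TruePositiveRateParity, true_positive_rate)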
def test_demographic_parity_difference(agg_method):
    actual = demographic_parity_difference(y_t, y_p,
                                           sensitive_features=g_1,
                                           method=agg_method)

    gm = MetricFrame(selection_rate, y_t, y_p, sensitive_features=g_1)

    assert actual == gm.difference(method=agg_method)
def test_equalized_odds_difference(agg_method):
    actual = equalized_odds_difference(y_t, y_p,
                                       sensitive_features=g_1,
                                       method=agg_method)

    metrics = {'tpr': true_positive_rate, 'fpr': false_positive_rate}
    gm = MetricFrame(metrics, y_t, y_p, sensitive_features=g_1)
    diffs = gm.difference(method=agg_method)

    assert actual == diffs.max()
def test_demographic_parity_difference_weighted(agg_method):
    actual = demographic_parity_difference(y_t, y_p,
                                           sensitive_features=g_1,
                                           sample_weight=s_w,
                                           method=agg_method)

    gm = MetricFrame(selection_rate, y_t, y_p,
                     sensitive_features=g_1,
                     sample_params={'sample_weight': s_w})

    assert actual == gm.difference(method=agg_method)
def test_equalized_odds_difference_weighted(agg_method):
    actual = equalized_odds_difference(y_t, y_p,
                                       sensitive_features=g_1,
                                       method=agg_method,
                                       sample_weight=s_w)

    metrics = {'tpr': true_positive_rate, 'fpr': false_positive_rate}
    sw = {'sample_weight': s_w}
    sp = {'tpr': sw, 'fpr': sw}
    gm = MetricFrame(metrics, y_t, y_p,
                     sensitive_features=g_1,
                     sample_params=sp)
    diffs = gm.difference(method=agg_method)

    assert actual == diffs.max()
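These tests assert that the convenience functions agree with aggregations of a MetricFrame. A minimal, self-contained sketch of that relationship for equalized_odds_difference, using invented data:

# Illustration only: invented data, mirroring the assertion in the tests above.
import numpy as np
from fairlearn.metrics import (
    MetricFrame,
    equalized_odds_difference,
    false_positive_rate,
    true_positive_rate,
)

y_true = np.array([0, 1, 1, 0, 1, 0, 1, 1])
y_pred = np.array([0, 1, 0, 0, 1, 1, 1, 0])
group = np.array(["a", "a", "a", "a", "b", "b", "b", "b"])

mf = MetricFrame(
    metrics={"tpr": true_positive_rate, "fpr": false_positive_rate},
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=group,
)
# equalized_odds_difference is the larger of the between-group TPR and FPR differences
assert equalized_odds_difference(
    y_true, y_pred, sensitive_features=group
) == mf.difference(method="between_groups").max()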
# several means of aggregating metrics across the subgroups, so that disparities
# can be readily quantified.
#
# The simplest of these aggregations is ``group_min()``, which reports the
# minimum value seen for a subgroup for each underlying metric (we also provide
# ``group_max()``). This is useful if there is a mandate that "no subgroup
# should have an ``fbeta_score()`` of less than 0.6." We can evaluate the
# minimum values easily:
grouped_on_race.group_min()

# %%
# As noted above, the selection rate varies greatly by race and by sex.
# This can be quantified in terms of a difference between the subgroup with
# the highest value of the metric, and the subgroup with the lowest value.
# For this, we provide the method ``difference(method='between_groups')``:
grouped_on_race.difference(method='between_groups')

# %%
# We can also evaluate the difference relative to the corresponding overall
# value of the metric. In this case we take the absolute value, so that the
# result is always positive:
grouped_on_race.difference(method='to_overall')

# %%
# There are situations where knowing the ratios of the metrics evaluated on
# the subgroups is more useful. For this we have the ``ratio()`` method.
# We can take the ratios between the minimum and maximum values of each metric:
grouped_on_race.ratio(method='between_groups')

# %%
# We can also compute the ratios relative to the overall value for each
def test(args, model, device, test_loader, test_size, sensitive_idx):
    model.eval()
    criterion = nn.BCELoss()
    test_loss = 0
    correct = 0
    i = 0
    avg_recall = 0
    avg_precision = 0
    overall_results = []
    avg_eq_odds = 0
    avg_dem_par = 0
    avg_tpr = 0
    avg_tp = 0
    avg_tn = 0
    avg_fp = 0
    avg_fn = 0
    with torch.no_grad():
        for cats, conts, target in tqdm(test_loader):
            print("*********")
            #i += 1
            cats, conts, target = cats.to(device), conts.to(device), target.to(device)
            output = model(cats, conts)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = (output > 0.5).float()
            correct += pred.eq(target.view_as(pred)).sum().item()

            curr_datetime = datetime.now()
            curr_hour = curr_datetime.hour
            curr_min = curr_datetime.minute

            pred_df = pd.DataFrame(pred.numpy())
            pred_df.to_csv(f"pred_results/{args.run_name}_{curr_hour}-{curr_min}.csv")

            # confusion matrix for the whole batch
            tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
            avg_tn += tn
            avg_fp += fp
            avg_fn += fn
            avg_tp += tp

            # position of col for sensitive values
            sensitive = [row[sensitive_idx].item() for row in cats]
            cat_len = max(sensitive)
            print(cat_len)
            #exit()
            sub_cm = []
            #print(cat_len)
            for j in range(cat_len + 1):
                try:
                    idx = list(locate(sensitive, lambda x: x == j))
                    sub_tar = target[idx]
                    sub_pred = pred[idx]
                    sub_tn, sub_fp, sub_fn, sub_tp = confusion_matrix(sub_tar, sub_pred).ravel()
                except ValueError:
                    # when only one value to predict, confusion_matrix cannot be unpacked
                    temp_tar = int(sub_tar.numpy()[0])
                    temp_pred = int(sub_pred.numpy()[0])
                    #print(tar, pred)
                    if temp_tar and temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 1
                    elif temp_tar and not temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 1, 0
                    elif not temp_tar and not temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 1, 0, 0, 0
                    elif not temp_tar and temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 1, 0, 0
                    else:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 0

                total = mysum(sub_tn, sub_fp, sub_fn, sub_tp)
                sub_cm.append((sub_tn/total, sub_fp/total, sub_fn/total, sub_tp/total))

            # Fairness metrics
            group_metrics = MetricFrame({'precision': skm.precision_score,
                                         'recall': skm.recall_score},
                                        target, pred,
                                        sensitive_features=sensitive)

            demographic_parity = flm.demographic_parity_difference(
                target, pred, sensitive_features=sensitive)

            eq_odds = flm.equalized_odds_difference(
                target, pred, sensitive_features=sensitive)

            # metric_fns = {'true_positive_rate': true_positive_rate}

            tpr = MetricFrame(true_positive_rate, target, pred,
                              sensitive_features=sensitive)
            # tpr = flm.true_positive_rate(target, pred,sample_weight=sensitive)

            sub_results = group_metrics.overall.to_dict()
            sub_results_by_group = group_metrics.by_group.to_dict()
            #print("\n", group_metrics.by_group, "\n")

            avg_precision += sub_results['precision']
            avg_recall += sub_results['recall']
            overall_results.append(sub_results_by_group)
            avg_eq_odds += eq_odds
            avg_dem_par += demographic_parity
            avg_tpr += tpr.difference(method='between_groups')
            print(i)

    total = mysum(avg_tn, avg_fp, avg_fn, avg_tp)
    cm = (avg_tn/total, avg_fp/total, avg_fn/total, avg_tp/total)
    test_loss /= test_size
    accuracy = correct / test_size
    avg_loss = test_loss
    return (accuracy, avg_loss, avg_precision, avg_recall, avg_eq_odds,
            avg_tpr, avg_dem_par, cm, sub_cm, overall_results)
# parameters. Consider :func:`sklearn.metrics.fbeta_score`, which
# has a required :code:`beta=` argument (and suppose that this time
# we are most interested in the maximum difference to the overall value).
# First we evaluate this with a :class:`fairlearn.metrics.MetricFrame`:

fbeta_03 = functools.partial(skm.fbeta_score, beta=0.3)
fbeta_03.__name__ = "fbeta_score__beta_0.3"

beta_frame = MetricFrame(
    metrics=fbeta_03,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=A_test["sex"],
    sample_params={"sample_weight": random_weights},
)
beta_from_frame = beta_frame.difference(method="to_overall")

print("From frame:", beta_from_frame)

# %%
# And next, we create a function to evaluate the same quantity. Note that
# we do not need to use :func:`functools.partial` to bind the
# :code:`beta=` argument:

beta_func = make_derived_metric(metric=skm.fbeta_score, transform="difference")

beta_from_func = beta_func(
    y_test,
    y_pred,
    sensitive_features=A_test["sex"],
    beta=0.3,
def test_student(args, student_train_loader, student_labels, student_test_loader,
                 test_size, cat_emb_size, num_conts, device, sensitive_idx):
    student_model = RandomForestClassifier(random_state=42, warm_start=True,
                                           n_estimators=100)

    print("========== Testing Student Model ==========")
    for epoch in range(args.epochs):
        train_loader = student_loader(student_train_loader, student_labels)
        for (cats, conts), labels in train_loader:
            X = torch.cat((cats, conts), 1)
            student_model = student_model.fit(X, labels)

    test_loss = 0
    correct = 0
    i = 0
    avg_recall = 0
    avg_precision = 0
    overall_results = []
    avg_eq_odds = 0
    avg_dem_par = 0
    avg_tpr = 0
    avg_tp = 0
    avg_tn = 0
    avg_fp = 0
    avg_fn = 0
    with torch.no_grad():
        for batch_idx, (cats, conts, target) in enumerate(student_test_loader):
            print("target\n", sum(target))
            i += 1
            X = torch.cat((cats, conts), 1)
            output = student_model.predict(X)
            output = torch.from_numpy(output)
            pred = (output > 0.5).float()
            print("pred\n", sum(pred))
            correct += pred.eq(target.view_as(pred)).sum().item()

            curr_datetime = datetime.now()
            curr_hour = curr_datetime.hour
            curr_min = curr_datetime.minute

            pred_df = pd.DataFrame(pred.numpy())
            pred_df.to_csv(
                f"pred_results/{args.run_name}_{curr_hour}-{curr_min}.csv"
            )

            #print(pred, np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy()))
            #correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
            #total += cats.size(0)

            # confusion matrix for the whole batch
            tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
            avg_tn += tn
            avg_fp += fp
            avg_fn += fn
            avg_tp += tp

            # position of col for sensitive values
            sensitive = [row[sensitive_idx].item() for row in cats]
            cat_len = max(sensitive)
            #exit()
            sub_cm = []
            # print(cat_len)
            for j in range(cat_len + 1):
                try:
                    idx = list(locate(sensitive, lambda x: x == j))
                    sub_tar = target[idx]
                    sub_pred = pred[idx]
                    sub_tn, sub_fp, sub_fn, sub_tp = confusion_matrix(
                        sub_tar, sub_pred).ravel()
                except ValueError:
                    # when only one value to predict, confusion_matrix cannot be unpacked
                    temp_tar = int(sub_tar.numpy()[0])
                    temp_pred = int(sub_pred.numpy()[0])
                    # print(tar, pred)
                    if temp_tar and temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 1
                    elif temp_tar and not temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 1, 0
                    elif not temp_tar and not temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 1, 0, 0, 0
                    elif not temp_tar and temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 1, 0, 0
                    else:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 0

                total = mysum(sub_tn, sub_fp, sub_fn, sub_tp)
                print("??", total)
                sub_cm.append((sub_tn / total, sub_fp / total,
                               sub_fn / total, sub_tp / total))

            # Fairness metrics
            group_metrics = MetricFrame(
                {
                    'precision': skm.precision_score,
                    'recall': skm.recall_score
                },
                target, pred,
                sensitive_features=sensitive)

            print(target)
            print(pred)
            demographic_parity = flm.demographic_parity_difference(
                target, pred, sensitive_features=sensitive)

            eq_odds = flm.equalized_odds_difference(
                target, pred, sensitive_features=sensitive)

            # metric_fns = {'true_positive_rate': true_positive_rate}

            tpr = MetricFrame(true_positive_rate, target, pred,
                              sensitive_features=sensitive)
            # tpr = flm.true_positive_rate(target, pred,sample_weight=sensitive)

            sub_results = group_metrics.overall.to_dict()
            sub_results_by_group = group_metrics.by_group.to_dict()
            # print("\n", group_metrics.by_group, "\n")

            avg_precision += sub_results['precision']
            avg_recall += sub_results['recall']
            print("pre_rec", sub_results)
            overall_results.append(sub_results_by_group)
            avg_eq_odds += eq_odds
            print("eqo", eq_odds)
            avg_dem_par += demographic_parity
            print("dempar", demographic_parity)
            avg_tpr += tpr.difference(method='between_groups')
            print("tpr", tpr.difference(method='between_groups'))

    total = mysum(avg_tn, avg_fp, avg_fn, avg_tp)
    print("!!", total)
    cm = (avg_tn / total, avg_fp / total, avg_fn / total, avg_tp / total)
    test_loss /= test_size
    accuracy = correct / test_size
    avg_loss = test_loss
    return (accuracy, avg_loss, avg_precision, avg_recall, avg_eq_odds,
            avg_tpr, avg_dem_par, cm, sub_cm, overall_results)
def test_student(args, student_train_loader, student_labels, student_test_loader,
                 test_size, cat_emb_size, num_conts, device, sensitive_idx):
    student_model = RegressionModel(emb_szs=cat_emb_size,
                                    n_cont=num_conts,
                                    emb_drop=0.04,
                                    out_sz=1,
                                    szs=[1000, 500, 250],
                                    drops=[0.001, 0.01, 0.01],
                                    y_range=(0, 1)).to(device)

    criterion = nn.BCELoss()
    optimizer = optim.SGD(student_model.parameters(), lr=args.lr, momentum=0)
    steps = 0
    running_loss = 0
    correct = 0

    print("========== Testing Student Model ==========")
    for epoch in range(args.epochs):
        student_model.train()
        train_loader = student_loader(student_train_loader, student_labels)
        for (cats, conts), labels in train_loader:
            #for _batch_idx, (data, target) in enumerate(tqdm(train_loader)):
            #cats = data[0]
            #conts = data[1]
            steps += 1

            optimizer.zero_grad()
            output = student_model(cats, conts).view(-1)
            labels = labels.to(torch.float32)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

    # if steps % 50 == 0:
    student_model.eval()
    test_loss = 0
    correct = 0
    i = 0
    avg_recall = 0
    avg_precision = 0
    overall_results = []
    avg_eq_odds = 0
    avg_dem_par = 0
    avg_tpr = 0
    avg_tp = 0
    avg_tn = 0
    avg_fp = 0
    avg_fn = 0
    with torch.no_grad():
        for batch_idx, (cats, conts, target) in enumerate(student_test_loader):
            print("target\n", sum(target))
            i += 1
            output = student_model(cats, conts)
            # per-batch loss feeding a running mean of the test loss
            loss = criterion(output, target).item()
            test_loss = test_loss + ((1 / (batch_idx + 1)) * (loss - test_loss))
            pred = (output > 0.5).float()
            print("pred\n", sum(pred))
            correct += pred.eq(target.view_as(pred)).sum().item()

            curr_datetime = datetime.now()
            curr_hour = curr_datetime.hour
            curr_min = curr_datetime.minute

            pred_df = pd.DataFrame(pred.numpy())
            pred_df.to_csv(f"pred_results/{args.run_name}_{curr_hour}-{curr_min}.csv")

            #print(pred, np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy()))
            #correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
            #total += cats.size(0)

            # confusion matrix for the whole batch
            tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
            avg_tn += tn
            avg_fp += fp
            avg_fn += fn
            avg_tp += tp

            # position of col for sensitive values
            sensitive = [row[sensitive_idx].item() for row in cats]
            cat_len = max(sensitive)
            #exit()
            sub_cm = []
            # print(cat_len)
            for j in range(cat_len + 1):
                try:
                    idx = list(locate(sensitive, lambda x: x == j))
                    sub_tar = target[idx]
                    sub_pred = pred[idx]
                    sub_tn, sub_fp, sub_fn, sub_tp = confusion_matrix(sub_tar, sub_pred).ravel()
                except ValueError:
                    # when only one value to predict, confusion_matrix cannot be unpacked
                    print("----WHAT?")
                    temp_tar = int(sub_tar.numpy()[0])
                    temp_pred = int(sub_pred.numpy()[0])
                    # print(tar, pred)
                    if temp_tar and temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 1
                    elif temp_tar and not temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 1, 0
                    elif not temp_tar and not temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 1, 0, 0, 0
                    elif not temp_tar and temp_pred:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 1, 0, 0
                    else:
                        sub_tn, sub_fp, sub_fn, sub_tp = 0, 0, 0, 0

                total = mysum(sub_tn, sub_fp, sub_fn, sub_tp)
                print("??", total)
                sub_cm.append((sub_tn / total, sub_fp / total,
                               sub_fn / total, sub_tp / total))

            # Fairness metrics
            group_metrics = MetricFrame({'precision': skm.precision_score,
                                         'recall': skm.recall_score},
                                        target, pred,
                                        sensitive_features=sensitive)

            demographic_parity = flm.demographic_parity_difference(
                target, pred, sensitive_features=sensitive)

            eq_odds = flm.equalized_odds_difference(
                target, pred, sensitive_features=sensitive)

            # metric_fns = {'true_positive_rate': true_positive_rate}

            tpr = MetricFrame(true_positive_rate, target, pred,
                              sensitive_features=sensitive)
            # tpr = flm.true_positive_rate(target, pred,sample_weight=sensitive)

            sub_results = group_metrics.overall.to_dict()
            sub_results_by_group = group_metrics.by_group.to_dict()
            # print("\n", group_metrics.by_group, "\n")

            avg_precision += sub_results['precision']
            avg_recall += sub_results['recall']
            print("pre_rec", sub_results)
            overall_results.append(sub_results_by_group)
            avg_eq_odds += eq_odds
            print("eqo", eq_odds)
            avg_dem_par += demographic_parity
            print("dempar", demographic_parity)
            avg_tpr += tpr.difference(method='between_groups')
            print("tpr", tpr.difference(method='between_groups'))

    total = mysum(avg_tn, avg_fp, avg_fn, avg_tp)
    print("!!", total)
    cm = (avg_tn / total, avg_fp / total, avg_fn / total, avg_tp / total)
    test_loss /= test_size
    accuracy = correct / test_size
    avg_loss = test_loss
    return (accuracy, avg_loss, avg_precision, avg_recall, avg_eq_odds,
            avg_tpr, avg_dem_par, cm, sub_cm, overall_results)
"selection_rate": selection_rate, "count": count, }, sensitive_features=A_test, y_true=Y_test, y_pred=predictions[key], ) import matplotlib.pyplot as plt x = [ metric_frame.overall["accuracy"] for metric_frame in metric_frames.values() ] y = [ metric_frame.difference()["selection_rate"] for metric_frame in metric_frames.values() ] keys = list(metric_frames.keys()) plt.scatter(x, y) for i in range(len(x)): plt.annotate(keys[i], (x[i] + 0.0003, y[i])) plt.xlabel("accuracy") plt.ylabel("selection rate difference") # %% # We see a Pareto front forming - the set of predictors which represent optimal # tradeoffs between accuracy and disparity in predictions. In the ideal case, # we would have a predictor at (1,0) - perfectly accurate and without any # unfairness under demographic parity (with respect to the sensitive feature # "sex"). The Pareto front represents the closest we can come to this ideal