def _prepare(self, metric_fn): self.mfn = "Some name" self.target = metrics.MetricFrame( {self.mfn: metric_fn}, y_t, y_p, sensitive_features=pd.Series(data=g_2, name='sf0'), control_features=pd.Series(data=g_3, name='cf0')) assert isinstance(self.target.control_levels, list) assert (self.target.control_levels == ['cf0']) assert isinstance(self.target.sensitive_levels, list) assert (self.target.sensitive_levels == ['sf0']) mask_f = (g_2 == 'f') mask_g = (g_2 == 'g') mask_k = (g_3 == 'kk') mask_m = (g_3 == 'm') mask_k_f = np.logical_and(mask_k, mask_f) mask_k_g = np.logical_and(mask_k, mask_g) mask_m_f = np.logical_and(mask_m, mask_f) mask_m_g = np.logical_and(mask_m, mask_g) self.metric_k = metric_fn(y_t[mask_k], y_p[mask_k]) self.metric_m = metric_fn(y_t[mask_m], y_p[mask_m]) self.metric_k_f = metric_fn(y_t[mask_k_f], y_p[mask_k_f]) self.metric_m_f = metric_fn(y_t[mask_m_f], y_p[mask_m_f]) self.metric_k_g = metric_fn(y_t[mask_k_g], y_p[mask_k_g]) self.metric_m_g = metric_fn(y_t[mask_m_g], y_p[mask_m_g]) self.metric_k_arr = [self.metric_k_f, self.metric_k_g] self.metric_m_arr = [self.metric_m_f, self.metric_m_g]
def test_multid_input_output():
    # In this test, both y_t and y_p are 2d arrays,
    # so the metric results are also arrays
    metric_fn = functools.partial(skm.r2_score, multioutput="raw_values")
    y_t_2 = np.random.rand(len(g_1), 2)
    y_p_2 = np.random.rand(len(g_1), 2)

    target = metrics.MetricFrame(metrics=metric_fn,
                                 y_true=y_t_2,
                                 y_pred=y_p_2,
                                 sensitive_features=g_1)

    expected_overall = skm.r2_score(y_t_2, y_p_2, multioutput="raw_values")
    # Have to use allclose rather than equal since we don't know how
    # groupby will do its slicing
    assert np.allclose(target.overall, expected_overall,
                       rtol=1e-12, atol=1e-10)
    for g in np.unique(g_1):
        mask = g_1 == g
        expected = skm.r2_score(y_t_2[mask], y_p_2[mask],
                                multioutput="raw_values")
        actual = target.by_group[g]
        assert np.allclose(actual, expected, rtol=1e-12, atol=1e-10)
def test_missing_sensitive_feature_combinations(metric_fn):
    target = metrics.MetricFrame(
        metrics=metric_fn,
        y_true=y_t,
        y_pred=y_p,
        sensitive_features=np.stack([g_A, g_B], axis=1))

    # Make sure our missing combination is in an expected place
    overall = metric_fn(y_t, y_p)
    direct_eval = []
    for idx in target.by_group.index:
        mask_A = g_A == idx[0]
        mask_B = g_B == idx[1]
        mask = np.logical_and(mask_A, mask_B)
        if idx == ('bb', 'x'):
            assert sum(mask) == 0, 'idx={0}'.format(idx)
        else:
            assert sum(mask) != 0, 'idx={0}'.format(idx)
            nxt = metric_fn(y_t[mask], y_p[mask])
            direct_eval.append(nxt)
    assert len(direct_eval) == 5

    # Check we have the expected values
    assert np.isnan(target.by_group[('bb', 'x')])
    assert target.group_min() == min(direct_eval)
    assert target.group_max() == max(direct_eval)
    assert target.difference(method='between_groups') == \
        max(direct_eval) - min(direct_eval)
    assert target.difference(method='to_overall') == \
        max([abs(x - overall) for x in direct_eval])
    assert target.ratio(method='between_groups') == \
        min(direct_eval) / max(direct_eval)
    assert target.ratio(method='to_overall') == \
        min([x / overall for x in direct_eval] +
            [overall / x for x in direct_eval])
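# For reference, a minimal self-contained sketch of the behaviour the test
# above relies on: when two sensitive features are supplied and some
# combination of their values never occurs in the data, MetricFrame records
# NaN for that cell of by_group (asserted above for ('bb', 'x')). The data,
# the feature names 'A'/'B', and the helper name are hypothetical; the
# leading underscore keeps pytest from collecting it as a test.
def _sketch_missing_combination():
    import numpy as np
    import pandas as pd
    import sklearn.metrics as skm
    import fairlearn.metrics as metrics

    y_true = np.asarray([0, 1, 1, 0, 1, 1])
    y_pred = np.asarray([0, 1, 0, 0, 1, 1])
    sf = pd.DataFrame({'A': ['aa', 'aa', 'aa', 'bb', 'bb', 'bb'],
                       'B': ['x', 'x', 'y', 'y', 'y', 'y']})

    mf = metrics.MetricFrame(metrics=skm.recall_score,
                             y_true=y_true, y_pred=y_pred,
                             sensitive_features=sf)
    # The combination ('bb', 'x') never occurs, so its by_group entry
    # is NaN; the test above shows aggregates like group_min() skipping
    # it in the fairlearn version these tests target.
    assert np.isnan(mf.by_group[('bb', 'x')])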
def test_derived_difference_both_arg_types():
    my_beta = 0.5
    my_fn = metrics.make_derived_metric(
        metric=skm.fbeta_score,
        transform="difference",
        sample_param_names=["sample_weight"],
    )

    my_fbeta = functools.partial(skm.fbeta_score, beta=my_beta)
    my_fbeta.__name__ = "my_fbeta"
    grouped = metrics.MetricFrame(
        metrics=my_fbeta,
        y_true=y_t,
        y_pred=y_p,
        sensitive_features=gid,
        sample_params={"sample_weight": wgt},
    )

    actual = my_fn(
        y_t,
        y_p,
        sensitive_features=gid,
        beta=my_beta,
        sample_weight=wgt,
        method="between_groups",
    )
    assert actual == grouped.difference(method="between_groups")
def test_1m_1sf_1cf_metric_dict(transform_y_t, transform_y_p):
    # If there are failures here, other, more specific tests should also fail
    target = metrics.MetricFrame({'recall': skm.recall_score},
                                 transform_y_t(y_t),
                                 transform_y_p(y_p),
                                 sensitive_features=g_2,
                                 control_features=g_3)

    # Check the index properties
    assert isinstance(target.control_levels, list)
    assert (target.control_levels == ['control_feature_0'])
    assert isinstance(target.sensitive_levels, list)
    assert (target.sensitive_levels == ['sensitive_feature_0'])

    # Check we have the correct return types
    assert isinstance(target.overall, pd.DataFrame)
    assert isinstance(target.by_group, pd.DataFrame)

    mask_f = (g_2 == 'f')
    mask_g = (g_2 == 'g')
    mask_k = (g_3 == 'kk')
    mask_m = (g_3 == 'm')

    # Check we have the expected number of elements
    assert target.overall.shape == (2, 1)
    assert target.by_group.shape == (4, 1)

    recall_k = skm.recall_score(y_t[mask_k], y_p[mask_k])
    recall_m = skm.recall_score(y_t[mask_m], y_p[mask_m])
    assert target.overall['recall']['kk'] == recall_k
    assert target.overall['recall']['m'] == recall_m

    mask_k_f = np.logical_and(mask_k, mask_f)
    mask_k_g = np.logical_and(mask_k, mask_g)
    mask_m_f = np.logical_and(mask_m, mask_f)
    mask_m_g = np.logical_and(mask_m, mask_g)
    recall_k_f = skm.recall_score(y_t[mask_k_f], y_p[mask_k_f])
    recall_m_f = skm.recall_score(y_t[mask_m_f], y_p[mask_m_f])
    recall_k_g = skm.recall_score(y_t[mask_k_g], y_p[mask_k_g])
    recall_m_g = skm.recall_score(y_t[mask_m_g], y_p[mask_m_g])
    assert target.by_group['recall'][('kk', 'f')] == recall_k_f
    assert target.by_group['recall'][('kk', 'g')] == recall_k_g
    assert target.by_group['recall'][('m', 'f')] == recall_m_f
    assert target.by_group['recall'][('m', 'g')] == recall_m_g

    recall_k_arr = [recall_k_f, recall_k_g]
    recall_m_arr = [recall_m_f, recall_m_g]

    target_mins = target.group_min()
    assert isinstance(target_mins, pd.DataFrame)
    assert target_mins.shape == (2, 1)
    assert target_mins['recall']['kk'] == min(recall_k_arr)
    assert target_mins['recall']['m'] == min(recall_m_arr)

    target_maxs = target.group_max()
    # Was 'target_mins' in the original; fixed to check the maxes frame
    assert isinstance(target_maxs, pd.DataFrame)
    assert target_maxs.shape == (2, 1)
    assert target_maxs['recall']['kk'] == max(recall_k_arr)
    assert target_maxs['recall']['m'] == max(recall_m_arr)
def test_single_element_lists():
    mf = metrics.MetricFrame(
        metrics=skm.balanced_accuracy_score,
        y_true=[1],
        y_pred=[1],
        sensitive_features=[0],
    )
    assert mf.overall == 1
def _prepare(self, metric_fn):
    self.target = metrics.MetricFrame(
        metric_fn,
        y_t, y_p,
        sensitive_features=list(g_2),
        control_features=np.stack([g_3, g_1], axis=1))

    assert isinstance(self.target.control_levels, list)
    assert (self.target.control_levels ==
            ['control_feature_0', 'control_feature_1'])
    assert isinstance(self.target.sensitive_levels, list)
    assert (self.target.sensitive_levels == ['sensitive_feature_0'])

    # Check we have the correct return types
    assert isinstance(self.target.overall, pd.Series)
    assert isinstance(self.target.by_group, pd.Series)

    mask_a = (g_1 == 'aa')
    mask_b = (g_1 == 'ba')
    mask_f = (g_2 == 'f')
    mask_g = (g_2 == 'g')
    mask_k = (g_3 == 'kk')
    mask_m = (g_3 == 'm')

    mask_k_a = np.logical_and(mask_k, mask_a)
    mask_k_b = np.logical_and(mask_k, mask_b)
    mask_m_a = np.logical_and(mask_m, mask_a)
    mask_m_b = np.logical_and(mask_m, mask_b)
    mask_k_a_f = np.logical_and(mask_k_a, mask_f)
    mask_k_a_g = np.logical_and(mask_k_a, mask_g)
    mask_k_b_f = np.logical_and(mask_k_b, mask_f)
    mask_k_b_g = np.logical_and(mask_k_b, mask_g)
    mask_m_a_f = np.logical_and(mask_m_a, mask_f)
    mask_m_a_g = np.logical_and(mask_m_a, mask_g)
    mask_m_b_f = np.logical_and(mask_m_b, mask_f)
    mask_m_b_g = np.logical_and(mask_m_b, mask_g)

    self.metric_k_a = metric_fn(y_t[mask_k_a], y_p[mask_k_a])
    self.metric_k_b = metric_fn(y_t[mask_k_b], y_p[mask_k_b])
    self.metric_m_a = metric_fn(y_t[mask_m_a], y_p[mask_m_a])
    self.metric_m_b = metric_fn(y_t[mask_m_b], y_p[mask_m_b])
    self.metric_k_a_f = metric_fn(y_t[mask_k_a_f], y_p[mask_k_a_f])
    self.metric_k_a_g = metric_fn(y_t[mask_k_a_g], y_p[mask_k_a_g])
    self.metric_k_b_f = metric_fn(y_t[mask_k_b_f], y_p[mask_k_b_f])
    self.metric_k_b_g = metric_fn(y_t[mask_k_b_g], y_p[mask_k_b_g])
    self.metric_m_a_f = metric_fn(y_t[mask_m_a_f], y_p[mask_m_a_f])
    self.metric_m_a_g = metric_fn(y_t[mask_m_a_g], y_p[mask_m_a_g])
    self.metric_m_b_f = metric_fn(y_t[mask_m_b_f], y_p[mask_m_b_f])
    self.metric_m_b_g = metric_fn(y_t[mask_m_b_g], y_p[mask_m_b_g])

    self.metric_k_a_arr = [self.metric_k_a_f, self.metric_k_a_g]
    self.metric_k_b_arr = [self.metric_k_b_f, self.metric_k_b_g]
    self.metric_m_a_arr = [self.metric_m_a_f, self.metric_m_a_g]
    self.metric_m_b_arr = [self.metric_m_b_f, self.metric_m_b_g]

    self.mfn = metric_fn.__name__
def test_four_positional_arguments():
    # The first formal positional argument to the constructor is 'self',
    # so the error message says that five arguments were given
    msg = "__init__() takes 1 positional argument but 5 positional arguments were given"
    with pytest.raises(TypeError) as execInfo:
        _ = metrics.MetricFrame(skm.accuracy_score, y_true, y_pred, sf)

    assert execInfo.value.args[0] == msg
def test_duplicate_sf_names():
    groups = pd.DataFrame(np.stack([g_2, g_3], axis=1), columns=["A", "A"])
    msg = "Detected duplicate feature name: 'A'"
    with pytest.raises(ValueError) as execInfo:
        _ = metrics.MetricFrame(skm.recall_score,
                                y_t, y_p,
                                sensitive_features=groups)
    assert execInfo.value.args[0] == msg
def test_no_warnings():
    with pytest.warns(None) as record:
        mf = metrics.MetricFrame(
            metrics=skm.accuracy_score,
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=sf)

    assert len(record) == 0
    assert mf.difference() == pytest.approx(accuracy_score_difference)
def test_group_max():
    my_fn = metrics.make_derived_metric(metric=skm.precision_score,
                                        transform='group_max',
                                        sample_param_names=['sample_weight'])
    grouped = metrics.MetricFrame(skm.precision_score,
                                  y_t, y_p,
                                  sensitive_features=gid)
    actual = my_fn(y_t, y_p, sensitive_features=gid)
    assert actual == grouped.group_max()
def test_derived_ratio_to_overall():
    my_fn = metrics.make_derived_metric(metric=skm.precision_score,
                                        transform='ratio',
                                        sample_param_names=['sample_weight'])
    grouped = metrics.MetricFrame(skm.precision_score,
                                  y_t, y_p,
                                  sensitive_features=gid)
    actual = my_fn(y_t, y_p, sensitive_features=gid, method='to_overall')
    assert actual == grouped.ratio(method='to_overall')
def test_derived_ratio_default_is_between_groups():
    my_fn = metrics.make_derived_metric(metric=skm.precision_score,
                                        transform='ratio',
                                        sample_param_names=['sample_weight'])
    grouped = metrics.MetricFrame(skm.precision_score,
                                  y_t, y_p,
                                  sensitive_features=gid)
    actual = my_fn(y_t, y_p, sensitive_features=gid)
    assert actual == grouped.ratio()
def test_derived_difference_to_overall():
    my_fn = metrics.make_derived_metric(metric=skm.accuracy_score,
                                        transform='difference',
                                        sample_param_names=['sample_weight'])
    grouped = metrics.MetricFrame(skm.accuracy_score,
                                  y_t, y_p,
                                  sensitive_features=gid)
    actual = my_fn(y_t, y_p, sensitive_features=gid, method='to_overall')
    assert actual == grouped.difference(method='to_overall')
def test_derived_difference_default_is_between_groups():
    my_fn = metrics.make_derived_metric(metric=skm.accuracy_score,
                                        transform='difference',
                                        sample_param_names=['sample_weight'])
    grouped = metrics.MetricFrame(skm.accuracy_score,
                                  y_t, y_p,
                                  sensitive_features=gid)
    actual = my_fn(y_t, y_p, sensitive_features=gid)
    assert actual == grouped.difference()
def test_duplicate_cf_sf_names():
    cf = pd.DataFrame(np.stack([g_2, g_3], axis=1), columns=["A", "B"])
    sf = {"B": g_1, "C": g_4}
    msg = "Detected duplicate feature name: 'B'"
    with pytest.raises(ValueError) as execInfo:
        _ = metrics.MetricFrame(skm.recall_score,
                                y_t, y_p,
                                sensitive_features=sf,
                                control_features=cf)
    assert execInfo.value.args[0] == msg
def test_duplicate_cf_names():
    groups = pd.DataFrame(np.stack([g_2, g_3], axis=1), columns=["B", "B"])
    msg = "Detected duplicate feature name: 'B'"
    with pytest.raises(ValueError) as execInfo:
        _ = metrics.MetricFrame(
            metrics=skm.recall_score,
            y_true=y_t,
            y_pred=y_p,
            sensitive_features=g_4,
            control_features=groups,
        )
    assert execInfo.value.args[0] == msg
def test_group_min():
    my_fn = metrics.make_derived_metric(
        metric=skm.precision_score,
        transform="group_min",
        sample_param_names=["sample_weight"],
    )
    grouped = metrics.MetricFrame(metrics=skm.precision_score,
                                  y_true=y_t,
                                  y_pred=y_p,
                                  sensitive_features=gid)
    actual = my_fn(y_t, y_p, sensitive_features=gid)
    assert actual == grouped.group_min()
def test_derived_ratio_to_overall():
    my_fn = metrics.make_derived_metric(
        metric=skm.precision_score,
        transform="ratio",
        sample_param_names=["sample_weight"],
    )
    grouped = metrics.MetricFrame(metrics=skm.precision_score,
                                  y_true=y_t,
                                  y_pred=y_p,
                                  sensitive_features=gid)
    actual = my_fn(y_t, y_p, sensitive_features=gid, method="to_overall")
    assert actual == grouped.ratio(method="to_overall")
def test_derived_difference_sample_arg():
    my_fbeta = functools.partial(skm.fbeta_score, beta=0.6)
    my_fbeta.__name__ = "my_fbeta"
    my_fn = metrics.make_derived_metric(metric=my_fbeta,
                                        transform='difference',
                                        sample_param_names=['sample_weight'])
    grouped = metrics.MetricFrame(my_fbeta,
                                  y_t, y_p,
                                  sensitive_features=gid,
                                  sample_params={'sample_weight': wgt})
    actual = my_fn(y_t, y_p,
                   sensitive_features=gid,
                   sample_weight=wgt,
                   method='between_groups')
    assert actual == grouped.difference(method='between_groups')
def test_derived_difference_to_overall():
    my_fn = metrics.make_derived_metric(
        metric=skm.accuracy_score,
        transform="difference",
        sample_param_names=["sample_weight"],
    )
    grouped = metrics.MetricFrame(metrics=skm.accuracy_score,
                                  y_true=y_t,
                                  y_pred=y_p,
                                  sensitive_features=gid)
    actual = my_fn(y_t, y_p, sensitive_features=gid, method="to_overall")
    assert actual == grouped.difference(method="to_overall")
def test_1m_1sf_0cf():
    target = metrics.MetricFrame({'confusion_matrix': skm.confusion_matrix},
                                 y_t, y_p,
                                 sensitive_features=g_1)

    overall = skm.confusion_matrix(y_t, y_p)
    assert np.array_equal(target.overall['confusion_matrix'], overall)

    for g in np.unique(g_1):
        mask = g_1 == g
        expected = skm.confusion_matrix(y_t[mask], y_p[mask])
        actual = target.by_group['confusion_matrix'][g]
        assert np.array_equal(actual, expected)
def _prepare(self, metric_fn): self.mfn = "Random name" self.target = metrics.MetricFrame(metrics={self.mfn: metric_fn}, y_true=y_t, y_pred=y_p, sensitive_features=g_4) assert self.target.control_levels is None assert isinstance(self.target.sensitive_levels, list) assert (self.target.sensitive_levels == ['sensitive_feature_0']) self.overall = metric_fn(y_t, y_p) mask_p = (g_4 == 'pp') mask_q = (g_4 == 'q') self.metric_p = metric_fn(y_t[mask_p], y_p[mask_p]) self.metric_q = metric_fn(y_t[mask_q], y_p[mask_q])
def test_one_positional_argument():
    msg = (
        "You have provided 'metrics' as positional arguments. Please pass them as"
        f" keyword arguments. From version {version} passing them as positional"
        " arguments will result in an error.")
    with pytest.warns(FutureWarning) as record:
        mf = metrics.MetricFrame(skm.accuracy_score,
                                 y_true=y_true,
                                 y_pred=y_pred,
                                 sensitive_features=sf)
    assert len(record) == 1
    assert str(record[0].message) == msg
    assert mf.difference() == pytest.approx(accuracy_score_difference)
def test_keyword_metric():
    msg = ("The positional argument 'metric' has been replaced by "
           f"a keyword argument 'metrics'. From version {version} passing "
           "it as a positional argument or as a keyword argument "
           "'metric' will result in an error")
    with pytest.warns(FutureWarning) as record:
        mf = metrics.MetricFrame(
            metric=skm.accuracy_score,
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=sf)
    assert len(record) == 1
    assert str(record[0].message) == msg
    assert mf.difference() == pytest.approx(accuracy_score_difference)
def test_roc_auc():
    ras = functools.partial(skm.roc_auc_score,
                            multi_class='ovr',
                            labels=[0, 1, 2])
    target = metrics.MetricFrame(ras,
                                 y_true, y_pred,
                                 sensitive_features=s_f)

    overall = ras(y_true, y_pred)
    assert target.overall == overall

    for g in np.unique(s_f):
        mask = s_f == g
        expected = ras(y_true[mask], y_pred[mask])
        actual = target.by_group[g]
        assert expected == actual
def test_derived_difference_both_arg_types_default_sample_param_names():
    my_beta = 0.5
    my_fn = metrics.make_derived_metric(metric=skm.fbeta_score,
                                        transform='difference')

    my_fbeta = functools.partial(skm.fbeta_score, beta=my_beta)
    my_fbeta.__name__ = "my_fbeta"
    grouped = metrics.MetricFrame(my_fbeta,
                                  y_t, y_p,
                                  sensitive_features=gid,
                                  sample_params={'sample_weight': wgt})

    actual = my_fn(y_t, y_p,
                   sensitive_features=gid,
                   beta=my_beta,
                   sample_weight=wgt)
    assert actual == grouped.difference()
def _prepare(self):
    fns = {'recall': skm.recall_score, 'prec': skm.precision_score}
    self.target = metrics.MetricFrame(
        fns,
        y_t, y_p,
        sensitive_features=pd.Series(data=g_4))

    assert self.target.control_levels is None
    assert isinstance(self.target.sensitive_levels, list)
    assert (self.target.sensitive_levels == ['sensitive_feature_0'])

    self.recall = skm.recall_score(y_t, y_p)
    self.prec = skm.precision_score(y_t, y_p)
    mask_p = (g_4 == 'pp')
    mask_q = (g_4 == 'q')
    self.recall_p = skm.recall_score(y_t[mask_p], y_p[mask_p])
    self.recall_q = skm.recall_score(y_t[mask_q], y_p[mask_q])
    self.prec_p = skm.precision_score(y_t[mask_p], y_p[mask_p])
    self.prec_q = skm.precision_score(y_t[mask_q], y_p[mask_q])
def test_1m_1_sf_sample_weights():
    """Check that sample weights are passed correctly to a single metric."""
    def multi_sp(y_t, y_p, p1, p2):
        """Metric to check passing of sample parameters.

        Verifies that p2 == y_t + y_p + p1 for all elements.
        """
        assert len(y_t) == len(y_p)
        assert len(y_t) == len(p1)
        assert len(y_t) == len(p2)
        assert np.array_equal(
            p2, np.asarray(y_t) + np.asarray(y_p) + np.asarray(p1))
        return sum(p2)

    # Generate some random input data
    rng = np.random.default_rng(seed=42)
    param1 = rng.random(len(y_t))
    # Compute the expected sum
    param2 = s_w + y_p + param1

    # Note that we pass in the s_w array for y_true, to get
    # a little more variety in the results
    target = metrics.MetricFrame(
        metrics=multi_sp,
        y_true=s_w,
        y_pred=y_p,
        sensitive_features=g_1,
        sample_params={
            "p1": param1,
            "p2": param2
        },
    )

    # Sanity check the types
    assert isinstance(target.overall, float)
    assert isinstance(target.by_group, pd.Series)
    assert target.by_group.shape == (2,)

    # Check the overall value
    assert target.overall == sum(param2)

    # Look at the by_group values for each subgroup identified by g_1
    for g in g_1:
        mask = g_1 == g
        assert target.by_group[g] == sum(param2[mask])
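# A compact sketch of the sample_params mechanics the test above verifies:
# each array in sample_params is sliced with the same group mask as y_true
# and y_pred before being forwarded to the metric, so a weighted metric
# sees only its own group's weights. The data and the helper name below are
# hypothetical; the leading underscore keeps pytest from collecting it.
def _sketch_sample_params_slicing():
    import numpy as np
    import sklearn.metrics as skm
    import fairlearn.metrics as metrics

    y_true = np.asarray([0, 1, 1, 0, 1, 0])
    y_pred = np.asarray([0, 1, 0, 0, 1, 1])
    group = np.asarray(['a', 'a', 'a', 'b', 'b', 'b'])
    weight = np.asarray([1.0, 2.0, 3.0, 1.0, 1.0, 2.0])

    mf = metrics.MetricFrame(metrics=skm.accuracy_score,
                             y_true=y_true, y_pred=y_pred,
                             sensitive_features=group,
                             sample_params={'sample_weight': weight})

    # Reproduce group 'a' by hand: same mask applied to y_true, y_pred
    # and the sample weights
    mask = group == 'a'
    expected = skm.accuracy_score(y_true[mask], y_pred[mask],
                                  sample_weight=weight[mask])
    assert mf.by_group['a'] == expected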
def test_multid_input_output():
    # In this test, both y_t and y_p are 2d arrays,
    # so the metric results are also arrays
    metric_fn = functools.partial(skm.r2_score, multioutput='raw_values')
    y_t_2 = np.random.rand(len(g_1), 2)
    y_p_2 = np.random.rand(len(g_1), 2)

    target = metrics.MetricFrame(metric_fn,
                                 y_t_2, y_p_2,
                                 sensitive_features=g_1)

    expected_overall = skm.r2_score(y_t_2, y_p_2, multioutput='raw_values')
    assert np.array_equal(target.overall, expected_overall)

    for g in np.unique(g_1):
        mask = g_1 == g
        expected = skm.r2_score(y_t_2[mask], y_p_2[mask],
                                multioutput='raw_values')
        actual = target.by_group[g]
        assert np.array_equal(actual, expected)