    def test_specific_metrics(self):
        y_t = [0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
        y_p = [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
        s_f = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

        exp_acc = group_accuracy_score(y_t, y_p, s_f)
        exp_roc = group_roc_auc_score(y_t, y_p, s_f)

        predictions = {"some model": y_p}
        sensitive_feature = {"my sf": s_f}

        actual = _create_group_metric_set(y_t,
                                          predictions,
                                          sensitive_feature,
                                          'binary_classification')

        # Do some sanity checks
        validate_dashboard_dictionary(actual)
        assert actual['trueY'] == y_t
        assert actual['predictedY'][0] == y_p
        assert actual['precomputedFeatureBins'][0]['binVector'] == s_f
        assert len(actual['precomputedMetrics'][0][0]) == 10

        # Cross check the two metrics we computed
        # Comparisons simplified because s_f was already {0, 1}
        actual_acc = actual['precomputedMetrics'][0][0]['accuracy_score']
        assert actual_acc['global'] == exp_acc.overall
        assert actual_acc['bins'] == list(exp_acc.by_group.values())

        actual_roc = actual['precomputedMetrics'][0][0][
            'balanced_accuracy_score']
        assert actual_roc['global'] == exp_roc.overall
        assert actual_roc['bins'] == list(exp_roc.by_group.values())

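    # Hedged note (not part of the original test): the direct list comparisons
    # above rely on s_f already being coded as {0, 1}, so the iteration order of
    # exp_acc.by_group coincides with the bin order. With arbitrary group labels
    # the expected bins would instead be built from the sorted unique values, e.g.
    #     expected_bins = [exp_acc.by_group[k] for k in sorted(exp_acc.by_group)]
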
def test_two_models():
    # Two models, single sensitive feature vector, no names
    Y_true = [0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1]
    Y_pred = [[0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1],
              [1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0]]
    a, b = 'a', 'b'
    sensitive_features = [[b, a, a, b, b, a, a, b, b, a, b, a, b, a, b]]
    sf_int = [int(x == 'b') for x in sensitive_features[0]]

    result = create_group_metric_set('binary_classification',
                                     Y_true,
                                     Y_pred,
                                     sensitive_features)
    assert result['predictionType'] == 'binaryClassification'
    assert result['schemaType'] == 'groupMetricSet'
    assert result['schemaVersion'] == 0

    assert isinstance(result['trueY'], list)
    assert np.array_equal(result['trueY'], Y_true)

    assert isinstance(result['precomputedFeatureBins'], list)
    assert len(result['precomputedFeatureBins']) == 1
    bin_dict = result['precomputedFeatureBins'][0]
    assert isinstance(bin_dict, dict)
    assert np.array_equal(bin_dict['binVector'], sf_int)
    assert np.array_equal(bin_dict['binLabels'], ['a', 'b'])

    assert isinstance(result['predictedY'], list)
    assert len(result['predictedY']) == 2
    for i in range(2):
        y_p = result['predictedY'][i]
        assert isinstance(y_p, list)
        assert np.array_equal(y_p, Y_pred[i])

    assert isinstance(result['precomputedMetrics'], list)
    assert len(result['precomputedMetrics']) == 1
    metrics_group_0 = result['precomputedMetrics'][0]
    assert isinstance(metrics_group_0, list)
    assert len(metrics_group_0) == 2
    for i in range(2):
        metrics_g0_m0 = metrics_group_0[i]
        assert isinstance(metrics_g0_m0, dict)
        assert len(metrics_g0_m0) == 10

        accuracy = metrics_g0_m0['accuracy_score']
        assert isinstance(accuracy, dict)
        gmr = group_accuracy_score(Y_true, Y_pred[i], sensitive_features[0])
        assert gmr.overall == pytest.approx(accuracy['global'])
        assert isinstance(accuracy['bins'], list)
        assert len(accuracy['bins']) == 2
        assert gmr.by_group['a'] == pytest.approx(accuracy['bins'][0])
        assert gmr.by_group['b'] == pytest.approx(accuracy['bins'][1])

        roc_auc = metrics_g0_m0['balanced_accuracy_score']
        assert isinstance(roc_auc, dict)
        gmr = group_roc_auc_score(Y_true, Y_pred[i], sensitive_features[0])
        assert gmr.overall == pytest.approx(roc_auc['global'])
        assert isinstance(roc_auc['bins'], list)
        assert len(roc_auc['bins']) == 2
        assert gmr.by_group['a'] == pytest.approx(roc_auc['bins'][0])
        assert gmr.by_group['b'] == pytest.approx(roc_auc['bins'][1])

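def test_bin_vector_uses_sorted_unique_labels():
    # Hedged sketch (not in the original suite): the expected bin vector above
    # is built as int(x == 'b'), which only works because there are exactly two
    # labels and they sort as ['a', 'b']. np.unique(..., return_inverse=True)
    # expresses the same zero-based, sorted-label coding more generally.
    sf = ['b', 'a', 'a', 'b', 'b', 'a', 'a', 'b', 'b', 'a', 'b', 'a', 'b', 'a', 'b']
    labels, bins = np.unique(sf, return_inverse=True)
    assert np.array_equal(labels, ['a', 'b'])
    assert np.array_equal(bins, [int(x == 'b') for x in sf])
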
def test_group_roc_auc_score_average():
    result = metrics.group_roc_auc_score(Y_true, Y_pred, groups,
                                         average='samples')

    expected_overall = skm.roc_auc_score(Y_true, Y_pred, average='samples')

    assert expected_overall == result.overall

def test_multiple_models_multiple_sensitive_features():
    # Three models, two sensitive feature vectors, no names
    Y_true = [0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0]
    Y_pred = [[0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
              [1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1],
              [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0]]
    # First group is just 'a' and 'b'. Second is 4, 5 and 6
    sensitive_features = [['a', 'b', 'b', 'a', 'b', 'b', 'b', 'a', 'b', 'b', 'b'],
                          [4, 5, 6, 6, 5, 4, 4, 5, 5, 6, 6]]
    sf_int = [int(x == 'b') for x in sensitive_features[0]]

    result = create_group_metric_set('binary_classification',
                                     Y_true,
                                     Y_pred,
                                     sensitive_features)
    assert result['predictionType'] == 'binaryClassification'
    assert result['schemaType'] == 'groupMetricSet'
    assert result['schemaVersion'] == 0

    assert isinstance(result['trueY'], list)
    assert np.array_equal(result['trueY'], Y_true)

    assert isinstance(result['precomputedFeatureBins'], list)
    assert len(result['precomputedFeatureBins']) == 2
    bin_dict0 = result['precomputedFeatureBins'][0]
    assert isinstance(bin_dict0, dict)
    assert np.array_equal(bin_dict0['binVector'], sf_int)
    assert np.array_equal(bin_dict0['binLabels'], ['a', 'b'])
    bin_dict1 = result['precomputedFeatureBins'][1]
    assert isinstance(bin_dict1, dict)
    assert np.array_equal(bin_dict1['binVector'],
                          [x - 4 for x in sensitive_features[1]])
    assert np.array_equal(bin_dict1['binLabels'], ['4', '5', '6'])

    assert isinstance(result['predictedY'], list)
    assert len(result['predictedY']) == 3
    for i in range(3):
        y_p = result['predictedY'][i]
        assert isinstance(y_p, list)
        assert np.array_equal(y_p, Y_pred[i])

    assert isinstance(result['precomputedMetrics'], list)
    assert len(result['precomputedMetrics']) == 2

    # Check the first grouping (with alphabetical labels)
    metrics_group_0 = result['precomputedMetrics'][0]
    assert isinstance(metrics_group_0, list)
    assert len(metrics_group_0) == 3
    # Loop over the models
    for i in range(3):
        m_g0 = metrics_group_0[i]
        assert isinstance(m_g0, dict)
        assert len(m_g0) == 10

        accuracy = m_g0['accuracy_score']
        assert isinstance(accuracy, dict)
        gmr = group_accuracy_score(Y_true, Y_pred[i], sensitive_features[0])
        assert gmr.overall == pytest.approx(accuracy['global'])
        assert isinstance(accuracy['bins'], list)
        assert len(accuracy['bins']) == 2
        assert gmr.by_group['a'] == pytest.approx(accuracy['bins'][0])
        assert gmr.by_group['b'] == pytest.approx(accuracy['bins'][1])

        roc_auc = m_g0['balanced_accuracy_score']
        assert isinstance(roc_auc, dict)
        gmr = group_roc_auc_score(Y_true, Y_pred[i], sensitive_features[0])
        assert gmr.overall == pytest.approx(roc_auc['global'])
        assert isinstance(roc_auc['bins'], list)
        assert len(roc_auc['bins']) == 2
        assert gmr.by_group['a'] == pytest.approx(roc_auc['bins'][0])
        assert gmr.by_group['b'] == pytest.approx(roc_auc['bins'][1])

    # Check the second grouping (three unique numeric labels)
    metrics_group_1 = result['precomputedMetrics'][1]
    assert isinstance(metrics_group_1, list)
    assert len(metrics_group_1) == 3
    # Loop over the models
    for i in range(3):
        m_g1 = metrics_group_1[i]
        assert isinstance(m_g1, dict)
        assert len(m_g1) == 10

        accuracy = m_g1['accuracy_score']
        assert isinstance(accuracy, dict)
        gmr = group_accuracy_score(Y_true, Y_pred[i], sensitive_features[1])
        assert gmr.overall == pytest.approx(accuracy['global'])
        assert isinstance(accuracy['bins'], list)
        assert len(accuracy['bins']) == 3
        # Use the fact that the groups are integers
        for j in range(3):
            assert gmr.by_group[j + 4] == pytest.approx(accuracy['bins'][j])

        roc_auc = m_g1['balanced_accuracy_score']
        assert isinstance(roc_auc, dict)
        gmr = group_roc_auc_score(Y_true, Y_pred[i], sensitive_features[1])
        assert gmr.overall == pytest.approx(roc_auc['global'])
        assert isinstance(roc_auc['bins'], list)
        assert len(roc_auc['bins']) == 3
        for j in range(3):
            assert gmr.by_group[j + 4] == pytest.approx(roc_auc['bins'][j])

def test_group_roc_auc_score_max_fpr():
    result = metrics.group_roc_auc_score(Y_true, Y_pred, groups, max_fpr=0.5)

    expected_overall = skm.roc_auc_score(Y_true, Y_pred, max_fpr=0.5)

    assert expected_overall == result.overall

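def test_group_roc_auc_score_by_group():
    # Hedged sketch (not in the original suite): assumes the same module-level
    # Y_true, Y_pred and groups fixtures used by the tests above, that pytest is
    # imported in this module, and that every group contains both classes so
    # roc_auc_score is defined per group. Each by_group entry should then match
    # roc_auc_score restricted to the rows of that group.
    result = metrics.group_roc_auc_score(Y_true, Y_pred, groups)

    for group in sorted(set(groups)):
        y_t = [y for y, g in zip(Y_true, groups) if g == group]
        y_p = [y for y, g in zip(Y_pred, groups) if g == group]
        expected = skm.roc_auc_score(y_t, y_p)
        assert expected == pytest.approx(result.by_group[group])
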