from statistics import mean
from typing import List, Optional

import numpy as np
import pytest
from sklearn.metrics import roc_auc_score, roc_curve

# NOTE: the import path below is an assumption; point it at wherever
# MetricsDict and MetricType are defined in the project under test.
from InnerEye.ML.metrics_dict import MetricsDict, MetricType


def test_classification_metrics_avg() -> None:
    hue1 = "H1"
    hue2 = "H2"
    m = MetricsDict(hues=[hue1, hue2], is_classification_metrics=True)
    m.add_metric("foo", 1.0)
    m.add_metric("foo", 2.0)
    # Perfect predictions for hue1, should give AUC == 1.0
    m.add_predictions(["S1", "S2"], np.array([0.0, 1.0]), np.array([0.0, 1.0]), hue=hue1)
    expected_hue1_auc = 1.0
    # Worst possible predictions for hue2, should give AUC == 0.0
    m.add_predictions(["S1", "S2"], np.array([1.0, 0.0]), np.array([0.0, 1.0]), hue=hue2)
    expected_hue2_auc = 0.0
    averaged = m.average(across_hues=False)
    g1_averaged = averaged.values(hue=hue1)
    assert MetricType.AREA_UNDER_ROC_CURVE.value in g1_averaged
    assert g1_averaged[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_hue1_auc]
    assert MetricType.AREA_UNDER_PR_CURVE.value in g1_averaged
    assert MetricType.SUBJECT_COUNT.value in g1_averaged
    assert g1_averaged[MetricType.SUBJECT_COUNT.value] == [2.0]
    default_averaged = averaged.values()
    assert default_averaged == {"foo": [1.5]}
    can_enumerate = list(averaged.enumerate_single_values())
    assert len(can_enumerate) >= 8
    assert can_enumerate[0] == (hue1, MetricType.AREA_UNDER_ROC_CURVE.value, 1.0)
    assert can_enumerate[-1] == (MetricsDict.DEFAULT_HUE_KEY, "foo", 1.5)
    g2_averaged = averaged.values(hue=hue2)
    assert MetricType.AREA_UNDER_ROC_CURVE.value in g2_averaged
    assert g2_averaged[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_hue2_auc]
    averaged_across_hues = m.average(across_hues=True)
    assert averaged_across_hues.get_hue_names() == [MetricsDict.DEFAULT_HUE_KEY]
    assert MetricType.AREA_UNDER_ROC_CURVE.value in averaged_across_hues.values()
    expected_averaged_auc = 0.5 * (expected_hue1_auc + expected_hue2_auc)
    assert averaged_across_hues.values()[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_averaged_auc]
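
# For reference (a sketch, not part of the original suite): the AUC expectations
# in the test above follow directly from scikit-learn, where a perfect ranking
# scores 1.0 and a fully inverted ranking scores 0.0.
def test_auc_reference_values() -> None:
    labels = np.array([0.0, 1.0])
    assert roc_auc_score(labels, np.array([0.0, 1.0])) == 1.0  # perfect ranking
    assert roc_auc_score(labels, np.array([1.0, 0.0])) == 0.0  # inverted ranking
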

def test_metrics_dict1() -> None:
    """
    Test insertion of scalar values into a MetricsDict.
    """
    m = MetricsDict()
    assert m.get_hue_names() == [MetricsDict.DEFAULT_HUE_KEY]
    name = "foo"
    v1 = 2.7
    v2 = 3.14
    m.add_metric(name, v1)
    m.add_metric(name, v2)
    assert m.values()[name] == [v1, v2]
    with pytest.raises(ValueError) as ex:
        # noinspection PyTypeChecker
        m.add_metric(name, [1.0])  # type: ignore
    assert "Expected the metric to be a scalar" in str(ex)
    assert m.skip_nan_when_averaging[name] is False
    v3 = 3.0
    name2 = "bar"
    m.add_metric(name2, v3, skip_nan_when_averaging=True)
    assert m.skip_nan_when_averaging[name2] is True
    # Expected average: metric "foo" averages over the two values v1 and v2; for "bar", only one value was inserted.
    average = m.average()
    mean_v1_v2 = mean([v1, v2])
    assert average.values() == {name: [mean_v1_v2], name2: [v3]}
    num_entries = m.num_entries()
    assert num_entries == {name: 2, name2: 1}

# The parametrize decorator was missing; the test body expects either no hues
# or exactly the hues "A" and "B", so both cases are supplied here.
@pytest.mark.parametrize("hues", [None, ["A", "B"]])
def test_metrics_dict_flatten(hues: Optional[List[str]]) -> None:
    m = MetricsDict(hues=hues)
    _hues = hues or [MetricsDict.DEFAULT_HUE_KEY] * 2
    m.add_metric("foo", 1.0, hue=_hues[0])
    m.add_metric("foo", 2.0, hue=_hues[1])
    m.add_metric("bar", 3.0, hue=_hues[0])
    m.add_metric("bar", 4.0, hue=_hues[1])
    if hues is None:
        average = m.average(across_hues=True)
        # We should be able to flatten out all the singleton values that the `average` operation returns
        all_values = list(average.enumerate_single_values())
        assert all_values == [(MetricsDict.DEFAULT_HUE_KEY, "foo", 1.5),
                              (MetricsDict.DEFAULT_HUE_KEY, "bar", 3.5)]
        # Trying to flatten a dictionary where a metric holds two values should fail:
        with pytest.raises(ValueError) as ex:
            list(m.enumerate_single_values())
        assert "only hold 1 item" in str(ex)
    else:
        average = m.average(across_hues=False)
        all_values = list(average.enumerate_single_values())
        assert all_values == [('A', 'foo', 1.0), ('A', 'bar', 3.0),
                              ('B', 'foo', 2.0), ('B', 'bar', 4.0)]

def test_metrics_dict_average_metrics_averaging() -> None:
    """
    Test that averaging skips NaN values when requested via skip_nan_when_averaging.
    """
    m = MetricsDict()
    metric1 = "foo"
    v1 = 1.0
    m.add_metric(metric1, v1)
    m.add_metric(metric1, np.nan, skip_nan_when_averaging=True)
    metric2 = "bar"
    v2 = 2.0
    m.add_metric(metric2, v2)
    m.add_metric(metric2, np.nan, skip_nan_when_averaging=False)
    average = m.average()
    assert average.values()[metric1] == [v1]
    assert np.isnan(average.values()[metric2])
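
# A minimal sketch of the averaging semantics the test above assumes: with
# skip_nan_when_averaging=True the mean behaves like np.nanmean (NaNs dropped),
# otherwise like np.mean (a single NaN poisons the result).
def test_nan_averaging_reference() -> None:
    values = [1.0, np.nan]
    assert np.nanmean(values) == 1.0  # NaNs skipped before averaging
    assert np.isnan(np.mean(values))  # NaN propagates through a plain mean
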

def test_metrics_dict_average_additional_metrics() -> None:
    """
    Test that MetricsDict.average() computes the ROC entries and the metrics
    at the optimal threshold correctly.
    """
    # Prepare a vector of predictions and labels.
    predictions = np.array([0.5, 0.6, 0.1, 0.8, 0.2, 0.9])
    # np.float was removed in NumPy 1.24; use the builtin float dtype instead.
    labels = np.array([0, 1.0, 0, 0, 1, 1], dtype=float)
    split_length = [3, 2, 1]

    # Get MetricsDict
    assert sum(split_length) == len(predictions)
    summed = np.cumsum(split_length)
    # MetricsDict will get that supplied in 3 chunks.
    m = MetricsDict()
    for i, end in enumerate(summed):
        start = 0 if i == 0 else summed[i - 1]
        pred = predictions[start:end]
        label = labels[start:end]
        subject_ids = list(map(str, range(len(pred))))
        m.add_predictions(subject_ids, pred, label)
    assert m.has_prediction_entries

    # Compute average MetricsDict
    averaged = m.average()

    # Compute the expected additional metrics for the averaged MetricsDict
    expected_auc = roc_auc_score(labels, predictions)
    expected_fpr, expected_tpr, thresholds = roc_curve(labels, predictions)
    # The optimal threshold maximizes Youden's J statistic (TPR - FPR).
    expected_optimal_idx = np.argmax(expected_tpr - expected_fpr)
    expected_optimal_threshold = float(thresholds[expected_optimal_idx])
    expected_accuracy = np.mean((predictions > expected_optimal_threshold) == labels)

    # Check computed values against expected
    assert averaged.values()[MetricType.OPTIMAL_THRESHOLD.value][0] == \
           pytest.approx(expected_optimal_threshold)
    assert averaged.values()[MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD.value][0] == \
           pytest.approx(expected_accuracy)
    assert averaged.values()[MetricType.FALSE_POSITIVE_RATE_AT_OPTIMAL_THRESHOLD.value][0] == \
           pytest.approx(expected_fpr[expected_optimal_idx])
    assert averaged.values()[MetricType.FALSE_NEGATIVE_RATE_AT_OPTIMAL_THRESHOLD.value][0] == \
           pytest.approx(1 - expected_tpr[expected_optimal_idx])
    assert averaged.values()[MetricType.AREA_UNDER_ROC_CURVE.value][0] == \
           pytest.approx(expected_auc, 1e-6)
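
# A closing cross-check one could add (a sketch, reusing only the API exercised
# above): supplying all predictions in a single add_predictions call should give
# the same AUC as the chunked insertion, since average() pools all entries
# before computing the ROC metrics.
def test_metrics_dict_single_shot_matches_chunked() -> None:
    predictions = np.array([0.5, 0.6, 0.1, 0.8, 0.2, 0.9])
    labels = np.array([0, 1.0, 0, 0, 1, 1], dtype=float)
    m = MetricsDict()
    m.add_predictions(list(map(str, range(len(predictions)))), predictions, labels)
    auc = m.average().values()[MetricType.AREA_UNDER_ROC_CURVE.value][0]
    assert auc == pytest.approx(roc_auc_score(labels, predictions), 1e-6)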