def test_classification_metrics_avg() -> None:
    """
    Test that classification metrics (AUC, subject counts, scalar metrics) are averaged per hue and across hues.
    """
    hue1 = "H1"
    hue2 = "H2"
    m = MetricsDict(hues=[hue1, hue2], is_classification_metrics=True)
    m.add_metric("foo", 1.0)
    m.add_metric("foo", 2.0)
    # Perfect predictions for hue1, should give AUC == 1.0
    m.add_predictions(["S1", "S2"], np.array([0.0, 1.0]), np.array([0.0, 1.0]), hue=hue1)
    expected_hue1_auc = 1.0
    # Worst possible predictions for hue2, should give AUC == 0.0
    m.add_predictions(["S1", "S2"], np.array([1.0, 0.0]), np.array([0.0, 1.0]), hue=hue2)
    expected_hue2_auc = 0.0
    averaged = m.average(across_hues=False)
    g1_averaged = averaged.values(hue=hue1)
    assert MetricType.AREA_UNDER_ROC_CURVE.value in g1_averaged
    assert g1_averaged[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_hue1_auc]
    assert MetricType.AREA_UNDER_PR_CURVE.value in g1_averaged
    assert MetricType.SUBJECT_COUNT.value in g1_averaged
    assert g1_averaged[MetricType.SUBJECT_COUNT.value] == [2.0]
    default_averaged = averaged.values()
    assert default_averaged == {"foo": [1.5]}
    can_enumerate = list(averaged.enumerate_single_values())
    assert len(can_enumerate) >= 8
    assert can_enumerate[0] == (hue1, MetricType.AREA_UNDER_ROC_CURVE.value, 1.0)
    assert can_enumerate[-1] == (MetricsDict.DEFAULT_HUE_KEY, "foo", 1.5)
    g2_averaged = averaged.values(hue=hue2)
    assert MetricType.AREA_UNDER_ROC_CURVE.value in g2_averaged
    assert g2_averaged[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_hue2_auc]
    averaged_across_hues = m.average(across_hues=True)
    assert averaged_across_hues.get_hue_names() == [MetricsDict.DEFAULT_HUE_KEY]
    assert MetricType.AREA_UNDER_ROC_CURVE.value in averaged_across_hues.values()
    expected_averaged_auc = 0.5 * (expected_hue1_auc + expected_hue2_auc)
    assert averaged_across_hues.values()[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_averaged_auc]

def test_metrics_dict1() -> None:
    """
    Test insertion of scalar values into a MetricsDict.
    """
    m = MetricsDict()
    assert m.get_hue_names() == [MetricsDict.DEFAULT_HUE_KEY]
    name = "foo"
    v1 = 2.7
    v2 = 3.14
    m.add_metric(name, v1)
    m.add_metric(name, v2)
    assert m.values()[name] == [v1, v2]
    with pytest.raises(ValueError) as ex:
        # noinspection PyTypeChecker
        m.add_metric(name, [1.0])  # type: ignore
    assert "Expected the metric to be a scalar" in str(ex)
    assert m.skip_nan_when_averaging[name] is False
    v3 = 3.0
    name2 = "bar"
    m.add_metric(name2, v3, skip_nan_when_averaging=True)
    assert m.skip_nan_when_averaging[name2] is True
    # Expected average: metric "foo" averages over the two values v1 and v2; for "bar", only a single value was added.
    average = m.average()
    mean_v1_v2 = mean([v1, v2])
    assert average.values() == {name: [mean_v1_v2], name2: [v3]}
    num_entries = m.num_entries()
    assert num_entries == {name: 2, name2: 1}

def aggregate_segmentation_metrics(metrics: MetricsDict) -> MetricsDict:
    """
    Computes aggregate metrics for segmentation models, from a metrics dictionary that contains the results for
    individual minibatches. Specifically, it computes the average Dice score across the foreground structures only,
    and the proportion of voxels that fall into each foreground class. All metrics for the background class are
    removed. All other metrics present in the input are averaged and available in the result. Diagnostic values
    present in the input are passed through unchanged.
    :param metrics: A metrics dictionary that contains the per-minibatch results.
    """
    class_names_with_background = metrics.get_hue_names(include_default=False)
    has_background_class = class_names_with_background[0] == BACKGROUND_CLASS_NAME
    foreground_classes = class_names_with_background[1:] if has_background_class else class_names_with_background
    result = metrics.average(across_hues=False)
    result.diagnostics = metrics.diagnostics.copy()
    if has_background_class:
        result.delete_hue(BACKGROUND_CLASS_NAME)
    add_average_foreground_dice(result)
    # Total number of voxels per class, including the background class
    total_voxels = []
    voxel_count = MetricType.VOXEL_COUNT.value
    for g in class_names_with_background:
        values = metrics.values(hue=g)
        if voxel_count in values:
            total_voxels.append(sum(values[voxel_count]))
    if len(total_voxels) > 0:
        # Proportion of voxels in the foreground classes only, relative to the total voxel count
        proportion_foreground = np.array(total_voxels[1:], dtype=float) / sum(total_voxels)
        for i, foreground_class in enumerate(foreground_classes):
            result.add_metric(MetricType.PROPORTION_FOREGROUND_VOXELS, proportion_foreground[i],
                              hue=foreground_class)
        result.add_metric(MetricType.PROPORTION_FOREGROUND_VOXELS, np.sum(proportion_foreground).item())
    return result

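# Illustrative sketch: this spells out the arithmetic behind the PROPORTION_FOREGROUND_VOXELS values computed
# above. The test name, class layout, and voxel totals are hypothetical example values, not real model output.
def test_proportion_foreground_voxels_arithmetic() -> None:
    """
    Checks the proportion-of-foreground-voxels arithmetic on made-up per-class voxel totals.
    """
    # Hypothetical per-class voxel totals, background first, summed over all minibatches.
    total_voxels = [800, 150, 50]
    # Foreground proportions are taken relative to the total voxel count including the background.
    proportion_foreground = np.array(total_voxels[1:], dtype=float) / sum(total_voxels)
    assert proportion_foreground.tolist() == pytest.approx([0.15, 0.05])
    # The value reported for the default hue is the sum over all foreground classes.
    assert np.sum(proportion_foreground).item() == pytest.approx(0.2)
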
@pytest.mark.parametrize("hues", [None, ["A", "B"]])
def test_metrics_dict_flatten(hues: Optional[List[str]]) -> None:
    """
    Test that enumerate_single_values flattens an averaged MetricsDict, both with and without hues.
    """
    m = MetricsDict(hues=hues)
    _hues = hues or [MetricsDict.DEFAULT_HUE_KEY] * 2
    m.add_metric("foo", 1.0, hue=_hues[0])
    m.add_metric("foo", 2.0, hue=_hues[1])
    m.add_metric("bar", 3.0, hue=_hues[0])
    m.add_metric("bar", 4.0, hue=_hues[1])
    if hues is None:
        average = m.average(across_hues=True)
        # We should be able to flatten out all the singleton values that the `average` operation returns
        all_values = list(average.enumerate_single_values())
        assert all_values == [(MetricsDict.DEFAULT_HUE_KEY, "foo", 1.5), (MetricsDict.DEFAULT_HUE_KEY, "bar", 3.5)]
        # Trying to flatten a dictionary that holds two values per metric should fail:
        with pytest.raises(ValueError) as ex:
            list(m.enumerate_single_values())
        assert "only hold 1 item" in str(ex)
    else:
        average = m.average(across_hues=False)
        all_values = list(average.enumerate_single_values())
        assert all_values == [('A', 'foo', 1.0), ('A', 'bar', 3.0),
                              ('B', 'foo', 2.0), ('B', 'bar', 4.0)]

def test_metrics_dict_average_metrics_averaging() -> None:
    """
    Test that averaging skips NaN values when requested, and propagates them otherwise.
    """
    m = MetricsDict()
    metric1 = "foo"
    v1 = 1.0
    m.add_metric(metric1, v1)
    m.add_metric(metric1, np.nan, skip_nan_when_averaging=True)
    metric2 = "bar"
    v2 = 2.0
    m.add_metric(metric2, v2)
    m.add_metric(metric2, np.nan, skip_nan_when_averaging=False)
    average = m.average()
    assert average.values()[metric1] == [v1]
    assert np.isnan(average.values()[metric2])

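# Illustrative note: the two assertions above mirror plain numpy NaN handling, i.e. skip_nan_when_averaging=True
# behaves like np.nanmean and the default behaves like np.mean. This is an analogy for the expected arithmetic,
# not a statement about the actual implementation; the test name is made up for illustration.
def test_nan_averaging_numpy_analogy() -> None:
    assert np.nanmean([1.0, np.nan]) == 1.0
    assert np.isnan(np.mean([1.0, np.nan]))
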
def test_metrics_dict_average_additional_metrics() -> None:
    """
    Test that computing the ROC entries and the metrics at the optimal threshold with MetricsDict.average()
    works as expected and returns the correct values.
    """
    # Prepare a vector of predictions and labels.
    predictions = np.array([0.5, 0.6, 0.1, 0.8, 0.2, 0.9])
    labels = np.array([0, 1.0, 0, 0, 1, 1], dtype=float)
    split_length = [3, 2, 1]

    # Build the MetricsDict: the predictions are supplied in 3 chunks.
    assert sum(split_length) == len(predictions)
    summed = np.cumsum(split_length)
    m = MetricsDict()
    for i, end in enumerate(summed):
        start = 0 if i == 0 else summed[i - 1]
        pred = predictions[start:end]
        label = labels[start:end]
        subject_ids = list(range(len(pred)))
        m.add_predictions(subject_ids, pred, label)
    assert m.has_prediction_entries

    # Compute the averaged MetricsDict
    averaged = m.average()

    # Compute the additional expected metrics for the averaged MetricsDict
    expected_auc = roc_auc_score(labels, predictions)
    expected_fpr, expected_tpr, thresholds = roc_curve(labels, predictions)
    expected_optimal_idx = np.argmax(expected_tpr - expected_fpr)
    expected_optimal_threshold = float(thresholds[expected_optimal_idx])
    expected_accuracy = np.mean((predictions > expected_optimal_threshold) == labels)

    # Check the computed values against the expected values
    assert averaged.values()[MetricType.OPTIMAL_THRESHOLD.value][0] == pytest.approx(expected_optimal_threshold)
    assert averaged.values()[MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD.value][0] == pytest.approx(expected_accuracy)
    assert averaged.values()[MetricType.FALSE_POSITIVE_RATE_AT_OPTIMAL_THRESHOLD.value][0] == \
        pytest.approx(expected_fpr[expected_optimal_idx])
    assert averaged.values()[MetricType.FALSE_NEGATIVE_RATE_AT_OPTIMAL_THRESHOLD.value][0] == \
        pytest.approx(1 - expected_tpr[expected_optimal_idx])
    assert averaged.values()[MetricType.AREA_UNDER_ROC_CURVE.value][0] == pytest.approx(expected_auc, 1e-6)

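# Illustrative sketch: the "optimal threshold" asserted above is the ROC threshold that maximises Youden's J
# statistic (TPR - FPR). The test name, labels, and predictions below are made-up example values.
def test_optimal_threshold_youden_illustration() -> None:
    labels = np.array([0, 0, 0, 1, 1], dtype=float)
    predictions = np.array([0.1, 0.2, 0.6, 0.5, 0.9])
    fpr, tpr, thresholds = roc_curve(labels, predictions)
    optimal_idx = np.argmax(tpr - fpr)
    # At threshold 0.5 both positives are recovered (TPR == 1.0) at the cost of a single false positive
    # (FPR == 1/3), which maximises TPR - FPR for this toy data.
    assert float(thresholds[optimal_idx]) == pytest.approx(0.5)
    assert tpr[optimal_idx] == pytest.approx(1.0)
    assert fpr[optimal_idx] == pytest.approx(1 / 3)
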