def test_load_metrics_from_df_with_hues(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if we can re-create a MetricsDict object with model predictions and labels, when the data file contains
    a prediction target value.
    """
    df_str = """prediction_target,epoch,subject,model_output,label,cross_validation_split_index,data_split
01,1,2137.00005,0.54349,1.0,0,Val
01,1,2137.00125,0.54324,0.0,1,Val
01,1,3250.00005,0.50822,0.0,0,Val
01,1,3250.12345,0.47584,0.0,1,Val
02,1,2137.00005,0.55538,1.0,0,Val
02,1,2137.00125,0.55759,0.0,1,Val
02,1,3250.00005,0.47255,0.0,0,Val
02,1,3250.12345,0.46996,0.0,1,Val
03,1,2137.00005,0.56670,1.0,0,Val
03,1,2137.00125,0.57003,0.0,1,Val
03,1,3250.00005,0.46321,0.0,0,Val
03,1,3250.12345,0.47309,0.0,1,Val
"""
    df = pd.read_csv(StringIO(df_str),
                     converters={LoggingColumns.Hue.value: lambda x: x})
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(
        df, is_classification_metrics=True)
    mode = ModelExecutionMode.VAL
    epoch = 1
    assert mode in metrics
    assert epoch in metrics[mode]
    metrics_dict = metrics[mode][epoch]
    expected_hues = ["01", "02", "03"]
    assert metrics_dict.get_hue_names(include_default=False) == expected_hues
    for hue in expected_hues:
        assert len(metrics_dict._get_hue(hue).get_predictions()) == 4
    logger_output_file = test_output_dirs.create_file_or_folder_path(
        "output.csv")
    logger = DataframeLogger(csv_path=logger_output_file)
    ScalarMetricsDict.aggregate_and_save_execution_mode_metrics(
        metrics, logger)
    output = pd.read_csv(logger_output_file, dtype=str)
    assert LoggingColumns.Hue.value in output
    assert list(output[LoggingColumns.Hue.value]) == expected_hues
    assert LoggingColumns.DataSplit.value in output
    assert list(output[LoggingColumns.DataSplit.value].unique()) == [
        ModelExecutionMode.VAL.value
    ]
    assert LoggingColumns.Epoch.value in output
    assert list(output[LoggingColumns.Epoch.value].unique()) == ["1"]
    assert LoggingColumns.AreaUnderPRCurve.value in output
    assert list(output[LoggingColumns.AreaUnderPRCurve.value]) == [
        '1.00000', '0.25000', '0.25000'
    ]
def test_load_metrics_from_df_with_hue() -> None:
    """
    Test loading of per-epoch predictions from a dataframe when the dataframe contains a prediction_target column.
    """
    hue_name = "foo"
    hues = [MetricsDict.DEFAULT_HUE_KEY] * 2 + [hue_name] * 2
    expected_epoch = 1
    expected_mode = ModelExecutionMode.VAL
    expected_labels = [1]
    expected_subjects = ["A"]
    model_outputs_1 = [0.1, 0.2]
    model_outputs_2 = [0.3, 0.4]
    test_df = pd.DataFrame.from_dict({
        LoggingColumns.Hue.value: hues,
        LoggingColumns.Epoch.value: [expected_epoch] * 4,
        LoggingColumns.DataSplit.value: [expected_mode.value] * 4,
        LoggingColumns.ModelOutput.value: model_outputs_1 + model_outputs_2,
        LoggingColumns.Label.value: expected_labels * 4,
        LoggingColumns.Patient.value: expected_subjects * 4
    })
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(
        test_df, is_classification_metrics=True)
    assert expected_mode in metrics
    assert expected_epoch in metrics[expected_mode]
    metrics_dict = metrics[expected_mode][expected_epoch]
    assert metrics_dict.get_hue_names(include_default=False) == [hue_name]
    assert metrics_dict.get_predictions().flatten().tolist() == model_outputs_1
    assert metrics_dict.get_predictions(
        hue=hue_name).flatten().tolist() == model_outputs_2
def test_load_metrics_from_df() -> None:
    """
    Test loading of per-epoch predictions and labels from a dataframe that has no prediction_target column.
    """
    expected_epochs = [1] * 2 + [2] * 2
    expected_modes = [ModelExecutionMode.VAL, ModelExecutionMode.TEST] * 2
    expected_labels = [1] * 4
    expected_subjects = ["A"] * 4

    test_df = pd.DataFrame.from_dict({
        LoggingColumns.Epoch.value: expected_epochs,
        LoggingColumns.DataSplit.value: [x.value for x in expected_modes],
        LoggingColumns.ModelOutput.value: [0.1, 0.2, 0.3, 0.4],
        LoggingColumns.Label.value: expected_labels,
        LoggingColumns.Patient.value: expected_subjects
    })
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(
        test_df, is_classification_metrics=True)
    for x in set(expected_modes):
        for e in set(expected_epochs):
            expected_df = test_df[
                (test_df[LoggingColumns.DataSplit.value] == x.value)
                & (test_df[LoggingColumns.Epoch.value] == e)]
            metrics_dict = metrics[x][e]
            assert np.all(expected_df[LoggingColumns.ModelOutput.value].values
                          == metrics_dict.get_predictions())
            assert np.all(expected_df[LoggingColumns.Label.value].values
                          == metrics_dict.get_labels())
            assert np.all(expected_df[LoggingColumns.Patient.value].values
                          == metrics_dict.subject_ids())
def create_metrics_dict_for_scalar_models(config: ScalarModelBase) -> ScalarMetricsDict:
    """
    Create a ScalarMetricsDict for the given model configuration, with one hue per target name.
    """
    return ScalarMetricsDict(
        hues=config.target_names,
        is_classification_metrics=config.is_classification_model)
def create_metrics_dict_for_scalar_models(config: ScalarModelBase) -> \
        Union[ScalarMetricsDict, SequenceMetricsDict]:
    """
    Create an instance of either a ScalarMetricsDict or SequenceMetricsDict, depending on whether the given
     configuration is a sequence model configuration or not.
    """
    if isinstance(config, SequenceModelBase):
        return SequenceMetricsDict.create(is_classification_model=config.is_classification_model,
                                          sequence_target_positions=config.sequence_target_positions)
    else:
        return ScalarMetricsDict(is_classification_metrics=config.is_classification_model)
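# A hedged illustration of the first factory variant above: for a multi-target classification
# configuration it is expected to return the equivalent of the directly constructed dictionary
# below. The target names are made up for this sketch.
example_metrics_dict = ScalarMetricsDict(hues=["target_0", "target_1"],
                                         is_classification_metrics=True)
assert example_metrics_dict.get_hue_names(include_default=False) == ["target_0", "target_1"]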
def test_metrics_dic_subject_ids() -> None:
    """
    Ensure that subject IDs added under a named hue are returned for that hue only, not for the default hue.
    """
    hue1 = "H1"
    m = ScalarMetricsDict(hues=[hue1], is_classification_metrics=True)
    m.add_predictions(subject_ids=['0'],
                      predictions=np.zeros(1),
                      labels=np.zeros(1),
                      hue=hue1)
    assert m.subject_ids() == []
    assert m.subject_ids(hue=hue1) == ['0']
def test_metrics_dict_per_subject() -> None:
    """
    Ensure that adding per-subject predictions can correctly handle subject IDs
    """
    hue1 = "H1"
    hue2 = "H2"
    m = ScalarMetricsDict(hues=[hue1, hue2], is_classification_metrics=True)
    m.add_predictions(["S1", "S2"],
                      np.array([0.0, 1.0]),
                      np.array([0.0, 1.0]),
                      hue=hue1)
    m.add_predictions(["S1", "S2"],
                      np.array([1.0, 0.0]),
                      np.array([0.0, 1.0]),
                      hue=hue2)
    predictions = m.get_predictions_and_labels_per_subject(hue=hue1)
    assert len(predictions) == 2
def _compute_scalar_metrics(
        output_values_list: List[List[float]],
        labels: List[List[float]],
        is_classification: bool,
        hues: Optional[List[str]] = None) -> ScalarMetricsDict:
    model_output = torch.tensor(output_values_list)
    _labels = torch.tensor(labels)
    if machine_has_gpu:
        _labels = _labels.cuda()
        model_output = model_output.cuda()
    metrics_dict = ScalarMetricsDict(
        hues=hues, is_classification_metrics=is_classification)
    subject_ids = list(map(str, range(model_output.shape[0])))
    loss_type = ScalarLoss.BinaryCrossEntropyWithLogits if is_classification else ScalarLoss.MeanSquaredError
    compute_scalar_metrics(metrics_dict,
                           subject_ids,
                           model_output,
                           _labels,
                           loss_type=loss_type)
    return metrics_dict
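# A hedged usage sketch of the helper above, assuming the same imports as the rest of this
# listing. Two subjects with two output channels, one hue per channel; the hue names
# "A" and "B" are arbitrary.
example_dict = _compute_scalar_metrics(output_values_list=[[0.8, 0.2], [0.3, 0.9]],
                                        labels=[[1.0, 0.0], [0.0, 1.0]],
                                        is_classification=True,
                                        hues=["A", "B"])
assert len(example_dict.subject_ids(hue="A")) == 2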
def plot_cross_validation_from_files(
        config_and_files: OfflineCrossvalConfigAndFiles,
        root_folder: Path,
        is_ensemble_run: bool = False) -> None:
    """
    Runs various plots for the results of a cross validation run, and writes them to a given folder.
    :param config_and_files: The setup for plotting results and the set of data files to analyse.
    :param root_folder: The folder into which the results should be written.
    :param is_ensemble_run: If True, assume that this run of cross validation analysis is for an ensemble model,
    and assert that there are N+1 data files available. If False, the analysis only concerns the cross validation
    runs, and the number of files is checked to be N.
    """
    config = config_and_files.config
    if config.number_of_cross_validation_splits > 1:
        check_result_file_counts(config_and_files,
                                 is_ensemble_run=is_ensemble_run)
    result_files = config_and_files.files
    metrics_dfs = load_dataframes(result_files, config)
    full_csv_file = root_folder / FULL_METRICS_DATAFRAME_FILE
    initial_metrics = pd.concat(list(metrics_dfs.values()))
    if config.model_category == ModelCategory.Segmentation:
        if config.create_plots:
            plot_metrics(config, metrics_dfs, root_folder)
        save_outliers(config, metrics_dfs, root_folder)
        all_metrics, focus_splits = add_comparison_data(
            config, initial_metrics)
        all_metrics.to_csv(full_csv_file, index=False)
        run_statistical_tests_on_file(root_folder, full_csv_file, config,
                                      focus_splits)
    else:
        # For classification runs, we also want to compute the aggregated training metrics for
        # each fold.
        metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(
            initial_metrics,
            config.model_category == ModelCategory.Classification)
        ScalarMetricsDict.aggregate_and_save_execution_mode_metrics(
            metrics=metrics,
            data_frame_logger=DataframeLogger(csv_path=root_folder /
                                              METRICS_AGGREGATES_FILE))
        # The full metrics file saves the predictions for each individual subject. Do not include the training
        # results in this file (in cross-validation, a subject appears in the training set of several folds).
        val_and_test_metrics = initial_metrics.loc[initial_metrics[
            LoggingColumns.DataSplit.value] != ModelExecutionMode.TRAIN.value]
        val_and_test_metrics.to_csv(full_csv_file, index=False)

        # Copy one instance of the dataset.CSV files to the root of the results folder. It is possible
        # that the different CV folds run with different dataset files, but not expected for classification
        # models at the moment (could change with ensemble models)
        dataset_csv = None
        for file in result_files:
            if file.dataset_csv_file:
                dataset_csv = file.dataset_csv_file
                break
        if dataset_csv:
            shutil.copy(str(dataset_csv), str(root_folder))
    name_dct = config_and_files.config.short_names
    if name_dct:
        pairs = [(val, key) for key, val in name_dct.items()]
        with Path(root_folder / RUN_DICTIONARY_NAME).open("w") as out:
            max_len = max(len(short_name) for short_name, _ in pairs)
            for short_name, long_name in sorted(pairs):
                out.write(f"{short_name:{max_len}s}    {long_name}\n")
def compute_scalar_metrics(metrics_dict: ScalarMetricsDict,
                           subject_ids: Sequence[str],
                           model_output: torch.Tensor,
                           labels: torch.Tensor,
                           loss_type: ScalarLoss = ScalarLoss.BinaryCrossEntropyWithLogits) -> None:
    """
    Computes various metrics for a binary classification task from real-valued model output and a label vector,
    and stores them in the given `metrics_dict`.
    The model output is assumed to be in the range between 0 and 1, a value larger than 0.5 indicates a prediction
    of class 1. The label vector is expected to contain class indices 0 and 1 only.
    Metrics are computed separately for each model output channel. For multi-channel outputs, a non-default hue
    per channel is expected and must exist in the provided metrics_dict; the default hue is used for single model
    outputs.
    :param metrics_dict: An object that holds all metrics. It will be updated in-place.
    :param subject_ids: Subject ids for the model output and labels.
    :param model_output: A tensor containing model outputs.
    :param labels: A tensor containing class labels.
    :param loss_type: The type of loss that the model uses. This is required to optionally convert 2-dim model output
    to probabilities.
    """
    _model_output_channels = model_output.shape[1]
    model_output_hues = metrics_dict.get_hue_names(include_default=len(metrics_dict.hues_without_default) == 0)

    if len(model_output_hues) < _model_output_channels:
        raise ValueError("Hues must be provided for each model output channel, found "
                         f"{_model_output_channels} channels but only {len(model_output_hues)} hues")

    for i, hue in enumerate(model_output_hues):
        # mask the model outputs and labels if required
        masked_model_outputs_and_labels = get_masked_model_outputs_and_labels(
            model_output[:, i, ...], labels[:, i, ...], subject_ids)

        # compute metrics on valid masked tensors only
        if masked_model_outputs_and_labels is not None:
            _model_output, _labels, _subject_ids = \
                masked_model_outputs_and_labels.model_outputs.data, \
                masked_model_outputs_and_labels.labels.data, \
                masked_model_outputs_and_labels.subject_ids
            # Convert labels to the same datatype as the model outputs, necessary when running with AMP
            _labels = _labels.to(dtype=_model_output.dtype)
            if loss_type == ScalarLoss.MeanSquaredError:
                metrics = {
                    MetricType.MEAN_SQUARED_ERROR: F.mse_loss(_model_output, _labels, reduction='mean').item(),
                    MetricType.MEAN_ABSOLUTE_ERROR: mean_absolute_error(_model_output, _labels),
                    MetricType.EXPLAINED_VAR: r2_score(_model_output, _labels)
                }
            else:
                metrics = {
                    MetricType.CROSS_ENTROPY: F.binary_cross_entropy(_model_output, _labels, reduction='mean').item(),
                    MetricType.ACCURACY_AT_THRESHOLD_05: binary_classification_accuracy(_model_output, _labels)
                }
            for key, value in metrics.items():
                if key == MetricType.EXPLAINED_VAR:
                    # For a batch of size 1, the R2 score can be nan. We need to ignore nans
                    # when averaging, in case the last batch has size 1.
                    metrics_dict.add_metric(key, value, skip_nan_when_averaging=True, hue=hue)
                else:
                    metrics_dict.add_metric(key, value, hue=hue)

            assert _subject_ids is not None
            metrics_dict.add_predictions(_subject_ids, _model_output.detach().cpu().numpy(),
                                         _labels.cpu().numpy(), hue=hue)
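# A hedged usage sketch of compute_scalar_metrics for a single-output classification model,
# assuming the same imports as the rest of this listing (torch, ScalarMetricsDict). With a
# single output channel, the metrics and predictions land in the default hue. Subject IDs
# and values are made up for illustration.
sketch_dict = ScalarMetricsDict(is_classification_metrics=True)
sketch_output = torch.tensor([[0.9], [0.2]])  # probabilities in [0, 1], shape [batch, channels]
sketch_labels = torch.tensor([[1.0], [0.0]])  # class indices 0/1, same shape
compute_scalar_metrics(sketch_dict, subject_ids=["S1", "S2"],
                       model_output=sketch_output, labels=sketch_labels)
assert len(sketch_dict.get_predictions()) == 2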