def test_metrics_dict_per_subject() -> None:
    """
    Ensure that adding per-subject predictions can correctly handle subject IDs.
    """
    hue1 = "H1"
    hue2 = "H2"
    m = ScalarMetricsDict(hues=[hue1, hue2], is_classification_metrics=True)
    m.add_predictions(["S1", "S2"], np.array([0.0, 1.0]), np.array([0.0, 1.0]), hue=hue1)
    m.add_predictions(["S1", "S2"], np.array([1.0, 0.0]), np.array([0.0, 1.0]), hue=hue2)
    predictions = m.get_predictions_and_labels_per_subject(hue=hue1)
    assert len(predictions) == 2

def plot_cross_validation_from_files(config_and_files: OfflineCrossvalConfigAndFiles,
                                     root_folder: Path) -> None:
    config = config_and_files.config
    if config.number_of_cross_validation_splits > 1:
        check_result_file_counts(config_and_files)
    result_files = config_and_files.files
    metrics_dfs = load_dataframes(result_files, config)
    full_csv_file = root_folder / FULL_METRICS_DATAFRAME_FILE
    initial_metrics = pd.concat(list(metrics_dfs.values()))
    if config.model_category == ModelCategory.Segmentation:
        if config.create_plots:
            plot_metrics(config, metrics_dfs, root_folder)
        save_outliers(config, metrics_dfs, root_folder)
        all_metrics, focus_splits = add_comparison_data(config, initial_metrics)
        all_metrics.to_csv(full_csv_file, index=False)
        run_statistical_tests_on_file(root_folder, full_csv_file, config, focus_splits)
    else:
        # For classification runs, we also want to compute the aggregated training metrics for
        # each fold.
        metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(
            initial_metrics,
            config.model_category == ModelCategory.Classification)
        ScalarMetricsDict.aggregate_and_save_execution_mode_metrics(
            metrics=metrics,
            data_frame_logger=DataframeLogger(csv_path=root_folder / METRICS_AGGREGATES_FILE))
        # The full metrics file saves the prediction for each individual subject. Do not include the
        # training results in this file (in cross-validation, a subject is used in several folds).
        val_and_test_metrics = initial_metrics.loc[
            initial_metrics[LoggingColumns.DataSplit.value] != ModelExecutionMode.TRAIN.value]
        val_and_test_metrics.to_csv(full_csv_file, index=False)
    # Copy one instance of the dataset.csv files to the root of the results folder. It is possible
    # that the different CV folds run with different dataset files, but this is not expected for
    # classification models at the moment (could change with ensemble models).
    dataset_csv = None
    for file in result_files:
        if file.dataset_csv_file:
            dataset_csv = file.dataset_csv_file
            break
    if dataset_csv:
        shutil.copy(str(dataset_csv), str(root_folder))
    name_dct = config_and_files.config.short_names
    if name_dct:
        pairs = [(val, key) for key, val in name_dct.items()]
        with Path(root_folder / RUN_DICTIONARY_NAME).open("w") as out:
            max_len = max(len(short_name) for short_name, _ in pairs)
            for short_name, long_name in sorted(pairs):
                out.write(f"{short_name:{max_len}s}    {long_name}\n")

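# Illustrative sketch only (run names below are made up, not taken from the repository):
# with config.short_names mapping {"HD_run_abc_0": "0", "HD_run_abc_1": "1"}, the run
# dictionary written above would contain one line per run, short name first, padded to a
# common width so the long run names line up:
#
#     0    HD_run_abc_0
#     1    HD_run_abc_1
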
def test_load_metrics_from_df_with_hue() -> None:
    """
    Test loading of per-epoch predictions from a dataframe when the dataframe contains a prediction_target column.
    """
    hue_name = "foo"
    hues = [MetricsDict.DEFAULT_HUE_KEY] * 2 + [hue_name] * 2
    expected_epoch = 1
    expected_mode = ModelExecutionMode.VAL
    expected_labels = [1]
    expected_subjects = ["A"]
    model_outputs_1 = [0.1, 0.2]
    model_outputs_2 = [0.3, 0.4]
    test_df = pd.DataFrame.from_dict({
        LoggingColumns.Hue.value: hues,
        LoggingColumns.Epoch.value: [expected_epoch] * 4,
        LoggingColumns.DataSplit.value: [expected_mode.value] * 4,
        LoggingColumns.ModelOutput.value: model_outputs_1 + model_outputs_2,
        LoggingColumns.Label.value: expected_labels * 4,
        LoggingColumns.Patient.value: expected_subjects * 4
    })
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(test_df, is_classification_metrics=True)
    assert expected_mode in metrics
    assert expected_epoch in metrics[expected_mode]
    metrics_dict = metrics[expected_mode][expected_epoch]
    assert metrics_dict.get_hue_names(include_default=False) == [hue_name]
    assert metrics_dict.get_predictions().flatten().tolist() == model_outputs_1
    assert metrics_dict.get_predictions(hue=hue_name).flatten().tolist() == model_outputs_2

def test_load_metrics_from_df_with_hues(test_output_dirs: TestOutputDirectories) -> None:
    """
    Test if we can re-create a MetricsDict object with model predictions and labels, when the data file
    contains a prediction target value.
    """
    df_str = """prediction_target,epoch,subject,model_output,label,cross_validation_split_index,data_split
01,1,2137.00005,0.54349,1.0,0,Val
01,1,2137.00125,0.54324,0.0,1,Val
01,1,3250.00005,0.50822,0.0,0,Val
01,1,3250.12345,0.47584,0.0,1,Val
02,1,2137.00005,0.55538,1.0,0,Val
02,1,2137.00125,0.55759,0.0,1,Val
02,1,3250.00005,0.47255,0.0,0,Val
02,1,3250.12345,0.46996,0.0,1,Val
03,1,2137.00005,0.56670,1.0,0,Val
03,1,2137.00125,0.57003,0.0,1,Val
03,1,3250.00005,0.46321,0.0,0,Val
03,1,3250.12345,0.47309,0.0,1,Val
"""
    df = pd.read_csv(StringIO(df_str), converters={LoggingColumns.Hue.value: lambda x: x})
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(df, is_classification_metrics=True)
    mode = ModelExecutionMode.VAL
    epoch = 1
    assert mode in metrics
    assert epoch in metrics[mode]
    metrics_dict = metrics[mode][epoch]
    expected_hues = ["01", "02", "03"]
    assert metrics_dict.get_hue_names(include_default=False) == expected_hues
    for hue in expected_hues:
        assert len(metrics_dict._get_hue(hue).get_predictions()) == 4
    logger_output_file = test_output_dirs.create_file_or_folder_path("output.csv")
    logger = DataframeLogger(csv_path=Path(logger_output_file))
    ScalarMetricsDict.aggregate_and_save_execution_mode_metrics(metrics, logger)
    output = pd.read_csv(logger_output_file, dtype=str)
    assert LoggingColumns.Hue.value in output
    assert list(output[LoggingColumns.Hue.value]) == expected_hues
    assert LoggingColumns.DataSplit.value in output
    assert list(output[LoggingColumns.DataSplit.value].unique()) == [ModelExecutionMode.VAL.value]
    assert LoggingColumns.Epoch.value in output
    assert list(output[LoggingColumns.Epoch.value].unique()) == ["1"]
    assert LoggingColumns.AreaUnderPRCurve.value in output
    assert list(output[LoggingColumns.AreaUnderPRCurve.value]) == ['1.00000', '0.25000', '0.25000']

def _compute_scalar_metrics(output_values_list: List[List[float]],
                            labels: List[List[float]],
                            is_classification: bool,
                            hues: Optional[List[str]] = None) -> ScalarMetricsDict:
    """
    Helper that builds a ScalarMetricsDict from raw model output values and labels, and fills it
    via compute_scalar_metrics, using the GPU if one is available.
    """
    model_output = torch.tensor(output_values_list)
    _labels = torch.tensor(labels)
    if machine_has_gpu:
        _labels = _labels.cuda()
        model_output = model_output.cuda()
    metrics_dict = ScalarMetricsDict(hues=hues, is_classification_metrics=is_classification)
    subject_ids = list(range(model_output.shape[0]))
    loss_type = ScalarLoss.BinaryCrossEntropyWithLogits if is_classification else ScalarLoss.MeanSquaredError
    compute_scalar_metrics(metrics_dict, subject_ids, model_output, _labels, loss_type=loss_type)
    return metrics_dict

def test_load_metrics_from_df() -> None:
    """
    Test that per-epoch, per-execution-mode predictions can be re-created from a metrics dataframe.
    """
    expected_epochs = [1] * 2 + [2] * 2
    expected_modes = [ModelExecutionMode.VAL, ModelExecutionMode.TEST] * 2
    expected_labels = [1] * 4
    expected_subjects = ["A"] * 4
    test_df = pd.DataFrame.from_dict({
        LoggingColumns.Epoch.value: expected_epochs,
        LoggingColumns.DataSplit.value: [x.value for x in expected_modes],
        LoggingColumns.ModelOutput.value: [0.1, 0.2, 0.3, 0.4],
        LoggingColumns.Label.value: expected_labels,
        LoggingColumns.Patient.value: expected_subjects
    })
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(test_df, is_classification_metrics=True)
    for x in set(expected_modes):
        for e in set(expected_epochs):
            expected_df = test_df[(test_df[LoggingColumns.DataSplit.value] == x.value)
                                  & (test_df[LoggingColumns.Epoch.value] == e)]
            metrics_dict = metrics[x][e]
            assert np.alltrue(expected_df[LoggingColumns.ModelOutput.value].values == metrics_dict.get_predictions())
            assert np.alltrue(expected_df[LoggingColumns.Label.value].values == metrics_dict.get_labels())
            assert np.alltrue(expected_df[LoggingColumns.Patient.value].values == metrics_dict.subject_ids())

def test_metrics_dict_subject_ids() -> None:
    """
    Test that subject IDs are stored per hue: the default hue stays empty when predictions
    are only added for a named hue.
    """
    hue1 = "H1"
    m = ScalarMetricsDict(hues=[hue1], is_classification_metrics=True)
    m.add_predictions(subject_ids=[0], predictions=np.zeros(1), labels=np.zeros(1), hue=hue1)
    assert m.subject_ids() == []
    assert m.subject_ids(hue=hue1) == [0]

def compute_scalar_metrics(metrics_dict: ScalarMetricsDict,
                           subject_ids: Sequence[str],
                           model_output: torch.Tensor,
                           labels: torch.Tensor,
                           loss_type: ScalarLoss = ScalarLoss.BinaryCrossEntropyWithLogits) -> None:
    """
    Computes various metrics for a binary classification task from real-valued model output and a label vector,
    and stores them in the given `metrics_dict`. The model output is assumed to be in the range between 0 and 1,
    and a value larger than 0.5 indicates a prediction of class 1. The label vector is expected to contain class
    indices 0 and 1 only. Metrics are computed separately for each model output channel: a non-default hue is
    expected for each channel and must exist in the provided metrics_dict. The default hue is used for single
    model outputs.

    :param metrics_dict: An object that holds all metrics. It will be updated in-place.
    :param subject_ids: Subject ids for the model output and labels.
    :param model_output: A tensor containing model outputs.
    :param labels: A tensor containing class labels.
    :param loss_type: The type of loss that the model uses. This is required to optionally convert 2-dim model
        output to probabilities.
    """
    _model_output_channels = model_output.shape[1]
    model_output_hues = metrics_dict.get_hue_names(
        include_default=len(metrics_dict.hues_without_default) == 0)

    if len(model_output_hues) < _model_output_channels:
        raise ValueError("Hues must be provided for each model output channel, found "
                         f"{_model_output_channels} channels but only {len(model_output_hues)} hues")

    for i, hue in enumerate(model_output_hues):
        # mask the model outputs and labels if required
        masked_model_outputs_and_labels = get_masked_model_outputs_and_labels(
            model_output[:, i, ...], labels[:, i, ...], subject_ids)

        # compute metrics on valid masked tensors only
        if masked_model_outputs_and_labels is not None:
            _model_output, _labels, _subject_ids = \
                masked_model_outputs_and_labels.model_outputs.data, \
                masked_model_outputs_and_labels.labels.data, \
                masked_model_outputs_and_labels.subject_ids
            if loss_type == ScalarLoss.MeanSquaredError:
                metrics = {
                    MetricType.MEAN_SQUARED_ERROR: F.mse_loss(_model_output, _labels.float(),
                                                              reduction='mean').item(),
                    MetricType.MEAN_ABSOLUTE_ERROR: mean_absolute_error(_model_output, _labels),
                    MetricType.R2_SCORE: r2_score(_model_output, _labels)
                }
            else:
                metrics = {
                    MetricType.CROSS_ENTROPY: F.binary_cross_entropy(_model_output, _labels.float(),
                                                                     reduction='mean').item(),
                    MetricType.ACCURACY_AT_THRESHOLD_05: binary_classification_accuracy(_model_output, _labels)
                }
            for key, value in metrics.items():
                if key == MetricType.R2_SCORE:
                    # For a batch of size 1, the R2 score can be nan. We need to ignore nans
                    # when averaging, in case the last batch is of size 1.
                    metrics_dict.add_metric(key, value, skip_nan_when_averaging=True, hue=hue)
                else:
                    metrics_dict.add_metric(key, value, hue=hue)
            assert _subject_ids is not None
            metrics_dict.add_predictions(_subject_ids, _model_output.detach().cpu().numpy(),
                                         _labels.cpu().numpy(), hue=hue)
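
# Minimal usage sketch (values and hue names below are illustrative, not taken from the repository):
# a two-channel classification output requires one named hue per output channel, as described in the
# docstring of compute_scalar_metrics above.
#
#     metrics_dict = ScalarMetricsDict(hues=["Target1", "Target2"], is_classification_metrics=True)
#     model_output = torch.tensor([[0.1, 0.9],
#                                  [0.8, 0.3]])  # shape [batch=2, channels=2], values in [0, 1]
#     labels = torch.tensor([[0.0, 1.0],
#                            [1.0, 0.0]])
#     compute_scalar_metrics(metrics_dict, subject_ids=["S1", "S2"],
#                            model_output=model_output, labels=labels,
#                            loss_type=ScalarLoss.BinaryCrossEntropyWithLogits)
#
# After the call, per-hue metrics (cross entropy, accuracy at threshold 0.5) and per-subject
# predictions are stored in metrics_dict under "Target1" and "Target2".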