def test_load_metrics_from_df_with_hue() -> None:
    """
    Test loading of per-epoch predictions from a dataframe when the dataframe contains a prediction_target column.
    """
    custom_hue = "foo"
    epoch = 1
    mode = ModelExecutionMode.VAL
    default_outputs = [0.1, 0.2]
    hue_outputs = [0.3, 0.4]
    # Two rows for the default hue, followed by two rows for the custom hue.
    test_df = pd.DataFrame({
        LoggingColumns.Hue.value: [MetricsDict.DEFAULT_HUE_KEY] * 2 + [custom_hue] * 2,
        LoggingColumns.Epoch.value: [epoch] * 4,
        LoggingColumns.DataSplit.value: [mode.value] * 4,
        LoggingColumns.ModelOutput.value: default_outputs + hue_outputs,
        LoggingColumns.Label.value: [1] * 4,
        LoggingColumns.Patient.value: ["A"] * 4
    })
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(test_df, is_classification_metrics=True)
    # The result is keyed first by execution mode, then by epoch.
    assert mode in metrics
    assert epoch in metrics[mode]
    metrics_dict = metrics[mode][epoch]
    # The custom hue must be recognized, and the predictions must be split per hue.
    assert metrics_dict.get_hue_names(include_default=False) == [custom_hue]
    assert metrics_dict.get_predictions().flatten().tolist() == default_outputs
    assert metrics_dict.get_predictions(hue=custom_hue).flatten().tolist() == hue_outputs
def plot_cross_validation_from_files(config_and_files: OfflineCrossvalConfigAndFiles,
                                     root_folder: Path) -> None:
    """
    Create cross-validation reports (plots, outlier files, aggregated metrics, statistical tests)
    from a set of per-fold result files, writing all outputs into root_folder.

    :param config_and_files: The cross-validation configuration plus the per-fold result files.
    :param root_folder: Folder into which all report files are written.
    """
    config = config_and_files.config
    if config.number_of_cross_validation_splits > 1:
        check_result_file_counts(config_and_files)
    result_files = config_and_files.files
    metrics_dfs = load_dataframes(result_files, config)
    full_csv_file = root_folder / FULL_METRICS_DATAFRAME_FILE
    initial_metrics = pd.concat(list(metrics_dfs.values()))
    if config.model_category == ModelCategory.Segmentation:
        if config.create_plots:
            plot_metrics(config, metrics_dfs, root_folder)
        save_outliers(config, metrics_dfs, root_folder)
        all_metrics, focus_splits = add_comparison_data(config, initial_metrics)
        all_metrics.to_csv(full_csv_file, index=False)
        run_statistical_tests_on_file(root_folder, full_csv_file, config, focus_splits)
    else:
        # For classification runs, we also want to compute the aggregated training metrics for
        # each fold.
        metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(
            initial_metrics,
            config.model_category == ModelCategory.Classification)
        ScalarMetricsDict.aggregate_and_save_execution_mode_metrics(
            metrics=metrics,
            data_frame_logger=DataframeLogger(csv_path=root_folder / METRICS_AGGREGATES_FILE))
        # The full metrics file saves the prediction for each individual subject. Do not include the training
        # results in this file (as in cross-validation a subject is used in several folds.)
        val_and_test_metrics = initial_metrics.loc[
            initial_metrics[LoggingColumns.DataSplit.value] != ModelExecutionMode.TRAIN.value]
        val_and_test_metrics.to_csv(full_csv_file, index=False)
    # Copy one instance of the dataset.CSV files to the root of the results folder. It is possible
    # that the different CV folds run with different dataset files, but not expected for classification
    # models at the moment (could change with ensemble models)
    dataset_csv = None
    for file in result_files:
        if file.dataset_csv_file:
            dataset_csv = file.dataset_csv_file
            break
    if dataset_csv:
        shutil.copy(str(dataset_csv), str(root_folder))
    name_dct = config_and_files.config.short_names
    if name_dct:
        pairs = [(val, key) for key, val in name_dct.items()]
        with Path(root_folder / RUN_DICTIONARY_NAME).open("w") as out:
            max_len = max(len(short_name) for short_name, _ in pairs)
            for short_name, long_name in sorted(pairs):
                # BUG FIX: the format spec here was garbled ("13,490s"), which raises
                # "ValueError: Cannot specify ',' with 's'" at runtime, and left max_len unused.
                # Pad each short name to the longest short name so the long names line up.
                out.write(f"{short_name:{max_len}s}\t{long_name}\n")
def test_load_metrics_from_df_with_hues(test_output_dirs: TestOutputDirectories) -> None:
    """
    Test if we can re-create a MetricsDict object with model predictions and labels, when the data file
    contains a prediction target value.
    """
    df_str = """prediction_target,epoch,subject,model_output,label,cross_validation_split_index,data_split
01,1,2137.00005,0.54349,1.0,0,Val
01,1,2137.00125,0.54324,0.0,1,Val
01,1,3250.00005,0.50822,0.0,0,Val
01,1,3250.12345,0.47584,0.0,1,Val
02,1,2137.00005,0.55538,1.0,0,Val
02,1,2137.00125,0.55759,0.0,1,Val
02,1,3250.00005,0.47255,0.0,0,Val
02,1,3250.12345,0.46996,0.0,1,Val
03,1,2137.00005,0.56670,1.0,0,Val
03,1,2137.00125,0.57003,0.0,1,Val
03,1,3250.00005,0.46321,0.0,0,Val
03,1,3250.12345,0.47309,0.0,1,Val
"""
    # Keep the hue column as raw strings so "01" does not get parsed as the integer 1.
    input_df = pd.read_csv(StringIO(df_str), converters={LoggingColumns.Hue.value: lambda x: x})
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(input_df, is_classification_metrics=True)
    val_mode = ModelExecutionMode.VAL
    val_epoch = 1
    assert val_mode in metrics
    assert val_epoch in metrics[val_mode]
    metrics_dict = metrics[val_mode][val_epoch]
    expected_hues = ["01", "02", "03"]
    assert metrics_dict.get_hue_names(include_default=False) == expected_hues
    # Each prediction target has 4 rows in the fixture, hence 4 predictions per hue.
    for hue in expected_hues:
        assert len(metrics_dict._get_hue(hue).get_predictions()) == 4
    # Aggregate the metrics and write them out via the dataframe logger.
    logger_output_file = test_output_dirs.create_file_or_folder_path("output.csv")
    logger = DataframeLogger(csv_path=Path(logger_output_file))
    ScalarMetricsDict.aggregate_and_save_execution_mode_metrics(metrics, logger)
    output = pd.read_csv(logger_output_file, dtype=str)
    # The aggregate file must contain one row per hue, all from the VAL split at epoch 1.
    assert LoggingColumns.Hue.value in output
    assert list(output[LoggingColumns.Hue.value]) == expected_hues
    assert LoggingColumns.DataSplit.value in output
    assert list(output[LoggingColumns.DataSplit.value].unique()) == [ModelExecutionMode.VAL.value]
    assert LoggingColumns.Epoch.value in output
    assert list(output[LoggingColumns.Epoch.value].unique()) == ["1"]
    assert LoggingColumns.AreaUnderPRCurve.value in output
    assert list(output[LoggingColumns.AreaUnderPRCurve.value]) == ['1.00000', '0.25000', '0.25000']
def test_load_metrics_from_df() -> None:
    """
    Test loading of per-epoch predictions from a dataframe that has no prediction_target column:
    the resulting metrics must be grouped by execution mode and epoch, with predictions, labels and
    subject IDs matching the corresponding input rows.
    """
    expected_epochs = [1] * 2 + [2] * 2
    expected_modes = [ModelExecutionMode.VAL, ModelExecutionMode.TEST] * 2
    expected_labels = [1] * 4
    expected_subjects = ["A"] * 4
    test_df = pd.DataFrame.from_dict({
        LoggingColumns.Epoch.value: expected_epochs,
        LoggingColumns.DataSplit.value: [x.value for x in expected_modes],
        LoggingColumns.ModelOutput.value: [0.1, 0.2, 0.3, 0.4],
        LoggingColumns.Label.value: expected_labels,
        LoggingColumns.Patient.value: expected_subjects
    })
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(test_df, is_classification_metrics=True)
    for x in set(expected_modes):
        for e in set(expected_epochs):
            # Rows of the input dataframe that belong to this (mode, epoch) combination.
            expected_df = test_df[(test_df[LoggingColumns.DataSplit.value] == x.value)
                                  & (test_df[LoggingColumns.Epoch.value] == e)]
            metrics_dict = metrics[x][e]
            # BUG FIX: np.alltrue is deprecated and removed in NumPy 2.0; np.all is the
            # supported equivalent for these elementwise boolean comparisons.
            assert np.all(expected_df[LoggingColumns.ModelOutput.value].values == metrics_dict.get_predictions())
            assert np.all(expected_df[LoggingColumns.Label.value].values == metrics_dict.get_labels())
            assert np.all(expected_df[LoggingColumns.Patient.value].values == metrics_dict.subject_ids())