def get_epoch_path(mode: ModelExecutionMode) -> Path:
    p = get_epoch_results_path(mode=mode)
    return config.outputs_folder / p / SUBJECT_METRICS_FILE_NAME
def test_train_classification_model(
        class_name: str, test_output_dirs: OutputFolderForTests) -> None:
    """
    Test training and testing of classification models, asserting on the individual results from training and
    testing.
    Expected test results are stored for GPU with and without mixed precision.
    """
    logging_to_stdout(logging.DEBUG)
    config = ClassificationModelForTesting()
    config.class_names = [class_name]
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=Path(test_output_dirs.root_dir))
    # Train for 4 epochs, checkpoints at epochs 2 and 4
    config.num_epochs = 4
    model_training_result = model_training.model_train(
        config, checkpoint_handler=checkpoint_handler)
    assert model_training_result is not None
    expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
    expected_train_loss = [0.686614, 0.686465, 0.686316, 0.686167]
    expected_val_loss = [0.737061, 0.736691, 0.736321, 0.735952]
    # Ensure that all metrics are computed on both training and validation set
    assert len(
        model_training_result.train_results_per_epoch) == config.num_epochs
    assert len(
        model_training_result.val_results_per_epoch) == config.num_epochs
    assert len(model_training_result.train_results_per_epoch[0]) >= 11
    assert len(model_training_result.val_results_per_epoch[0]) >= 11

    for metric in [
            MetricType.ACCURACY_AT_THRESHOLD_05,
            MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
            MetricType.AREA_UNDER_PR_CURVE, MetricType.AREA_UNDER_ROC_CURVE,
            MetricType.CROSS_ENTROPY, MetricType.LOSS,
            MetricType.SECONDS_PER_BATCH, MetricType.SECONDS_PER_EPOCH,
            MetricType.SUBJECT_COUNT
    ]:
        assert metric.value in model_training_result.train_results_per_epoch[0], \
            f"{metric.value} not in training"
        assert metric.value in model_training_result.val_results_per_epoch[0], \
            f"{metric.value} not in validation"

    actual_train_loss = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LOSS.value)
    actual_val_loss = model_training_result.get_metric(
        is_training=False, metric_type=MetricType.LOSS.value)
    actual_lr = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LEARNING_RATE.value)
    assert actual_train_loss == pytest.approx(expected_train_loss,
                                              abs=1e-6), "Training loss"
    assert actual_val_loss == pytest.approx(expected_val_loss,
                                            abs=1e-6), "Validation loss"
    assert actual_lr == pytest.approx(expected_learning_rates,
                                      rel=1e-5), "Learning rates"
    test_results = model_testing.model_test(
        config,
        ModelExecutionMode.TRAIN,
        checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)
    expected_metrics = [0.636085, 0.735952]
    assert test_results.metrics.values(class_name)[MetricType.CROSS_ENTROPY.value] == \
           pytest.approx(expected_metrics, abs=1e-5)
    # Run the detailed log file check only on CPU; the logs contain slightly different metrics on GPU. Here we
    # mainly want to assert that the generated files look reasonable.
    if machine_has_gpu:
        return

    # Check epoch_metrics.csv
    epoch_metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / EPOCH_METRICS_FILE_NAME
    # Auto-format will break the long header line, hence the strange way of writing it!
    expected_epoch_metrics = \
        f"{LoggingColumns.Loss.value},{LoggingColumns.CrossEntropy.value}," \
        f"{LoggingColumns.AccuracyAtThreshold05.value},{LoggingColumns.LearningRate.value}," + \
        f"{LoggingColumns.AreaUnderRocCurve.value}," \
        f"{LoggingColumns.AreaUnderPRCurve.value}," \
        f"{LoggingColumns.AccuracyAtOptimalThreshold.value}," \
        f"{LoggingColumns.FalsePositiveRateAtOptimalThreshold.value}," \
        f"{LoggingColumns.FalseNegativeRateAtOptimalThreshold.value}," \
        f"{LoggingColumns.OptimalThreshold.value}," \
        f"{LoggingColumns.SubjectCount.value},{LoggingColumns.Epoch.value}," \
        f"{LoggingColumns.CrossValidationSplitIndex.value}\n" + \
        """0.6866141557693481,0.6866141557693481,0.5,0.0001,1.0,1.0,0.5,0.0,0.0,0.529514,2.0,0,-1	
        0.6864652633666992,0.6864652633666992,0.5,9.999712322065557e-05,1.0,1.0,0.5,0.0,0.0,0.529475,2.0,1,-1	
        0.6863163113594055,0.6863162517547607,0.5,9.999306876841536e-05,1.0,1.0,0.5,0.0,0.0,0.529437,2.0,2,-1	
        0.6861673593521118,0.6861673593521118,0.5,9.998613801725043e-05,1.0,1.0,0.5,0.0,0.0,0.529399,2.0,3,-1	
        """
    check_log_file(epoch_metrics_path,
                   expected_epoch_metrics,
                   ignore_columns=[])
    # Check metrics.csv: This contains the per-subject per-epoch model outputs
    # Randomization comes out slightly different on Windows, hence only execute the test on Linux
    if common_util.is_windows():
        return
    metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / SUBJECT_METRICS_FILE_NAME
    metrics_expected = \
        f"""epoch,subject,prediction_target,model_output,label,data_split,cross_validation_split_index
0,S2,{class_name},0.529514,1,Train,-1
0,S4,{class_name},0.521659,0,Train,-1
1,S4,{class_name},0.521482,0,Train,-1
1,S2,{class_name},0.529475,1,Train,-1
2,S4,{class_name},0.521305,0,Train,-1
2,S2,{class_name},0.529437,1,Train,-1
3,S2,{class_name},0.529399,1,Train,-1
3,S4,{class_name},0.521128,0,Train,-1
"""
    check_log_file(metrics_path, metrics_expected, ignore_columns=[])
    # Check the log file METRICS_FILE_NAME inside the folder epoch_004/Train, which is written when we run model_test.
    # Normally, we would run it on the Test and Val splits, but for convenience we test on the Train split here.
    inference_metrics_path = config.outputs_folder / get_epoch_results_path(ModelExecutionMode.TRAIN) / \
                             SUBJECT_METRICS_FILE_NAME
    inference_metrics_expected = \
        f"""prediction_target,subject,model_output,label,cross_validation_split_index,data_split
{class_name},S2,0.5293986201286316,1.0,-1,Train
{class_name},S4,0.5211275815963745,0.0,-1,Train
"""
    check_log_file(inference_metrics_path,
                   inference_metrics_expected,
                   ignore_columns=[])
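This variant of the test takes a class_name argument, which suggests it is driven by pytest parametrization; the decorator itself is not visible in this listing. A minimal sketch of such a setup, with placeholder values ("Default" appears in the expected CSVs further down, the second value is purely illustrative):
import pytest


# Hypothetical parametrization over the class name; test_output_dirs is the
# output-folder fixture used throughout this listing.
@pytest.mark.parametrize("class_name", ["Default", "SomeOtherClass"])
def test_train_classification_model(class_name: str, test_output_dirs) -> None:
    ...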
Example #3
def test_model_test(test_output_dirs: OutputFolderForTests) -> None:
    train_and_test_data_dir = full_ml_test_data_path("train_and_test_data")

    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    epoch = 1
    config.num_epochs = epoch
    assert config.get_test_epochs() == [epoch]
    placeholder_dataset_id = "place_holder_dataset_id"
    config.azure_dataset_id = placeholder_dataset_id
    transform = config.get_full_image_sample_transforms().test
    df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME))
    df = df[df.subject.isin([1, 2])]
    # noinspection PyTypeHints
    config._datasets_for_inference = \
        {ModelExecutionMode.TEST: FullImageDataset(config, df, full_image_sample_transforms=transform)}  # type: ignore
    execution_mode = ModelExecutionMode.TEST
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    # Mimic the behaviour of checkpoints having been downloaded from blob storage into the checkpoints folder.
    stored_checkpoints = full_ml_test_data_path("checkpoints")
    shutil.copytree(str(stored_checkpoints), str(config.checkpoint_folder))
    checkpoint_handler.additional_training_done()
    inference_results = model_testing.segmentation_model_test(config,
                                                              data_split=execution_mode,
                                                              checkpoint_handler=checkpoint_handler)
    epoch_dir = config.outputs_folder / get_epoch_results_path(epoch, execution_mode)
    assert inference_results.epochs[epoch] == pytest.approx(0.66606902, abs=1e-6)

    assert config.outputs_folder.is_dir()
    assert epoch_dir.is_dir()
    patient1 = io_util.load_nifti_image(train_and_test_data_dir / "id1_channel1.nii.gz")
    patient2 = io_util.load_nifti_image(train_and_test_data_dir / "id2_channel1.nii.gz")

    assert_file_contains_string(epoch_dir / DATASET_ID_FILE, placeholder_dataset_id)
    assert_file_contains_string(epoch_dir / GROUND_TRUTH_IDS_FILE, "region")
    assert_text_files_match(epoch_dir / model_testing.METRICS_FILE_NAME,
                            train_and_test_data_dir / model_testing.METRICS_FILE_NAME)
    assert_text_files_match(epoch_dir / model_testing.METRICS_AGGREGATES_FILE,
                            train_and_test_data_dir / model_testing.METRICS_AGGREGATES_FILE)
    # Plotting results vary between platforms. Can only check if the file is generated, but not its contents.
    assert (epoch_dir / model_testing.BOXPLOT_FILE).exists()

    assert_nifti_content(epoch_dir / "001" / "posterior_region.nii.gz", get_image_shape(patient1),
                         patient1.header,
                         [136], np.ubyte)
    assert_nifti_content(epoch_dir / "002" / "posterior_region.nii.gz", get_image_shape(patient2),
                         patient2.header,
                         [136], np.ubyte)
    assert_nifti_content(epoch_dir / "001" / DEFAULT_RESULT_IMAGE_NAME, get_image_shape(patient1),
                         patient1.header,
                         [1], np.ubyte)
    assert_nifti_content(epoch_dir / "002" / DEFAULT_RESULT_IMAGE_NAME, get_image_shape(patient2),
                         patient2.header,
                         [1], np.ubyte)
    assert_nifti_content(epoch_dir / "001" / "posterior_background.nii.gz", get_image_shape(patient1),
                         patient1.header,
                         [118], np.ubyte)
    assert_nifti_content(epoch_dir / "002" / "posterior_background.nii.gz", get_image_shape(patient2),
                         patient2.header,
                         [118], np.ubyte)
    thumbnails_folder = epoch_dir / model_testing.THUMBNAILS_FOLDER
    assert thumbnails_folder.is_dir()
    png_files = list(thumbnails_folder.glob("*.png"))
    overlays = [f for f in png_files if "_region_slice_" in str(f)]
    assert len(overlays) == len(df.subject.unique()), "There should be one overlay/contour file per subject"

    # Writing dataset.csv normally happens at the beginning of training,
    # but this test reads off a saved checkpoint file.
    # Dataset.csv must be present for plot_cross_validation.
    config.write_dataset_files()
    # Test if the metrics files can be picked up correctly by the cross validation code
    config_and_files = get_config_and_results_for_offline_runs(config)
    result_files = config_and_files.files
    assert len(result_files) == 1
    for file in result_files:
        assert file.execution_mode == execution_mode
        assert file.dataset_csv_file is not None
        assert file.dataset_csv_file.exists()
        assert file.metrics_file is not None
        assert file.metrics_file.exists()
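assert_nifti_content checks the shape, header and the set of voxel values of the NIfTI files written during inference. A rough stand-in for the voxel-value part, assuming plain nibabel loading (check_nifti_unique_values is a hypothetical helper, not part of the repository):
import nibabel as nib
import numpy as np
from pathlib import Path
from typing import Sequence


def check_nifti_unique_values(path: Path, expected_unique: Sequence[float]) -> None:
    # Load the written image and verify that it contains exactly the expected voxel values.
    data = nib.load(str(path)).get_fdata()
    assert np.array_equal(np.unique(data), np.asarray(expected_unique, dtype=float))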
Example #4
def test_train_classification_model(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test training and testing of classification models, asserting on the individual results from training and
    testing.
    Expected test results are stored for GPU with and without mixed precision.
    """
    logging_to_stdout(logging.DEBUG)
    config = ClassificationModelForTesting()
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=Path(test_output_dirs.root_dir))
    # Train for 4 epochs, checkpoints at epochs 2 and 4
    config.num_epochs = 4
    model_training_result = model_training.model_train(
        config, checkpoint_handler=checkpoint_handler)
    assert model_training_result is not None
    expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
    expected_train_loss = [0.686614, 0.686465, 0.686316, 0.686167]
    expected_val_loss = [0.737061, 0.736691, 0.736321, 0.735952]
    # Ensure that all metrics are computed on both training and validation set
    assert len(
        model_training_result.train_results_per_epoch) == config.num_epochs
    assert len(
        model_training_result.val_results_per_epoch) == config.num_epochs
    assert len(model_training_result.train_results_per_epoch[0]) >= 11
    assert len(model_training_result.val_results_per_epoch[0]) >= 11
    for metric in [
            MetricType.ACCURACY_AT_THRESHOLD_05,
            MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
            MetricType.AREA_UNDER_PR_CURVE,
            MetricType.AREA_UNDER_ROC_CURVE,
            MetricType.CROSS_ENTROPY,
            MetricType.LOSS,
            # For unknown reasons, we don't get seconds_per_batch for the training data.
            # MetricType.SECONDS_PER_BATCH,
            MetricType.SECONDS_PER_EPOCH,
            MetricType.SUBJECT_COUNT,
    ]:
        assert metric.value in model_training_result.train_results_per_epoch[
            0], f"{metric.value} not in training"
        assert metric.value in model_training_result.val_results_per_epoch[
            0], f"{metric.value} not in validation"
    actual_train_loss = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LOSS.value)
    actual_val_loss = model_training_result.get_metric(
        is_training=False, metric_type=MetricType.LOSS.value)
    actual_lr = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LEARNING_RATE.value)
    assert actual_train_loss == pytest.approx(expected_train_loss,
                                              abs=1e-6), "Training loss"
    assert actual_val_loss == pytest.approx(expected_val_loss,
                                            abs=1e-6), "Validation loss"
    assert actual_lr == pytest.approx(expected_learning_rates,
                                      rel=1e-5), "Learning rates"
    test_results = model_testing.model_test(
        config,
        ModelExecutionMode.TRAIN,
        checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)
    expected_metrics = [0.636085, 0.735952]
    assert test_results.metrics.values()[MetricType.CROSS_ENTROPY.value] == \
           pytest.approx(expected_metrics, abs=1e-5)
    # Run the detailed log file check only on CPU; the logs contain slightly different metrics on GPU. Here we
    # mainly want to assert that the generated files look reasonable.
    if machine_has_gpu:
        return
    # Check epoch_metrics.csv
    epoch_metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / EPOCH_METRICS_FILE_NAME
    # Auto-format will break the long header line, hence the strange way of writing it!
    expected_epoch_metrics = \
        "loss,cross_entropy,accuracy_at_threshold_05,seconds_per_epoch,learning_rate," + \
        "area_under_roc_curve,area_under_pr_curve,accuracy_at_optimal_threshold," \
        "false_positive_rate_at_optimal_threshold,false_negative_rate_at_optimal_threshold," \
        "optimal_threshold,subject_count,epoch,cross_validation_split_index\n" + \
        """0.6866141557693481,0.6866141557693481,0.5,0,0.0001,1.0,1.0,0.5,0.0,0.0,0.529514,2.0,0,-1	
        0.6864652633666992,0.6864652633666992,0.5,0,9.999712322065557e-05,1.0,1.0,0.5,0.0,0.0,0.529475,2.0,1,-1	
        0.6863163113594055,0.6863162517547607,0.5,0,9.999306876841536e-05,1.0,1.0,0.5,0.0,0.0,0.529437,2.0,2,-1	
        0.6861673593521118,0.6861673593521118,0.5,0,9.998613801725043e-05,1.0,1.0,0.5,0.0,0.0,0.529399,2.0,3,-1	
        """
    # We cannot compare columns like "seconds_per_epoch" because timing will obviously vary between machines.
    # The column must still be present in the file, though.
    check_log_file(epoch_metrics_path,
                   expected_epoch_metrics,
                   ignore_columns=[LoggingColumns.SecondsPerEpoch.value])
    # Check metrics.csv: This contains the per-subject per-epoch model outputs
    # Randomization comes out slightly different on Windows, hence only execute the test on Linux
    if common_util.is_windows():
        return
    metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / SUBJECT_METRICS_FILE_NAME
    metrics_expected = \
        """prediction_target,epoch,subject,model_output,label,cross_validation_split_index,data_split
Default,0,S2,0.5295137763023376,1.0,-1,Train
Default,0,S4,0.5216594338417053,0.0,-1,Train
Default,1,S4,0.5214819312095642,0.0,-1,Train
Default,1,S2,0.5294750332832336,1.0,-1,Train
Default,2,S2,0.5294366478919983,1.0,-1,Train
Default,2,S4,0.5213046073913574,0.0,-1,Train
Default,3,S2,0.5293986201286316,1.0,-1,Train
Default,3,S4,0.5211275815963745,0.0,-1,Train
"""
    check_log_file(metrics_path, metrics_expected, ignore_columns=[])
    # Check the log file METRICS_FILE_NAME inside the folder epoch_004/Train, which is written when we run model_test.
    # Normally, we would run it on the Test and Val splits, but for convenience we test on the Train split here.
    inference_metrics_path = config.outputs_folder / get_epoch_results_path(ModelExecutionMode.TRAIN) / \
                             SUBJECT_METRICS_FILE_NAME
    inference_metrics_expected = \
        """prediction_target,subject,model_output,label,cross_validation_split_index,data_split
Default,S2,0.5293986201286316,1.0,-1,Train
Default,S4,0.5211275815963745,0.0,-1,Train
"""
    check_log_file(inference_metrics_path,
                   inference_metrics_expected,
                   ignore_columns=[])
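check_log_file compares the generated CSV against the expected text while skipping volatile columns such as seconds_per_epoch, which must still be present but whose values differ per machine. A minimal sketch of that kind of comparison, assuming pandas and a hypothetical helper name (the repository's actual implementation may differ):
import io
from pathlib import Path
from typing import List

import pandas as pd


def compare_csv_ignoring_columns(actual_file: Path, expected_csv: str,
                                 ignore_columns: List[str]) -> None:
    # Strip the indentation that multi-line expected strings carry in the tests above.
    cleaned = "\n".join(line.strip() for line in expected_csv.splitlines() if line.strip())
    actual = pd.read_csv(actual_file)
    expected = pd.read_csv(io.StringIO(cleaned))
    for column in ignore_columns:
        # The ignored column must exist, but its values are not compared.
        assert column in actual.columns
        actual = actual.drop(columns=column)
        expected = expected.drop(columns=column)
    pd.testing.assert_frame_equal(actual.reset_index(drop=True),
                                  expected.reset_index(drop=True))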
def classification_model_test(config: ScalarModelBase,
                              data_split: ModelExecutionMode,
                              checkpoint_handler: CheckpointHandler,
                              model_proc: ModelProcessing) -> InferenceMetricsForClassification:
    """
    The main testing loop for classification models. It loads the model and datasets, then proceeds to test the
    model for all requested checkpoints.
    :param config: The model configuration.
    :param data_split: The name of the folder to store the results inside each epoch folder in the outputs_dir,
                       used mainly in model evaluation using different dataset splits.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :param model_proc: whether we are testing an ensemble or single model
    :return: InferenceMetricsForClassification object that contains metrics for all of the checkpoint epochs.
    """

    def test_epoch(checkpoint_paths: List[Path]) -> Optional[MetricsDict]:
        pipeline = create_inference_pipeline(config=config,
                                             checkpoint_paths=checkpoint_paths)

        if pipeline is None:
            return None

        # for mypy
        assert isinstance(pipeline, ScalarInferencePipelineBase)

        ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing")
        ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(
            shuffle=False,
            batch_size=1,
            num_dataload_workers=0
        )

        logging.info(f"Starting to evaluate model on {data_split.value} set.")
        metrics_dict = create_metrics_dict_for_scalar_models(config)
        for sample in ds:
            result = pipeline.predict(sample)
            model_output = result.posteriors
            label = result.labels.to(device=model_output.device)
            sample_id = result.subject_ids[0]
            compute_scalar_metrics(metrics_dict,
                                   subject_ids=[sample_id],
                                   model_output=model_output,
                                   labels=label,
                                   loss_type=config.loss_type)
            logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")

        average = metrics_dict.average(across_hues=False)
        logging.info(average.to_string())

        return metrics_dict

    checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test()

    if not checkpoints_to_test:
        raise ValueError("There were no checkpoints available for model testing.")

    result = test_epoch(checkpoint_paths=checkpoints_to_test)
    if result is None:
        raise ValueError("There was no single checkpoint file available for model testing.")
    else:
        if isinstance(result, ScalarMetricsDict):
            results_folder = config.outputs_folder / get_epoch_results_path(data_split, model_proc)
            csv_file = results_folder / SUBJECT_METRICS_FILE_NAME

            logging.info(f"Writing {data_split.value} metrics to file {str(csv_file)}")

            # If we are running inference after a training run, the validation set metrics may have been written
            # during train time. If this is not the case, or we are running on the test set, create the metrics
            # file.
            if not csv_file.exists():
                os.makedirs(str(results_folder), exist_ok=False)
                df_logger = DataframeLogger(csv_file)

                # cross validation split index not relevant during test time
                result.store_metrics_per_subject(df_logger=df_logger,
                                                 mode=data_split)
                # write to disk
                df_logger.flush()

    return InferenceMetricsForClassification(metrics=result)
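classification_model_test delegates the per-subject CSV writing to DataframeLogger and only creates the file when it does not already exist. The real logger lives in the InnerEye codebase; the following is only a sketch of the accumulate-then-flush pattern it is used for here (class and method names are illustrative):
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd


class MinimalDataframeLogger:
    """Sketch of a logger that accumulates per-subject records and writes them once on flush."""

    def __init__(self, csv_file: Path) -> None:
        self.csv_file = csv_file
        self.records: List[Dict[str, Any]] = []

    def add_record(self, record: Dict[str, Any]) -> None:
        self.records.append(record)

    def flush(self) -> None:
        # Write all accumulated rows to disk in a single operation.
        self.csv_file.parent.mkdir(parents=True, exist_ok=True)
        pd.DataFrame.from_records(self.records).to_csv(self.csv_file, index=False)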
def get_epoch_path(mode: ModelExecutionMode) -> Path:
    p = get_epoch_results_path(best_epoch, mode=mode, model_proc=model_proc)
    return config.outputs_folder / p / METRICS_FILE_NAME
def classification_model_test(
        config: ScalarModelBase, data_split: ModelExecutionMode,
        run_recovery: Optional[RunRecovery],
        model_proc: ModelProcessing) -> InferenceMetricsForClassification:
    """
    The main testing loop for classification models. It runs a loop over all epochs for which testing should be done.
    It loads the model and datasets, then proceeds to test the model for all requested checkpoints.
    :param config: The model configuration.
    :param data_split: The name of the folder to store the results inside each epoch folder in the outputs_dir,
                       used mainly in model evaluation using different dataset splits.
    :param run_recovery: RunRecovery data if applicable
    :param model_proc: whether we are testing an ensemble or single model
    :return: InferenceMetricsForClassification object that contains metrics for all of the checkpoint epochs.
    """
    def test_epoch(
            test_epoch: int,
            run_recovery: Optional[RunRecovery]) -> Optional[MetricsDict]:
        pipeline = create_inference_pipeline(config, test_epoch, run_recovery)

        if pipeline is None:
            return None

        # for mypy
        assert isinstance(pipeline, ScalarInferencePipelineBase)

        ml_util.set_random_seed(config.get_effective_random_seed(),
                                "Model Testing")
        ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(
            shuffle=False, batch_size=1, num_dataload_workers=0)

        logging.info(
            f"Starting to evaluate model from epoch {test_epoch} on {data_split.value} set."
        )
        metrics_dict = create_metrics_dict_from_config(config)
        for sample in ds:
            result = pipeline.predict(sample)
            # Since batch size is 1, we only have 1 item in each of the fields in result
            sample_id, label_gpu, model_output = result.subject_ids[
                0], result.labels, result.model_outputs

            compute_scalar_metrics(metrics_dict, [sample_id], model_output,
                                   label_gpu, config.loss_type)
            logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")

        average = metrics_dict.average(across_hues=False)
        logging.info(average.to_string())

        return metrics_dict

    results: Dict[int, MetricsDict] = {}
    for epoch in config.get_test_epochs():
        epoch_result = test_epoch(test_epoch=epoch, run_recovery=run_recovery)
        if epoch_result is None:
            logging.warning(
                "There is no checkpoint file for epoch {}".format(epoch))
        else:
            results[epoch] = epoch_result

            if isinstance(epoch_result, ScalarMetricsDict):
                epoch_folder = config.outputs_folder / get_epoch_results_path(
                    epoch, data_split, model_proc)
                csv_file = epoch_folder / METRICS_FILE_NAME

                logging.info(
                    f"Writing {data_split.value} metrics to file {str(csv_file)}"
                )

                # If we are running inference after a training run, the validation set metrics may have been written
                # during train time. If this is not the case, or we are running on the test set, create the metrics
                # file.
                if not csv_file.exists():
                    os.makedirs(str(epoch_folder), exist_ok=False)
                    df_logger = DataframeLogger(csv_file)

                    # cross validation split index not relevant during test time
                    epoch_result.store_metrics_per_subject(epoch=epoch,
                                                           df_logger=df_logger,
                                                           mode=data_split)
                    # write to disk
                    df_logger.flush()

    if len(results) == 0:
        raise ValueError(
            "There was no single checkpoint file available for model testing.")

    return InferenceMetricsForClassification(epochs=results)
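This older variant writes metrics into one folder per tested epoch via get_epoch_results_path(epoch, data_split, model_proc). Based on the epoch_004/Train folder mentioned in the comments above, the layout presumably looks roughly like the following sketch (the zero padding and the omission of ensemble handling are assumptions):
from pathlib import Path


def sketch_epoch_results_path(epoch: int, mode_value: str) -> Path:
    # Assumed layout, e.g. epoch_004/Train; the real helper also accounts for
    # ensemble models via the model_proc argument, which is omitted here.
    return Path(f"epoch_{epoch:03d}") / mode_value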