Example #1
def segmentation_model_test(
    config: SegmentationModelBase,
    execution_mode: ModelExecutionMode,
    checkpoint_paths: List[Path],
    model_proc: ModelProcessing = ModelProcessing.DEFAULT
) -> InferenceMetricsForSegmentation:
    """
    The main testing loop for segmentation models.
    It loads the model and datasets, then proceeds to test the model for all requested checkpoints.
    :param config: The model configuration, which must have a valid random seed attribute.
    :param execution_mode: Indicates which of the 3 sets (training, test, or validation) is being processed.
    :param checkpoint_paths: Checkpoint paths used to initialize the model.
    :param model_proc: Whether we are testing an ensemble or a single model.
    :return: InferenceMetricsForSegmentation object that contains metrics for all of the checkpoint epochs.
    """

    epoch_results_folder = config.outputs_folder / get_best_epoch_results_path(
        execution_mode, model_proc)
    # save the datasets.csv used
    config.write_dataset_files(root=epoch_results_folder)
    epoch_and_split = f"{execution_mode.value} set"
    epoch_dice_per_image = segmentation_model_test_epoch(
        config=copy.deepcopy(config),
        execution_mode=execution_mode,
        checkpoint_paths=checkpoint_paths,
        results_folder=epoch_results_folder,
        epoch_and_split=epoch_and_split)
    if epoch_dice_per_image is None:
        raise ValueError(
            "No checkpoint files were available for model testing.")
    else:
        epoch_average_dice: float = np.mean(
            epoch_dice_per_image) if len(epoch_dice_per_image) > 0 else 0
        result = epoch_average_dice
        logging.info(f"Mean Dice: {epoch_average_dice:4f}")
        if model_proc == ModelProcessing.ENSEMBLE_CREATION:
            # For the upload, we want the path without the "OTHER_RUNS/ENSEMBLE" prefix.
            name = str(
                get_best_epoch_results_path(execution_mode,
                                            ModelProcessing.DEFAULT))
            PARENT_RUN_CONTEXT.upload_folder(name=name,
                                             path=str(epoch_results_folder))
    return InferenceMetricsForSegmentation(execution_mode=execution_mode,
                                           metrics=result)
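# Illustration only (not InnerEye code): a minimal sketch of the Dice aggregation used
# above. Per-image Dice scores are averaged with np.mean, an empty list falls back to
# 0.0, and the result is logged with four decimal places.
import numpy as np

def mean_dice(per_image_dice: list) -> float:
    """Average per-image Dice scores, returning 0.0 for an empty list."""
    return float(np.mean(per_image_dice)) if per_image_dice else 0.0

assert mean_dice([]) == 0.0
print(f"Mean Dice: {mean_dice([0.8, 0.6, 0.7]):.4f}")  # prints "Mean Dice: 0.7000"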
def test_get_comparison_data(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that the comparison baseline dataset.csv and metrics.csv paths can be retrieved for the most recent AzureML run.
    """
    run = get_most_recent_run()
    blob_path = get_best_epoch_results_path(ModelExecutionMode.TEST)
    (comparison_dataset_path, comparison_metrics_path) = get_comparison_baseline_paths(test_output_dirs.root_dir,
                                                                                       blob_path, run,
                                                                                       DATASET_CSV_FILE_NAME)
    assert comparison_dataset_path is not None
    assert comparison_metrics_path is not None
def compare_scores_against_baselines(model_config: SegmentationModelBase,
                                     azure_config: AzureConfig,
                                     model_proc: ModelProcessing) -> None:
    """
    If the model config has any baselines to compare against, loads the metrics.csv file that should just have
    been written for the last epoch of the current run, along with its dataset.csv. Does the same for all the
    baselines, whose corresponding files should already be in the repository. For each baseline, runs the
    Wilcoxon signed-rank test on pairs of Dice scores from the current model and the baseline, and writes the
    comparisons to the Wilcoxon results file.
    """
    # The attribute will only be present for a segmentation model; and it might be None or empty even for that.
    comparison_blob_storage_paths = model_config.comparison_blob_storage_paths
    if not comparison_blob_storage_paths:
        return
    outputs_path = model_config.outputs_folder / get_best_epoch_results_path(
        ModelExecutionMode.TEST, model_proc)
    if not outputs_path.is_dir():
        if not model_config.is_inference_required(model_proc,
                                                  ModelExecutionMode.TEST):
            logging.info(INFERENCE_DISABLED_WARNING)
            return
        raise FileNotFoundError(
            f"Cannot compare scores against baselines: no best epoch results found at {outputs_path}"
        )
    model_metrics_path = outputs_path / SUBJECT_METRICS_FILE_NAME
    model_dataset_path = outputs_path / DATASET_CSV_FILE_NAME
    if not model_dataset_path.exists():
        raise FileNotFoundError(
            f"Not comparing with baselines because no {model_dataset_path} file found for this run"
        )
    if not model_metrics_path.exists():
        raise FileNotFoundError(
            f"Not comparing with baselines because no {model_metrics_path} file found for this run"
        )
    model_metrics_df = pd.read_csv(model_metrics_path)
    model_dataset_df = pd.read_csv(model_dataset_path)
    comparison_result = download_and_compare_scores(
        outputs_path, azure_config, comparison_blob_storage_paths,
        model_dataset_df, model_metrics_df)
    full_metrics_path = str(outputs_path / FULL_METRICS_DATAFRAME_FILE)
    comparison_result.dataframe.to_csv(full_metrics_path)
    if comparison_result.did_comparisons:
        wilcoxon_path = outputs_path / BASELINE_WILCOXON_RESULTS_FILE
        logging.info(
            f"Wilcoxon tests of current {model_proc.value} model against baseline(s), "
            f"written to {wilcoxon_path}:")
        for line in comparison_result.wilcoxon_lines:
            logging.info(line)
        logging.info("End of Wilcoxon test results")
        may_write_lines_to_file(comparison_result.wilcoxon_lines,
                                wilcoxon_path)
    write_to_scatterplot_directory(outputs_path, comparison_result.plots)
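# Aside (illustration, not part of compare_scores_against_baselines): the Wilcoxon
# signed-rank test mentioned in the docstring is computed on paired Dice scores, one
# pair per subject/structure, between the current model and a baseline. A minimal
# sketch with made-up Dice values, using scipy:
from scipy.stats import wilcoxon

current_dice = [0.85, 0.78, 0.91, 0.80, 0.88, 0.76]
baseline_dice = [0.83, 0.80, 0.89, 0.79, 0.84, 0.75]
result = wilcoxon(current_dice, baseline_dice)
print(f"Wilcoxon statistic={result.statistic}, p-value={result.pvalue:.3f}")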
def test_generate_custom_report(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the Covid model report is generated correctly
    (especially when there are NaN values in the hierarchical task).
    """

    model = CovidModel()
    model.set_output_to(test_output_dirs.root_dir)
    report_dir = test_output_dirs.root_dir / "reports"
    report_dir.mkdir()

    train_csv_path = model.outputs_folder / get_best_epoch_results_path(mode=ModelExecutionMode.TRAIN,
                                                                             model_proc=ModelProcessing.DEFAULT) \
                     / MODEL_OUTPUT_CSV
    train_csv_path.parent.mkdir(parents=True)
    train_csv_path.write_text(
        f"""{LoggingColumns.Patient.value},{LoggingColumns.Hue.value},{LoggingColumns.Label.value},{LoggingColumns.ModelOutput.value},{LoggingColumns.CrossValidationSplitIndex.value}
1,CVX0,1,0.7,-1
1,CVX1,0,0.1,-1
1,CVX2,0,0.1,-1
1,CVX3,0,0.1,-1
2,CVX0,0,0.1,-1
2,CVX1,1,0.7,-1
2,CVX2,0,0.1,-1
2,CVX3,0,0.1,-1
3,CVX0,0,0.7,-1
3,CVX1,0,0.1,-1
3,CVX2,1,0.1,-1
3,CVX3,0,0.1,-1
4,CVX0,0,0.0,-1
4,CVX1,0,1.0,-1
4,CVX2,0,0.0,-1
4,CVX3,1,0.0,-1
5,CVX0,0,0.0,-1
5,CVX1,0,0.0,-1
5,CVX2,1,1.0,-1
5,CVX3,0,0.0,-1
6,CVX0,0,0.0,-1
6,CVX1,1,1.0,-1
6,CVX2,0,0.0,-1
6,CVX3,0,0.0,-1
""")

    report_path = model.generate_custom_report(
        report_dir=report_dir, model_proc=ModelProcessing.DEFAULT)
    report_text = report_path.read_text()

    assert report_text == f"""{ModelExecutionMode.TRAIN.value}
Example #5
def run_model_inference_train_and_test(
        test_output_dirs: OutputFolderForTests,
        perform_cross_validation: bool,
        inference_on_train_set: Optional[bool] = None,
        inference_on_val_set: Optional[bool] = None,
        inference_on_test_set: Optional[bool] = None,
        ensemble_inference_on_train_set: Optional[bool] = None,
        ensemble_inference_on_val_set: Optional[bool] = None,
        ensemble_inference_on_test_set: Optional[bool] = None,
        model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> None:
    """
    Test running inference produces expected output metrics, files, folders and calls to upload_folder.

    :param test_output_dirs: Test output directories.
    :param perform_cross_validation: Whether to test with cross validation.
    :param inference_on_train_set: Override for inference on train data sets.
    :param inference_on_val_set: Override for inference on validation data sets.
    :param inference_on_test_set: Override for inference on test data sets.
    :param ensemble_inference_on_train_set: Override for ensemble inference on train data sets.
    :param ensemble_inference_on_val_set: Override for ensemble inference on validation data sets.
    :param ensemble_inference_on_test_set: Override for ensemble inference on test data sets.
    :param model_proc: Model processing to test.
    :return: None.
    """
    dummy_model = DummyModel()

    config = PassThroughModel()
    # Copy settings from DummyModel
    config.image_channels = dummy_model.image_channels
    config.ground_truth_ids = dummy_model.ground_truth_ids
    config.ground_truth_ids_display_names = dummy_model.ground_truth_ids_display_names
    config.colours = dummy_model.colours
    config.fill_holes = dummy_model.fill_holes
    config.roi_interpreted_types = dummy_model.roi_interpreted_types

    config.test_crop_size = (16, 16, 16)
    config.number_of_cross_validation_splits = 2 if perform_cross_validation else 0
    config.inference_on_train_set = inference_on_train_set
    config.inference_on_val_set = inference_on_val_set
    config.inference_on_test_set = inference_on_test_set
    config.ensemble_inference_on_train_set = ensemble_inference_on_train_set
    config.ensemble_inference_on_val_set = ensemble_inference_on_val_set
    config.ensemble_inference_on_test_set = ensemble_inference_on_test_set
    # Plotting crashes with random TCL errors on Windows, disable that for Windows PR builds.
    config.is_plotting_enabled = common_util.is_linux()

    config.set_output_to(test_output_dirs.root_dir)
    train_and_test_data_small_dir = test_output_dirs.root_dir / "train_and_test_data_small"
    config.local_dataset = create_train_and_test_data_small_dataset(
        config.test_crop_size, full_ml_test_data_path(), "train_and_test_data",
        train_and_test_data_small_dir, "data")

    checkpoint_path = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    create_model_and_store_checkpoint(config, checkpoint_path)
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)
    checkpoint_handler.additional_training_done()

    mock_upload_path = test_output_dirs.root_dir / "mock_upload"
    mock_upload_path.mkdir()

    run = create_mock_run(mock_upload_path, config)

    azure_config = Mock(name='mock_azure_config')
    azure_config.fetch_run.return_value = run

    runner = MLRunner(model_config=config, azure_config=azure_config)

    with mock.patch("InnerEye.ML.model_testing.PARENT_RUN_CONTEXT", run):
        metrics = runner.model_inference_train_and_test(
            checkpoint_paths=checkpoint_handler.get_checkpoints_to_test(),
            model_proc=model_proc)

    if model_proc == ModelProcessing.ENSEMBLE_CREATION:
        # Create a fake ensemble dataset.csv
        dataset_df = create_dataset_df()
        dataset_df.to_csv(config.outputs_folder / DATASET_CSV_FILE_NAME)

        with mock.patch.object(PlotCrossValidationConfig,
                               'azure_config',
                               return_value=azure_config):
            with mock.patch("InnerEye.Azure.azure_util.PARENT_RUN_CONTEXT",
                            run):
                with mock.patch("InnerEye.ML.run_ml.PARENT_RUN_CONTEXT", run):
                    runner.plot_cross_validation_and_upload_results()
                    runner.generate_report(ModelProcessing.ENSEMBLE_CREATION)

    if model_proc == ModelProcessing.DEFAULT:
        named_metrics = {
            ModelExecutionMode.TRAIN: inference_on_train_set,
            ModelExecutionMode.TEST: inference_on_test_set,
            ModelExecutionMode.VAL: inference_on_val_set
        }
    else:
        named_metrics = {
            ModelExecutionMode.TRAIN: ensemble_inference_on_train_set,
            ModelExecutionMode.TEST: ensemble_inference_on_test_set,
            ModelExecutionMode.VAL: ensemble_inference_on_val_set
        }

    error = ''
    expected_upload_folder_count = 0
    for mode, flag in named_metrics.items():
        if mode in metrics:
            metric = metrics[mode]
            assert isinstance(metric, InferenceMetricsForSegmentation)

        if flag is None:
            # No override supplied, calculate the expected default:
            if model_proc == ModelProcessing.DEFAULT:
                if not perform_cross_validation:
                    # If a "normal" run then default to val or test.
                    flag = mode == ModelExecutionMode.TEST
                else:
                    # If an ensemble child then default to never.
                    flag = False
            else:
                # If an ensemble then default to test only.
                flag = mode == ModelExecutionMode.TEST

        if mode in metrics and not flag:
            error = error + f"Error: {mode.value} metrics were produced although inference was not expected. "
        elif mode not in metrics and flag:
            error = error + f"Error: {mode.value} metrics are missing although inference was expected. "
        results_folder = config.outputs_folder / get_best_epoch_results_path(
            mode, model_proc)
        folder_exists = results_folder.is_dir()
        assert folder_exists == flag
        if flag and model_proc == ModelProcessing.ENSEMBLE_CREATION:
            expected_upload_folder_count = expected_upload_folder_count + 1
            expected_name = get_best_epoch_results_path(
                mode, ModelProcessing.DEFAULT)
            run.upload_folder.assert_any_call(name=str(expected_name),
                                              path=str(results_folder))
    if len(error):
        raise ValueError(error)

    if model_proc == ModelProcessing.ENSEMBLE_CREATION:
        # The report should have been mock uploaded
        expected_upload_folder_count = expected_upload_folder_count + 1

    assert run.upload_folder.call_count == expected_upload_folder_count
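# A compact restatement (for clarity only; assumes the InnerEye enums imported in the
# test above) of the default inference-flag logic exercised when no override is given:
# single models default to inference on the test set only, cross-validation child runs
# default to no inference, and ensembles default to the test set only.
def default_inference_flag(model_proc: ModelProcessing,
                           mode: ModelExecutionMode,
                           perform_cross_validation: bool) -> bool:
    if model_proc == ModelProcessing.DEFAULT and perform_cross_validation:
        return False
    return mode == ModelExecutionMode.TEST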
Example #6
def test_model_test(test_output_dirs: OutputFolderForTests) -> None:
    train_and_test_data_dir = full_ml_test_data_path("train_and_test_data")
    seed_everything(42)
    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    placeholder_dataset_id = "place_holder_dataset_id"
    config.azure_dataset_id = placeholder_dataset_id
    transform = config.get_full_image_sample_transforms().test
    df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME))
    df = df[df.subject.isin([1, 2])]
    # noinspection PyTypeHints
    config._datasets_for_inference = \
        {ModelExecutionMode.TEST: FullImageDataset(config, df, full_image_sample_transforms=transform)}  # type: ignore
    execution_mode = ModelExecutionMode.TEST
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)
    # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder.
    create_model_and_store_checkpoint(
        config,
        config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX)
    checkpoint_handler.additional_training_done()
    inference_results = model_testing.segmentation_model_test(
        config,
        execution_mode=execution_mode,
        checkpoint_paths=checkpoint_handler.get_checkpoints_to_test())
    epoch_dir = config.outputs_folder / get_best_epoch_results_path(
        execution_mode)
    assert inference_results.metrics == pytest.approx(0.66606902, abs=1e-6)

    assert config.outputs_folder.is_dir()
    assert epoch_dir.is_dir()
    patient1 = io_util.load_nifti_image(train_and_test_data_dir /
                                        "id1_channel1.nii.gz")
    patient2 = io_util.load_nifti_image(train_and_test_data_dir /
                                        "id2_channel1.nii.gz")

    assert_file_contains_string(epoch_dir / DATASET_ID_FILE,
                                placeholder_dataset_id)
    assert_file_contains_string(epoch_dir / GROUND_TRUTH_IDS_FILE, "region")
    assert_text_files_match(
        epoch_dir / model_testing.SUBJECT_METRICS_FILE_NAME,
        train_and_test_data_dir / model_testing.SUBJECT_METRICS_FILE_NAME)
    assert_text_files_match(
        epoch_dir / model_testing.METRICS_AGGREGATES_FILE,
        train_and_test_data_dir / model_testing.METRICS_AGGREGATES_FILE)
    # Plotting results vary between platforms. Can only check if the file is generated, but not its contents.
    assert (epoch_dir / model_testing.BOXPLOT_FILE).exists()

    assert_nifti_content(epoch_dir / "001" / "posterior_region.nii.gz",
                         get_image_shape(patient1), patient1.header, [137],
                         np.ubyte)
    assert_nifti_content(epoch_dir / "002" / "posterior_region.nii.gz",
                         get_image_shape(patient2), patient2.header, [137],
                         np.ubyte)
    assert_nifti_content(epoch_dir / "001" / DEFAULT_RESULT_IMAGE_NAME,
                         get_image_shape(patient1), patient1.header, [1],
                         np.ubyte)
    assert_nifti_content(epoch_dir / "002" / DEFAULT_RESULT_IMAGE_NAME,
                         get_image_shape(patient2), patient2.header, [1],
                         np.ubyte)
    assert_nifti_content(epoch_dir / "001" / "posterior_background.nii.gz",
                         get_image_shape(patient1), patient1.header, [117],
                         np.ubyte)
    assert_nifti_content(epoch_dir / "002" / "posterior_background.nii.gz",
                         get_image_shape(patient2), patient2.header, [117],
                         np.ubyte)
    thumbnails_folder = epoch_dir / model_testing.THUMBNAILS_FOLDER
    assert thumbnails_folder.is_dir()
    png_files = list(thumbnails_folder.glob("*.png"))
    overlays = [f for f in png_files if "_region_slice_" in str(f)]
    assert len(overlays) == len(df.subject.unique()), \
        "There should be one overlay/contour file per subject"

    # Writing dataset.csv normally happens at the beginning of training,
    # but this test reads off a saved checkpoint file.
    # Dataset.csv must be present for plot_cross_validation.
    config.write_dataset_files()
    # Test if the metrics files can be picked up correctly by the cross validation code
    config_and_files = get_config_and_results_for_offline_runs(config)
    result_files = config_and_files.files
    assert len(result_files) == 1
    for file in result_files:
        assert file.execution_mode == execution_mode
        assert file.dataset_csv_file is not None
        assert file.dataset_csv_file.exists()
        assert file.metrics_file is not None
        assert file.metrics_file.exists()
Example #7
def classification_model_test(
        config: ScalarModelBase, data_split: ModelExecutionMode,
        checkpoint_paths: List[Path], model_proc: ModelProcessing,
        cross_val_split_index: int) -> InferenceMetricsForClassification:
    """
    The main testing loop for classification models. It runs a loop over all epochs for which testing should be done.
    It loads the model and datasets, then proceeds to test the model for all requested checkpoints.
    :param config: The model configuration.
    :param data_split: The name of the folder to store the results inside each epoch folder in the outputs_dir,
                       used mainly when evaluating the model on different dataset splits.
    :param checkpoint_paths: Checkpoint paths used to initialize the model.
    :param model_proc: Whether we are testing an ensemble or a single model.
    :param cross_val_split_index: The cross validation split index of the model under test, recorded in the output files.
    :return: InferenceMetricsForClassification object that contains metrics for all of the checkpoint epochs.
    """
    pipeline = create_inference_pipeline(config=config,
                                         checkpoint_paths=checkpoint_paths)
    if pipeline is None:
        raise ValueError("Inference pipeline could not be created.")
    # for mypy
    assert isinstance(pipeline, ScalarInferencePipelineBase)
    ml_util.set_random_seed(config.get_effective_random_seed(),
                            "Model Testing")
    ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(
        shuffle=False, batch_size=1, num_dataload_workers=0)
    logging.info(f"Starting to evaluate model on {data_split.value} set.")
    results_folder = config.outputs_folder / get_best_epoch_results_path(
        data_split, model_proc)
    os.makedirs(str(results_folder), exist_ok=True)
    metrics_dict = create_metrics_dict_for_scalar_models(config)
    output_logger: Optional[DataframeLogger] = DataframeLogger(
        csv_path=results_folder / MODEL_OUTPUT_CSV)

    for sample in ds:
        result = pipeline.predict(sample)
        model_output = result.posteriors
        label = result.labels.to(device=model_output.device)
        sample_id = result.subject_ids[0]
        if output_logger:
            for i in range(len(config.target_names)):
                output_logger.add_record({
                    LoggingColumns.Patient.value: sample_id,
                    LoggingColumns.Hue.value: config.target_names[i],
                    LoggingColumns.Label.value: label[0][i].item(),
                    LoggingColumns.ModelOutput.value: model_output[0][i].item(),
                    LoggingColumns.CrossValidationSplitIndex.value: cross_val_split_index
                })

        compute_scalar_metrics(metrics_dict,
                               subject_ids=[sample_id],
                               model_output=model_output,
                               labels=label,
                               loss_type=config.loss_type)
        logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")
    average = metrics_dict.average(across_hues=False)
    logging.info(average.to_string())
    if isinstance(metrics_dict, ScalarMetricsDict):
        csv_file = results_folder / SUBJECT_METRICS_FILE_NAME
        logging.info(
            f"Writing {data_split.value} metrics to file {str(csv_file)}")
        # If we are running inference after a training run, the validation set metrics may have been written
        # during train time. If this is not the case, or we are running on the test set, create the metrics
        # file.
        if not csv_file.exists():
            df_logger = DataframeLogger(csv_file)
            # For an ensemble, record the default cross-validation split index; otherwise record which fold produced this prediction.
            cv_index = DEFAULT_CROSS_VALIDATION_SPLIT_INDEX if model_proc == ModelProcessing.ENSEMBLE_CREATION \
                else cross_val_split_index
            metrics_dict.store_metrics_per_subject(
                df_logger=df_logger,
                mode=data_split,
                cross_validation_split_index=cv_index,
                epoch=BEST_EPOCH_FOLDER_NAME)
            # write to disk
            df_logger.flush()

    if output_logger:
        output_logger.flush()

    return InferenceMetricsForClassification(metrics=metrics_dict)
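# For orientation only: the records added via output_logger.add_record above end up in
# MODEL_OUTPUT_CSV with one row per (subject, prediction target). A hedged sketch of the
# equivalent layout using plain pandas and made-up values (column names match the
# LoggingColumns strings used elsewhere in these examples):
import pandas as pd

example_rows = [
    {"subject": "1", "prediction_target": "CVX0", "label": 1.0, "model_output": 0.7, "cross_validation_split_index": -1},
    {"subject": "1", "prediction_target": "CVX1", "label": 0.0, "model_output": 0.1, "cross_validation_split_index": -1},
]
print(pd.DataFrame(example_rows).to_csv(index=False))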
Example #8
 def get_output_csv_path(mode: ModelExecutionMode) -> Path:
     p = get_best_epoch_results_path(mode=mode, model_proc=model_proc)
     return self.outputs_folder / p / MODEL_OUTPUT_CSV
Example #9
def _check_offline_cross_validation_output_files(
        train_config: ScalarModelBase) -> None:
    metrics: Dict[ModelExecutionMode, List[pd.DataFrame]] = dict()
    root = Path(train_config.file_system_config.outputs_folder)
    for x in range(train_config.get_total_number_of_cross_validation_runs()):
        expected_outputs_folder = root / str(x)
        assert expected_outputs_folder.exists()
        for m in [
                ModelExecutionMode.TRAIN, ModelExecutionMode.VAL,
                ModelExecutionMode.TEST
        ]:
            if m == ModelExecutionMode.TEST:
                metrics_path = expected_outputs_folder / get_best_epoch_results_path(
                    m) / SUBJECT_METRICS_FILE_NAME
            else:
                metrics_path = expected_outputs_folder / m.value / SUBJECT_METRICS_FILE_NAME
            assert metrics_path.exists()
            split_metrics = pd.read_csv(metrics_path)
            if m in metrics:
                # Check that the metrics of this fold differ from those of every previously seen fold.
                assert not any(split_metrics.equals(x) for x in metrics[m])
                metrics[m].append(split_metrics)
            else:
                metrics[m] = [split_metrics]
    if train_config.perform_cross_validation:
        # test aggregates are as expected
        aggregate_metrics_path = root / CROSSVAL_RESULTS_FOLDER / METRICS_AGGREGATES_FILE
        assert aggregate_metrics_path.is_file()
        # since we aggregate the outputs of each of the child folds
        # we need to compare the outputs w.r.t to the parent folds
        _dataset_splits = train_config.get_dataset_splits()
        _val_dataset_split_count = len(
            _dataset_splits.val[train_config.subject_column].unique()) + len(
                _dataset_splits.train[train_config.subject_column].unique())
        _test_dataset_split_count = len(
            _dataset_splits.test[train_config.subject_column].unique())
        _aggregates_csv = pd.read_csv(aggregate_metrics_path)
        _aggregates_csv_test = _aggregates_csv.loc[_aggregates_csv[
            LoggingColumns.DataSplit.value] == ModelExecutionMode.TEST.value]
        _aggregates_csv_train_val = _aggregates_csv.loc[_aggregates_csv[
            LoggingColumns.DataSplit.value] != ModelExecutionMode.TEST.value]
        _counts_for_splits_train_val = list(
            _aggregates_csv_train_val[LoggingColumns.SubjectCount.value])
        _counts_for_splits_test = list(
            _aggregates_csv_test[LoggingColumns.SubjectCount.value])
        assert all([
            x == _val_dataset_split_count for x in _counts_for_splits_train_val
        ])
        assert all([
            x == _test_dataset_split_count *
            train_config.number_of_cross_validation_splits
            for x in _counts_for_splits_test
        ])
        _epochs = list(
            _aggregates_csv_train_val[LoggingColumns.Epoch.value].astype(int))
        # Each epoch is recorded twice: once for the training split and once for the validation split.
        assert len(_epochs) == train_config.num_epochs * 2
        assert _epochs == list(range(train_config.num_epochs)) * 2
        # Only the validation mode is kept for unrolled aggregates
        unrolled = unroll_aggregate_metrics(_aggregates_csv)
        if train_config.is_classification_model:
            expected_metrics = {
                LoggingColumns.CrossEntropy.value,
                LoggingColumns.AreaUnderPRCurve.value,
                LoggingColumns.AreaUnderRocCurve.value,
                LoggingColumns.FalseNegativeRateAtOptimalThreshold.value,
                LoggingColumns.FalsePositiveRateAtOptimalThreshold.value,
                LoggingColumns.AccuracyAtOptimalThreshold.value,
                LoggingColumns.OptimalThreshold.value,
                LoggingColumns.AccuracyAtThreshold05.value
            }
        else:
            expected_metrics = {
                LoggingColumns.MeanAbsoluteError.value,
                LoggingColumns.MeanSquaredError.value,
                LoggingColumns.ExplainedVariance.value
            }
        expected_metrics = expected_metrics.union(
            {LoggingColumns.SubjectCount.value})
        assert len(unrolled) == train_config.num_epochs * len(expected_metrics)
        actual_metrics = set(m.metric_name for m in unrolled)
        assert actual_metrics == expected_metrics
        actual_epochs = set(int(m.epoch) for m in unrolled)
        assert actual_epochs == set(_epochs)
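# Worked example (made-up numbers) of the subject-count checks above: with a parent
# split of 8 train + 4 val + 3 test subjects and 2 cross-validation folds, the
# aggregated train/val rows each cover the union of the child validation sets
# (8 + 4 = 12 subjects), while the test set is scored once per fold (3 * 2 = 6).
n_train, n_val, n_test, n_splits = 8, 4, 3, 2
expected_train_val_count = n_train + n_val   # 12, compared against SubjectCount of train/val rows
expected_test_count = n_test * n_splits      # 6, compared against SubjectCount of test rows
assert (expected_train_val_count, expected_test_count) == (12, 6)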
Example #10
def test_train_classification_model(
        class_name: str, test_output_dirs: OutputFolderForTests) -> None:
    """
    Test training and testing of classification models, asserting on the individual results from training and
    testing.
    Expected test results are stored for GPU with and without mixed precision.
    """
    logging_to_stdout(logging.DEBUG)
    config = ClassificationModelForTesting()
    config.class_names = config.target_names = [class_name]
    config.set_output_to(test_output_dirs.root_dir)
    # Train for 4 epochs, checkpoints at epochs 2 and 4
    config.num_epochs = 4
    model_training_result, checkpoint_handler = model_train_unittest(
        config, dirs=test_output_dirs)
    assert model_training_result is not None
    expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
    expected_train_loss = [0.686614, 0.686465, 0.686316, 0.686167]
    expected_val_loss = [0.737061, 0.736691, 0.736321, 0.735952]
    # Ensure that all metrics are computed on both training and validation set
    train_results_per_epoch = model_training_result.train_results_per_epoch()
    val_results_per_epoch = model_training_result.val_results_per_epoch()
    assert len(train_results_per_epoch) == config.num_epochs
    assert len(val_results_per_epoch) == config.num_epochs
    assert len(train_results_per_epoch[0]) >= 11
    assert len(val_results_per_epoch[0]) >= 11

    for metric in [
            MetricType.ACCURACY_AT_THRESHOLD_05,
            MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
            MetricType.AREA_UNDER_PR_CURVE, MetricType.AREA_UNDER_ROC_CURVE,
            MetricType.CROSS_ENTROPY, MetricType.LOSS,
            MetricType.SECONDS_PER_BATCH, MetricType.SECONDS_PER_EPOCH,
            MetricType.SUBJECT_COUNT
    ]:
        assert metric.value in train_results_per_epoch[
            0], f"{metric.value} not in training"
        assert metric.value in val_results_per_epoch[
            0], f"{metric.value} not in validation"

    actual_train_loss = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LOSS.value)
    actual_val_loss = model_training_result.get_metric(
        is_training=False, metric_type=MetricType.LOSS.value)
    actual_lr = model_training_result.get_metric(
        is_training=True, metric_type=MetricType.LEARNING_RATE.value)
    assert actual_train_loss == pytest.approx(expected_train_loss,
                                              abs=1e-6), "Training loss"
    assert actual_val_loss == pytest.approx(expected_val_loss,
                                            abs=1e-6), "Validation loss"
    assert actual_lr == pytest.approx(expected_learning_rates,
                                      rel=1e-5), "Learning rates"
    test_results = model_testing.model_test(
        config,
        ModelExecutionMode.TRAIN,
        checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)
    expected_metrics = [0.636085, 0.735952]
    assert test_results.metrics.values(class_name)[MetricType.CROSS_ENTROPY.value] == \
           pytest.approx(expected_metrics, abs=1e-5)
    # Run the detailed log file checks only on CPU: the logs contain slightly different metrics on GPU, and here
    # we mainly want to assert that the files look reasonable.
    if machine_has_gpu:
        return

    # Check epoch_metrics.csv
    epoch_metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / EPOCH_METRICS_FILE_NAME
    # Auto-format will break the long header line, hence the strange way of writing it!
    expected_epoch_metrics = \
        f"{LoggingColumns.Loss.value},{LoggingColumns.CrossEntropy.value}," \
        f"{LoggingColumns.AccuracyAtThreshold05.value},{LoggingColumns.LearningRate.value}," + \
        f"{LoggingColumns.AreaUnderRocCurve.value}," \
        f"{LoggingColumns.AreaUnderPRCurve.value}," \
        f"{LoggingColumns.AccuracyAtOptimalThreshold.value}," \
        f"{LoggingColumns.FalsePositiveRateAtOptimalThreshold.value}," \
        f"{LoggingColumns.FalseNegativeRateAtOptimalThreshold.value}," \
        f"{LoggingColumns.OptimalThreshold.value}," \
        f"{LoggingColumns.SubjectCount.value},{LoggingColumns.Epoch.value}," \
        f"{LoggingColumns.CrossValidationSplitIndex.value}\n" + \
        """0.6866141557693481,0.6866141557693481,0.5,0.0001,1.0,1.0,0.5,0.0,0.0,0.529514,2.0,0,-1	
        0.6864652633666992,0.6864652633666992,0.5,9.999712322065557e-05,1.0,1.0,0.5,0.0,0.0,0.529475,2.0,1,-1	
        0.6863163113594055,0.6863162517547607,0.5,9.999306876841536e-05,1.0,1.0,0.5,0.0,0.0,0.529437,2.0,2,-1	
        0.6861673593521118,0.6861673593521118,0.5,9.998613801725043e-05,1.0,1.0,0.5,0.0,0.0,0.529399,2.0,3,-1	
        """
    check_log_file(epoch_metrics_path,
                   expected_epoch_metrics,
                   ignore_columns=[])
    # Check metrics.csv: This contains the per-subject per-epoch model outputs
    # Randomization comes out slightly different on Windows, hence only execute the test on Linux
    if common_util.is_windows():
        return
    metrics_path = config.outputs_folder / ModelExecutionMode.TRAIN.value / SUBJECT_METRICS_FILE_NAME
    metrics_expected = \
        f"""epoch,subject,prediction_target,model_output,label,data_split,cross_validation_split_index
0,S2,{class_name},0.529514,1,Train,-1
0,S4,{class_name},0.521659,0,Train,-1
1,S4,{class_name},0.521482,0,Train,-1
1,S2,{class_name},0.529475,1,Train,-1
2,S4,{class_name},0.521305,0,Train,-1
2,S2,{class_name},0.529437,1,Train,-1
3,S2,{class_name},0.529399,1,Train,-1
3,S4,{class_name},0.521128,0,Train,-1
"""
    check_log_file(metrics_path, metrics_expected, ignore_columns=[])
    # Check the SUBJECT_METRICS_FILE_NAME log inside the folder best_validation_epoch/Train, which is written when we run model_test.
    # Normally, we would run it on the Test and Val splits, but for convenience we test on the train split here.
    inference_metrics_path = config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TRAIN) / \
                             SUBJECT_METRICS_FILE_NAME
    inference_metrics_expected = \
        f"""prediction_target,subject,model_output,label,epoch,cross_validation_split_index,data_split
{class_name},S2,0.5293986201286316,1.0,{BEST_EPOCH_FOLDER_NAME},-1,Train
{class_name},S4,0.5211275815963745,0.0,{BEST_EPOCH_FOLDER_NAME},-1,Train
"""
    check_log_file(inference_metrics_path,
                   inference_metrics_expected,
                   ignore_columns=[])

    inference_model_output_path = config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TRAIN) / \
                                  model_testing.MODEL_OUTPUT_CSV
    inference_model_output_expected = \
        f"""subject,prediction_target,label,model_output,cross_validation_split_index
S2,{class_name},1.000000,0.529399,-1
S4,{class_name},0.000000,0.521128,-1"""
    check_log_file(inference_model_output_path,
                   inference_model_output_expected,
                   ignore_columns=[])
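# Aside (illustration only): pytest.approx compares whole sequences element-wise with
# the given tolerance, which is how the loss, learning-rate and metric histories are
# asserted in the test above.
import pytest

assert [0.1000001, 0.2] == pytest.approx([0.1, 0.2], abs=1e-5)
assert not ([0.11, 0.2] == pytest.approx([0.1, 0.2], abs=1e-5))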
Example #11
 def get_epoch_path(mode: ModelExecutionMode) -> Path:
     p = get_best_epoch_results_path(mode=mode)
     return config.outputs_folder / p / SUBJECT_METRICS_FILE_NAME
def test_model_test(test_output_dirs: OutputFolderForTests,
                    use_partial_ground_truth: bool,
                    allow_partial_ground_truth: bool) -> None:
    """
    Check the CSVs (and image files) written by InnerEye.ML.model_testing.segmentation_model_test.
    :param test_output_dirs: The fixture in conftest.py.
    :param use_partial_ground_truth: Whether to remove some ground truth labels from some test subjects.
    :param allow_partial_ground_truth: The value to set the allow_incomplete_labels flag to.
    """
    train_and_test_data_dir = full_ml_test_data_path("train_and_test_data")
    seed_everything(42)
    config = DummyModel()
    config.allow_incomplete_labels = allow_partial_ground_truth
    config.set_output_to(test_output_dirs.root_dir)
    placeholder_dataset_id = "place_holder_dataset_id"
    config.azure_dataset_id = placeholder_dataset_id
    transform = config.get_full_image_sample_transforms().test
    df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME))

    if use_partial_ground_truth:
        config.check_exclusive = False
        config.ground_truth_ids = ["region", "region_1"]

        # As in Tests.ML.pipelines.test.inference.test_evaluate_model_predictions patients 3, 4,
        # and 5 are in the test dataset with:
        # Patient 3 has one missing ground truth channel: "region"
        df = df[df["subject"].ne(3) | df["channel"].ne("region")]
        # Patient 4 has all missing ground truth channels: "region", "region_1"
        df = df[df["subject"].ne(4) | df["channel"].ne("region")]
        df = df[df["subject"].ne(4) | df["channel"].ne("region_1")]
        # Patient 5 has no missing ground truth channels.

        config.dataset_data_frame = df

        df = df[df.subject.isin([3, 4, 5])]

        config.train_subject_ids = ['1', '2']
        config.test_subject_ids = ['3', '4', '5']
        config.val_subject_ids = ['6', '7']
    else:
        df = df[df.subject.isin([1, 2])]

    if use_partial_ground_truth and not allow_partial_ground_truth:
        with pytest.raises(ValueError) as value_error:
            # noinspection PyTypeHints
            config._datasets_for_inference = {
                ModelExecutionMode.TEST:
                FullImageDataset(config,
                                 df,
                                 full_image_sample_transforms=transform)
            }  # type: ignore
        assert "Patient 3 does not have channel 'region'" in str(
            value_error.value)
        return
    else:
        # noinspection PyTypeHints
        config._datasets_for_inference = {
            ModelExecutionMode.TEST:
            FullImageDataset(config,
                             df,
                             full_image_sample_transforms=transform)
        }  # type: ignore
    execution_mode = ModelExecutionMode.TEST
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)
    # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder.
    create_model_and_store_checkpoint(
        config,
        config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX)
    checkpoint_handler.additional_training_done()
    inference_results = model_testing.segmentation_model_test(
        config,
        execution_mode=execution_mode,
        checkpoint_paths=checkpoint_handler.get_checkpoints_to_test())
    epoch_dir = config.outputs_folder / get_best_epoch_results_path(
        execution_mode)
    total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower()
    if not total_num_patients_column_name.endswith("s"):
        total_num_patients_column_name += "s"

    if use_partial_ground_truth:
        num_subjects = len(pd.unique(df["subject"]))
        if allow_partial_ground_truth:
            assert csv_column_contains_value(
                csv_file_path=epoch_dir / METRICS_AGGREGATES_FILE,
                column_name=total_num_patients_column_name,
                value=num_subjects,
                contains_only_value=True)
            assert csv_column_contains_value(
                csv_file_path=epoch_dir / SUBJECT_METRICS_FILE_NAME,
                column_name=MetricsFileColumns.Dice.value,
                value='',
                contains_only_value=False)
    else:
        aggregates_df = pd.read_csv(epoch_dir / METRICS_AGGREGATES_FILE)
        assert total_num_patients_column_name not in aggregates_df.columns  # Only added if using partial ground truth

        assert not csv_column_contains_value(
            csv_file_path=epoch_dir / SUBJECT_METRICS_FILE_NAME,
            column_name=MetricsFileColumns.Dice.value,
            value='',
            contains_only_value=False)

        assert inference_results.metrics == pytest.approx(0.66606902, abs=1e-6)
        assert config.outputs_folder.is_dir()
        assert epoch_dir.is_dir()
        patient1 = io_util.load_nifti_image(train_and_test_data_dir /
                                            "id1_channel1.nii.gz")
        patient2 = io_util.load_nifti_image(train_and_test_data_dir /
                                            "id2_channel1.nii.gz")

        assert_file_contains_string(epoch_dir / DATASET_ID_FILE,
                                    placeholder_dataset_id)
        assert_file_contains_string(epoch_dir / GROUND_TRUTH_IDS_FILE,
                                    "region")
        assert_text_files_match(
            epoch_dir / model_testing.SUBJECT_METRICS_FILE_NAME,
            train_and_test_data_dir / model_testing.SUBJECT_METRICS_FILE_NAME)
        assert_text_files_match(
            epoch_dir / model_testing.METRICS_AGGREGATES_FILE,
            train_and_test_data_dir / model_testing.METRICS_AGGREGATES_FILE)
        # Plotting results vary between platforms. Can only check if the file is generated, but not its contents.
        assert (epoch_dir / model_testing.BOXPLOT_FILE).exists()

        assert_nifti_content(epoch_dir / "001" / "posterior_region.nii.gz",
                             get_image_shape(patient1), patient1.header, [137],
                             np.ubyte)
        assert_nifti_content(epoch_dir / "002" / "posterior_region.nii.gz",
                             get_image_shape(patient2), patient2.header, [137],
                             np.ubyte)
        assert_nifti_content(epoch_dir / "001" / DEFAULT_RESULT_IMAGE_NAME,
                             get_image_shape(patient1), patient1.header, [1],
                             np.ubyte)
        assert_nifti_content(epoch_dir / "002" / DEFAULT_RESULT_IMAGE_NAME,
                             get_image_shape(patient2), patient2.header, [1],
                             np.ubyte)
        assert_nifti_content(epoch_dir / "001" / "posterior_background.nii.gz",
                             get_image_shape(patient1), patient1.header, [117],
                             np.ubyte)
        assert_nifti_content(epoch_dir / "002" / "posterior_background.nii.gz",
                             get_image_shape(patient2), patient2.header, [117],
                             np.ubyte)
        thumbnails_folder = epoch_dir / model_testing.THUMBNAILS_FOLDER
        assert thumbnails_folder.is_dir()
        png_files = list(thumbnails_folder.glob("*.png"))
        overlays = [f for f in png_files if "_region_slice_" in str(f)]
        assert len(overlays) == len(df.subject.unique()), \
            "There should be one overlay/contour file per subject"

        # Writing dataset.csv normally happens at the beginning of training,
        # but this test reads off a saved checkpoint file.
        # Dataset.csv must be present for plot_cross_validation.
        config.write_dataset_files()
        # Test if the metrics files can be picked up correctly by the cross validation code
        config_and_files = get_config_and_results_for_offline_runs(config)
        result_files = config_and_files.files
        assert len(result_files) == 1
        for file in result_files:
            assert file.execution_mode == execution_mode
            assert file.dataset_csv_file is not None
            assert file.dataset_csv_file.exists()
            assert file.metrics_file is not None
            assert file.metrics_file.exists()
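# A hedged sketch (hypothetical helper, not the InnerEye csv_column_contains_value
# implementation) of what the partial-ground-truth assertions above check: whether the
# Dice column of the subject metrics CSV contains any empty entries for structures that
# had no ground truth.
import pandas as pd

def column_has_empty_values(csv_path, column_name: str) -> bool:
    """Return True if any cell in the given column is empty (read as NaN by pandas)."""
    return bool(pd.read_csv(csv_path)[column_name].isna().any())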