Example 1
def test_copy_child_paths_to_folder(is_ensemble: bool,
                                    extra_code_directory: str,
                                    test_output_dirs: OutputFolderForTests) -> None:
    azure_config = AzureConfig(extra_code_directory=extra_code_directory)
    fake_model = SegmentationModelBase(should_validate=False)
    fake_model.set_output_to(test_output_dirs.root_dir)
    # To simulate ensemble models, there are two checkpoints, one in the root dir and one in a folder
    checkpoints_absolute, checkpoints_relative = create_checkpoints(fake_model, is_ensemble)
    # Simulate a project root: We can't derive that from the repository root because that might point
    # into Python's package folder
    project_root = Path(__file__).parent.parent
    ml_runner = MLRunner(model_config=fake_model, azure_config=azure_config, project_root=project_root)
    model_folder = test_output_dirs.root_dir / "final"
    ml_runner.copy_child_paths_to_folder(model_folder=model_folder, checkpoint_paths=checkpoints_absolute)
    expected_files = [
        fixed_paths.ENVIRONMENT_YAML_FILE_NAME,
        fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME,
        "InnerEye/ML/runner.py",
        "InnerEye/ML/model_testing.py",
        "InnerEye/Common/fixed_paths.py",
        "InnerEye/Common/common_util.py",
    ]
    for r in checkpoints_relative:
        expected_files.append(f"{CHECKPOINT_FOLDER}/{r}")
    for expected_file in expected_files:
        assert (model_folder / expected_file).is_file(), f"File missing: {expected_file}"
    trm = model_folder / "TestsOutsidePackage/test_register_model.py"
    if extra_code_directory:
        assert trm.is_file()
    else:
        assert not trm.is_file()
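The test above takes is_ensemble and extra_code_directory as arguments, which in a pytest suite are supplied by parametrize decorators that are not shown here. A minimal sketch of that parametrization, with values that are plausible assumptions rather than copies of the original decorators:

import pytest

# Hypothetical parametrization; the concrete values are assumptions.
@pytest.mark.parametrize("is_ensemble", [True, False])
@pytest.mark.parametrize("extra_code_directory", ["TestsOutsidePackage", ""])
def test_copy_child_paths_to_folder(is_ensemble: bool,
                                    extra_code_directory: str,
                                    test_output_dirs: OutputFolderForTests) -> None:
    ...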
Example 2
def test_score_image_dicom_mock_none(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that DICOM in and DICOM-RT out works.

    Finally, there is no mocking, and full image scoring is run using the PassThroughModel.

    :param test_output_dirs: Test output directories.
    """
    model_config = PassThroughModel()
    model_config.set_output_to(test_output_dirs.root_dir)
    checkpoint_path = model_config.checkpoint_folder / "checkpoint.ckpt"
    create_model_and_store_checkpoint(model_config, checkpoint_path)

    azure_config = AzureConfig()
    project_root = Path(__file__).parent.parent
    ml_runner = MLRunner(model_config=model_config,
                         azure_config=azure_config,
                         project_root=project_root)
    model_folder = test_output_dirs.root_dir / "final"
    ml_runner.copy_child_paths_to_folder(model_folder=model_folder,
                                         checkpoint_paths=[checkpoint_path])

    zipped_dicom_series_path = zip_dicom_series(model_folder)

    score_pipeline_config = ScorePipelineConfig(
        data_folder=zipped_dicom_series_path.parent,
        model_folder=str(model_folder),
        image_files=[str(zipped_dicom_series_path)],
        result_image_name=HNSEGMENTATION_FILE.name,
        use_gpu=False,
        use_dicom=True)

    segmentation = score_image(score_pipeline_config)
    assert_zip_file_contents(segmentation, HN_DICOM_RT_ZIPPED, model_folder)
Example 3
def model_train_unittest(config: Optional[DeepLearningConfig],
                         dirs: OutputFolderForTests,
                         checkpoint_handler: Optional[CheckpointHandler] = None,
                         lightning_container: Optional[LightningContainer] = None) -> \
        Tuple[StoringLogger, CheckpointHandler]:
    """
    A shortcut for running model training in the unit test suite. It runs training for the given config, with the
    default checkpoint handler initialized to point to the test output folder specified in dirs.
    :param config: The configuration of the model to train.
    :param dirs: The test fixture that provides an output folder for the test.
    :param checkpoint_handler: The checkpoint handler that should be used for training. If not provided, a default
        handler pointing to the test output folder is created.
    :param lightning_container: An optional LightningContainer object that will be passed through to the training
        routine.
    :return: A tuple of (storing logger, checkpoint handler).
    """
    runner = MLRunner(model_config=config, container=lightning_container)
    # Setup fixes the random seeds before model creation and stores the created model in the container.
    # Later steps use the container that has been initialized in this way.
    # For all tests running in AzureML, we need to skip the downloading of datasets that would otherwise happen,
    # because all unit test configs come with their own local dataset already.
    runner.setup(use_mount_or_download_dataset=False)
    if checkpoint_handler is None:
        azure_config = get_default_azure_config()
        checkpoint_handler = CheckpointHandler(azure_config=azure_config,
                                               container=runner.container,
                                               project_root=dirs.root_dir)
    _, storing_logger = model_train(checkpoint_handler=checkpoint_handler,
                                    container=runner.container)
    return storing_logger, checkpoint_handler  # type: ignore
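A typical call from a unit test, following the pattern used elsewhere in these examples (DummyClassification and the epoch check are borrowed from Example 29; treat this as a sketch, not canonical usage):

def test_train_dummy_classification(test_output_dirs: OutputFolderForTests) -> None:
    config = DummyClassification()
    config.set_output_to(test_output_dirs.root_dir)
    storing_logger, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
    # Assumed outcome: one logged entry per training epoch, as in Example 29.
    assert len(list(storing_logger.epochs)) == config.num_epochs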
Example 4
def test_model_name_is_set(test_output_dirs: OutputFolderForTests) -> None:
    container = DummyContainerWithModel()
    container.local_dataset = test_output_dirs.root_dir
    runner = MLRunner(model_config=None, container=container)
    runner.setup()
    expected_name = "DummyContainerWithModel"
    assert runner.container._model_name == expected_name
    assert expected_name in str(runner.container.outputs_folder)
Example 5
 def _create_container(
         extra_local_dataset_paths: List[Path] = [],
         extra_azure_dataset_ids: List[str] = []) -> LightningContainer:
     container = DummyContainerWithModel()
     container.local_dataset = test_output_dirs.root_dir
     container.extra_local_dataset_paths = extra_local_dataset_paths  # type: ignore
     container.extra_azure_dataset_ids = extra_azure_dataset_ids
     runner = MLRunner(model_config=None, container=container)
     runner.setup()
     return runner.container
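_create_container is a nested helper: test_output_dirs comes from the fixture of the enclosing test. A hypothetical call from that enclosing test, with dataset names chosen purely for illustration:

container = _create_container(
    extra_local_dataset_paths=[test_output_dirs.root_dir / "extra_dataset"],
    extra_azure_dataset_ids=["extra-azure-dataset"])
# Sketch assertion; the exact values after runner.setup() depend on MLRunner's dataset handling.
assert container.extra_azure_dataset_ids == ["extra-azure-dataset"]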
Example 6
def test_optim_params1(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if the optimizer parameters are read correctly for InnerEye configs.
    """
    model = DummyModel()
    model.set_output_to(test_output_dirs.root_dir)
    runner = MLRunner(model_config=model)
    runner.setup()
    lightning_model = runner.container.model
    optim, _ = lightning_model.configure_optimizers()
    assert optim[0].param_groups[0]["lr"] == 1e-3
Example 7
def test_optim_params2(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if the optimizer parameters are read correctly for containers.
    """
    container = DummyContainerWithModel()
    container.local_dataset = test_output_dirs.root_dir
    runner = MLRunner(model_config=None, container=container)
    runner.setup()
    lightning_model = runner.container.model
    optim, _ = lightning_model.configure_optimizers()
    expected_lr = 1e-1
    assert container.l_rate == expected_lr
    assert optim[0].param_groups[0]["lr"] == expected_lr
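For reference, a minimal LightningModule whose configure_optimizers would satisfy the unpacking and assertions used in the two tests above. This is a generic PyTorch Lightning sketch, not InnerEye's actual model code:

import torch
from pytorch_lightning import LightningModule

class TinyModel(LightningModule):
    def __init__(self, l_rate: float = 1e-1) -> None:
        super().__init__()
        self.l_rate = l_rate
        self.layer = torch.nn.Linear(1, 1)

    def configure_optimizers(self):
        # Returns ([optimizers], [schedulers]), matching `optim, _ = lightning_model.configure_optimizers()`.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.l_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [scheduler]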
Example 8
def test_score_image_dicom_mock_run_store(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that DICOM in and DICOM-RT out works, by mocking out the run and store functions.

    This mocks out run_inference and store_as_ubyte_nifti so that init_from_model_inference_json
    is tested in addition to the tests in test_score_image_dicom_mock_all.

    :param test_output_dirs: Test output directories.
    """
    mock_segmentation = {'mock_segmentation': True}
    model_config = DummyModel()
    model_config.set_output_to(test_output_dirs.root_dir)
    checkpoint_path = model_config.checkpoint_folder / "checkpoint.ckpt"
    create_model_and_store_checkpoint(model_config, checkpoint_path)

    azure_config = AzureConfig()
    project_root = Path(__file__).parent.parent
    ml_runner = MLRunner(model_config=model_config,
                         azure_config=azure_config,
                         project_root=project_root)
    model_folder = test_output_dirs.root_dir / "final"
    ml_runner.copy_child_paths_to_folder(model_folder=model_folder,
                                         checkpoint_paths=[checkpoint_path])

    zipped_dicom_series_path = test_output_dirs.root_dir / "temp_pack_dicom_series" / "dicom_series.zip"
    zip_known_dicom_series(zipped_dicom_series_path)

    score_pipeline_config = ScorePipelineConfig(
        data_folder=zipped_dicom_series_path.parent,
        model_folder=str(model_folder),
        image_files=[str(zipped_dicom_series_path)],
        result_image_name=HNSEGMENTATION_FILE.name,
        use_gpu=False,
        use_dicom=True,
        model_id="Dummy:1")

    with mock.patch('score.run_inference',
                    return_value=mock_segmentation) as mock_run_inference:
        with mock.patch(
                'score.store_as_ubyte_nifti',
                return_value=HNSEGMENTATION_FILE) as mock_store_as_ubyte_nifti:
            segmentation = score_image(score_pipeline_config)
            assert_zip_file_contents(segmentation, HN_DICOM_RT_ZIPPED,
                                     model_folder)

    mock_run_inference.assert_called()
    mock_store_as_ubyte_nifti.assert_called()
Example 9
def test_run_ml_with_sequence_model(use_combined_model: bool,
                                    imaging_feature_type: ImagingFeatureType,
                                    test_output_dirs: TestOutputDirectories) -> None:
    """
    Test training and testing of sequence models, when both are started together via run_ml.
    """
    logging_to_stdout()
    config = ToySequenceModel(use_combined_model, imaging_feature_type,
                              should_validate=False, sequence_target_positions=[2, 10])
    config.set_output_to(test_output_dirs.root_dir)
    config.dataset_data_frame = _get_mock_sequence_dataset()
    config.num_epochs = 1
    config.max_batch_grad_cam = 1

    # Make sure we are testing with at least one sequence position that will not exist, to ensure correct
    # handling of sequences that do not contain all the expected target positions.
    assert max(config.sequence_target_positions) > config.dataset_data_frame[config.sequence_column].astype(float).max()

    # Patch the load_images function that will be called once we access a dataset item
    image_and_seg = ImageAndSegmentations[np.ndarray](images=np.random.uniform(0, 1, SCAN_SIZE),
                                                      segmentations=np.random.randint(0, 2, SCAN_SIZE))
    with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
        azure_config = get_default_azure_config()
        azure_config.train = True
        MLRunner(config, azure_config).run()
Example 10
def test_run_ml_with_classification_model(
        test_output_dirs: OutputFolderForTests,
        number_of_offline_cross_validation_splits: int,
        model_name: str) -> None:
    """
    Test training and testing of classification models, when both are started together via run_ml.
    """
    logging_to_stdout()
    azure_config = get_default_azure_config()
    azure_config.train = True
    config: ScalarModelBase = ModelConfigLoader[ScalarModelBase]() \
        .create_model_config_from_name(model_name)
    config.number_of_cross_validation_splits = number_of_offline_cross_validation_splits
    config.set_output_to(test_output_dirs.root_dir)
    # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
    config.max_num_gpus = 1
    MLRunner(config, azure_config).run()
    _check_offline_cross_validation_output_files(config)

    if config.perform_cross_validation:
        # Test that the result files can be correctly picked up by the cross validation routine.
        # For that, we point the downloader to the local results folder. The core download method
        # recognizes run_recovery_id == None as the signal to read from the local_run_results folder.
        config_and_files = get_config_and_results_for_offline_runs(config)
        result_files = config_and_files.files
        # One file for VAL and one for TRAIN for each child run
        assert len(result_files
                   ) == config.get_total_number_of_cross_validation_runs() * 2
        for file in result_files:
            assert file.execution_mode != ModelExecutionMode.TEST
            assert file.dataset_csv_file is not None
            assert file.dataset_csv_file.exists()
            assert file.metrics_file is not None
            assert file.metrics_file.exists()
Example 11
def test_register_model_skip() -> None:
    """
    If the AzureML workspace can't be read, model registration should be skipped and None should be returned.
    """
    checkpoint_paths = [
        full_ml_test_data_path('checkpoints') / '1_checkpoint.pth.tar'
    ]
    config = get_model_loader().create_model_config_from_name("Lung")
    ml_runner = MLRunner(config, None)
    raises = mock.Mock()
    raises.side_effect = Exception
    with mock.patch.object(AzureConfig, 'get_workspace', raises):
        model, deployment_result = ml_runner.register_segmentation_model(
            model_description="",
            checkpoint_paths=checkpoint_paths,
            model_proc=ModelProcessing.DEFAULT)
    assert model is None
    assert deployment_result is None
Example 12
def test_score_image_dicom_mock_run(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that DICOM in and DICOM-RT out works, by mocking out only the run scoring function.

    This mocks out run_inference so that store_as_ubyte_nifti
    is tested in addition to the tests in test_score_image_dicom_mock_run_store.

    :param test_output_dirs: Test output directories.
    """
    model_config = DummyModel()
    model_config.set_output_to(test_output_dirs.root_dir)
    checkpoint_path = model_config.checkpoint_folder / "checkpoint.ckpt"
    create_model_and_store_checkpoint(model_config, checkpoint_path)

    azure_config = AzureConfig()
    project_root = Path(__file__).parent.parent
    ml_runner = MLRunner(model_config=model_config,
                         azure_config=azure_config,
                         project_root=project_root)
    model_folder = test_output_dirs.root_dir / "final"
    ml_runner.copy_child_paths_to_folder(model_folder=model_folder,
                                         checkpoint_paths=[checkpoint_path])

    zipped_dicom_series_path = zip_dicom_series(model_folder)

    score_pipeline_config = ScorePipelineConfig(
        data_folder=zipped_dicom_series_path.parent,
        model_folder=str(model_folder),
        image_files=[str(zipped_dicom_series_path)],
        result_image_name=HNSEGMENTATION_FILE.name,
        use_gpu=False,
        use_dicom=True)

    image_with_header = io_util.load_nifti_image(HNSEGMENTATION_FILE)

    with mock.patch(
            'score.run_inference',
            return_value=image_with_header.image) as mock_run_inference:
        segmentation = score_image(score_pipeline_config)
        assert_zip_file_contents(segmentation, HN_DICOM_RT_ZIPPED,
                                 model_folder)

    mock_run_inference.assert_called()
Example 13
 def create_ml_runner(self) -> Any:
     """
     Create and return an ML runner using the attributes of this Runner object.
     """
     # This import statement cannot be at the beginning of the file because it will cause import
     # of packages that are not available inside the azure_runner.yml environment: torch, blobxfer.
     # That is also why we specify the return type as Any rather than MLRunner.
     from InnerEye.ML.run_ml import MLRunner
     return MLRunner(self.model_config, self.azure_config,
                     self.project_root, self.model_deployment_hook)
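Hypothetical usage, assuming runner is a Runner object whose model_config and azure_config have already been populated (the setup steps are omitted here):

ml_runner = runner.create_ml_runner()
ml_runner.run()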
Example 14
def test_get_child_paths(is_ensemble: bool, extra_code_directory: str) -> None:
    checkpoints = checkpoint_paths * 2 if is_ensemble else checkpoint_paths
    path_to_root = tests_root_directory().parent
    azure_config = AzureConfig(extra_code_directory=extra_code_directory)
    fake_model = ModelConfigBase(azure_dataset_id="fake_dataset_id")
    ml_runner = MLRunner(model_config=fake_model, azure_config=azure_config, project_root=path_to_root)
    child_paths = ml_runner.get_child_paths(checkpoints)
    assert fixed_paths.ENVIRONMENT_YAML_FILE_NAME in child_paths
    assert fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME in child_paths
    assert str(Path("InnerEye/ML/runner.py")) in child_paths
    assert str(Path("InnerEye/ML/model_testing.py")) in child_paths
    assert str(Path("InnerEye/Common/fixed_paths.py")) in child_paths
    assert str(Path("InnerEye/Common/common_util.py")) in child_paths
    trm = str(Path("TestsOutsidePackage/test_register_model.py"))
    if extra_code_directory:
        assert trm in child_paths
    else:
        assert trm not in child_paths
    assert all([x.relative_to(path_to_root) for x in checkpoints])
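test_get_child_paths relies on a module-level checkpoint_paths list that is not shown above. A minimal sketch of how it could be defined, reusing the checkpoint path from test_register_model_skip (the exact file name is an assumption):

checkpoint_paths = [full_ml_test_data_path('checkpoints') / '1_checkpoint.pth.tar']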
Example 15
def main(yaml_file_path: Path) -> None:
    """
    Invoke either by
      * specifying a model, '--model Lung'
      * or specifying dataset and normalization parameters separately: --azure_dataset_id=foo --norm_method=None
    In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
    """
    config, runner_config, args = get_configs(
        SegmentationModelBase(should_validate=False), yaml_file_path)
    local_dataset = MLRunner(
        config, azure_config=runner_config).mount_or_download_dataset(
            config.azure_dataset_id, config.local_dataset)
    assert local_dataset is not None
    dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
    normalizer_config = NormalizeAndVisualizeConfig(**args)
    actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id
    image_channel = normalizer_config.image_channel or config.image_channels[0]
    if not image_channel:
        raise ValueError(
            "No image channel selected. Specify a model by name, or use the image_channel argument."
        )
    gt_channel = normalizer_config.gt_channel or config.ground_truth_ids[0]
    if not gt_channel:
        raise ValueError(
            "No GT channel selected. Specify a model by name, or use the gt_channel argument."
        )

    dataset_sources = load_dataset_sources(
        dataframe,
        local_dataset_root_folder=local_dataset,
        image_channels=[image_channel],
        ground_truth_channels=[gt_channel],
        mask_channel=actual_mask_channel)
    result_folder = local_dataset
    if normalizer_config.result_folder is not None:
        result_folder = result_folder / normalizer_config.result_folder
    if not result_folder.is_dir():
        result_folder.mkdir()
    all_patient_ids = [*dataset_sources.keys()]
    if normalizer_config.only_first == 0:
        patient_ids_to_process = all_patient_ids
    else:
        patient_ids_to_process = all_patient_ids[:normalizer_config.only_first]
    args_file = result_folder / ARGS_TXT
    args_file.write_text(" ".join(sys.argv[1:]))
    config_file = result_folder / "config.txt"
    config_file.write_text(str(config))
    normalizer = PhotometricNormalization(config)
    for patient_id in patient_ids_to_process:
        print(f"Starting to process patient {patient_id}")
        images = load_images_from_dataset_source(dataset_sources[patient_id])
        plotting.plot_normalization_result(images,
                                           normalizer,
                                           result_folder,
                                           result_prefix=image_channel)
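Hypothetical command lines matching the docstring of main above; the script path and channel names are assumptions chosen for illustration:

# Either select a model by name:
#   python InnerEye/ML/normalize_and_visualize_dataset.py --model Lung --image_channel ct --gt_channel lung_l
# Or specify the dataset and normalization parameters separately:
#   python InnerEye/ML/normalize_and_visualize_dataset.py --azure_dataset_id=foo --norm_method=None \
#       --image_channel ct --gt_channel lung_l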
Example 16
 def create_ml_runner(self) -> MLRunner:
     """
     Create and return an ML runner using the attributes of this Runner object.
     """
     return MLRunner(
         model_config=self.model_config,
         container=self.lightning_container,
         azure_config=self.azure_config,
         project_root=self.project_root,
         post_cross_validation_hook=self.post_cross_validation_hook,
         model_deployment_hook=self.model_deployment_hook)
Example 17
def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> None:
    dataset_name = "test-dataset"
    config = DummyModel()
    config.local_dataset = None
    config.azure_dataset_id = ""
    azure_config = get_default_azure_config()
    runner = MLRunner(config, azure_config=azure_config)
    # If the model has neither local_dataset nor azure_dataset_id, mount_or_download_dataset should fail.
    # This mounting call must happen before any other operations on the container, because model creation itself
    # may already need access to the dataset.
    with pytest.raises(ValueError) as ex:
        runner.setup()
    assert ex.value.args[0] == "The model must contain either local_dataset or azure_dataset_id."
    runner.project_root = test_output_dirs.root_dir

    # Pointing the model to a dataset folder that does not exist should raise an Exception
    fake_folder = runner.project_root / "foo"
    runner.container.local_dataset = fake_folder
    with pytest.raises(FileNotFoundError):
        runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)

    # If the local dataset folder exists, mount_or_download_dataset should not do anything.
    fake_folder.mkdir()
    local_dataset = runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
    assert local_dataset == fake_folder

    # Pointing the model to a dataset in Azure should trigger a download
    runner.container.local_dataset = None
    runner.container.azure_dataset_id = dataset_name
    with logging_section("Starting download"):
        result_path = runner.mount_or_download_dataset(runner.container.azure_dataset_id,
                                                       runner.container.local_dataset)
    # Download goes into <project_root> / "datasets" / "test-dataset"
    expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
    assert result_path == expected_path
    assert result_path.is_dir()
    dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
    assert dataset_csv.is_file()
    # Check that each individual file in the dataset is present
    for folder in [1, *range(10, 20)]:
        sub_folder = result_path / str(folder)
        assert sub_folder.is_dir()
        for file in ["ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
            f = (sub_folder / file).with_suffix(".nii.gz")
            assert f.is_file()
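For reference, the folder layout that the assertions above expect after the download:

# <project_root>/datasets/test-dataset/
#     dataset.csv
#     1/, 10/, 11/, ..., 19/        # one sub-folder per subject
#         ct.nii.gz, esophagus.nii.gz, heart.nii.gz, lung_l.nii.gz, lung_r.nii.gz, spinalcord.nii.gz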
Example 18
def test_non_image_encoder(
        test_output_dirs: OutputFolderForTests,
        hidden_layer_num_feature_channels: Optional[int]) -> None:
    """
    Test if we can build a simple MLP model that only feeds off non-image features.
    """
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    dataset_contents = _get_fake_dataset_contents()
    (dataset_folder / DATASET_CSV_FILE_NAME).write_text(dataset_contents)
    config = NonImageEncoder(
        should_validate=False,
        hidden_layer_num_feature_channels=hidden_layer_num_feature_channels)
    config.local_dataset = dataset_folder
    config.set_output_to(test_output_dirs.root_dir)
    config.max_batch_grad_cam = 1
    config.validate()
    # run model training
    _, checkpoint_handler = model_train_unittest(
        config, dirs=test_output_dirs)
    # run model inference
    runner = MLRunner(config)
    runner.setup()
    runner.model_inference_train_and_test(
        checkpoint_paths=checkpoint_handler.get_checkpoints_to_test())
    assert config.get_total_number_of_non_imaging_features() == 18
Example 19
def test_download_azureml_dataset(
        test_output_dirs: OutputFolderForTests) -> None:
    dataset_name = "test-dataset"
    config = ModelConfigBase(should_validate=False)
    azure_config = get_default_azure_config()
    runner = MLRunner(config, azure_config)
    runner.project_root = test_output_dirs.root_dir

    # If the model has neither local_dataset nor azure_dataset_id, mount_or_download_dataset should fail.
    with pytest.raises(ValueError):
        runner.mount_or_download_dataset()

    # Pointing the model to a dataset folder that does not exist should raise an Exception
    fake_folder = runner.project_root / "foo"
    runner.model_config.local_dataset = fake_folder
    with pytest.raises(FileNotFoundError):
        runner.mount_or_download_dataset()

    # If the local dataset folder exists, mount_or_download_dataset should not do anything.
    fake_folder.mkdir()
    local_dataset = runner.mount_or_download_dataset()
    assert local_dataset == fake_folder

    # Pointing the model to a dataset in Azure should trigger a download
    runner.model_config.local_dataset = None
    runner.model_config.azure_dataset_id = dataset_name
    with logging_section("Starting download"):
        result_path = runner.mount_or_download_dataset()
    # Download goes into <project_root> / "datasets" / "test-dataset"
    expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
    assert result_path == expected_path
    assert result_path.is_dir()
    dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
    assert dataset_csv.is_file()
    # Check that each individual file in the dataset is present
    for folder in [1, *range(10, 20)]:
        sub_folder = result_path / str(folder)
        assert sub_folder.is_dir()
        for file in [
                "ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"
        ]:
            f = (sub_folder / file).with_suffix(".nii.gz")
            assert f.is_file()
Example 20
def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests,
                                        is_offline_run: bool,
                                        local_dataset: Optional[Path],
                                        azure_dataset: str,
                                        is_lightning_model: bool) -> LightningContainer:
    config: Optional[DeepLearningConfig] = None
    container: Optional[LightningContainer] = None
    if is_lightning_model:
        container = DummyContainerWithDatasets()
        container.azure_dataset_id = azure_dataset
        container.local_dataset = local_dataset
    else:
        config = DummyModel()
        config.azure_dataset_id = azure_dataset
        config.local_dataset = local_dataset
    # The legacy InnerEye models require an existing dataset.csv file in the dataset folder. Create that.
    download_path = test_output_dirs.root_dir / "downloaded"
    mount_path = test_output_dirs.root_dir / "mounted"
    if not is_lightning_model:
        train_and_test_data = "train_and_test_data"
        for path in [download_path, mount_path, test_output_dirs.root_dir]:
            # If destination folder exists, delete content to ensure consistency and avoid 'FileExistsError'
            if (path / train_and_test_data).is_dir():
                shutil.rmtree(path / train_and_test_data)

            # Create the directory structure and copy the data
            shutil.copytree(full_ml_test_data_path(train_and_test_data), path / train_and_test_data)
            # Copy the dataset.csv file
            shutil.copy(full_ml_test_data_path(DATASET_CSV_FILE_NAME), path / DATASET_CSV_FILE_NAME)

    with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run):
        with mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path):
            with mock.patch("InnerEye.ML.run_ml.try_to_mount_input_dataset", return_value=mount_path):
                runner = MLRunner(config, container=container,
                                  azure_config=None, project_root=test_output_dirs.root_dir)
                runner.setup()
                return runner.container
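A hypothetical call of this helper that exercises the "AzureML run with a mounted dataset" branch; the dataset name and the final assertion are assumptions about how MLRunner resolves the mocked mount path:

container = _test_mount_for_lightning_container(test_output_dirs,
                                                is_offline_run=False,
                                                local_dataset=None,
                                                azure_dataset="fake-azure-dataset",
                                                is_lightning_model=True)
# Assumed outcome: the container's dataset resolves to the mocked mount location.
assert container.local_dataset == test_output_dirs.root_dir / "mounted"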
Example 21
def test_run_ml_with_multi_label_sequence_model(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test training and testing of sequence models that predict at multiple time points,
    when started via run_ml.
    """
    logging_to_stdout()
    config = ToyMultiLabelSequenceModel(should_validate=False)
    assert config.get_target_indices() == [1, 2, 3]
    expected_prediction_targets = [
        f"{SEQUENCE_POSITION_HUE_NAME_PREFIX} {x}" for x in ["01", "02", "03"]
    ]
    _target_indices = config.get_target_indices()
    assert _target_indices is not None
    assert len(_target_indices) == len(expected_prediction_targets)
    metrics_dict = create_metrics_dict_for_scalar_models(config)
    assert metrics_dict.get_hue_names(
        include_default=False) == expected_prediction_targets
    config.set_output_to(test_output_dirs.root_dir)
    # Create a fake dataset directory to make config validation pass
    config.local_dataset = test_output_dirs.root_dir
    config.dataset_data_frame = _get_multi_label_sequence_dataframe()
    config.pre_process_dataset_dataframe()
    config.num_epochs = 1
    config.max_batch_grad_cam = 1
    azure_config = get_default_azure_config()
    azure_config.train = True
    MLRunner(config, azure_config).run()
    # The metrics file should have one entry per epoch per subject per prediction target,
    # for all three prediction targets.
    metrics_file = config.outputs_folder / "Train" / SUBJECT_METRICS_FILE_NAME
    assert metrics_file.exists()
    metrics = pd.read_csv(metrics_file)
    assert LoggingColumns.Patient.value in metrics
    assert LoggingColumns.Epoch.value in metrics
    assert LoggingColumns.Hue.value in metrics
    assert metrics[LoggingColumns.Hue.value].unique().tolist(
    ) == expected_prediction_targets
    group_by_subject = metrics.groupby(
        by=[LoggingColumns.Patient.value, LoggingColumns.Epoch.value])
    expected_prediction_target_lengths = [3, 2, 3, 3]
    for i, x in enumerate(group_by_subject):
        assert len(x[1]) == expected_prediction_target_lengths[i]
    group_by_subject_and_target = metrics.groupby(by=[
        LoggingColumns.Patient.value, LoggingColumns.Epoch.value,
        LoggingColumns.Hue.value
    ])
    for _, group in group_by_subject_and_target:
        assert len(group) == 1
Example 22
def test_file_system_with_subfolders(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if a subfolder can be created within the output folder structure, for use with cross validation.
    """
    model = DummyModel()
    model.set_output_to(test_output_dirs.root_dir)
    container = InnerEyeContainer(model)
    # File system should be copied from model config to container
    assert container.file_system_config == model.file_system_config
    runner = MLRunner(model_config=model)
    runner.setup()
    assert str(runner.container.outputs_folder).endswith(model.model_name)
    output_subfolder = "foo"
    expected_folder = runner.container.outputs_folder / output_subfolder
    runner = MLRunner(model_config=model, output_subfolder=output_subfolder)
    runner.setup()
    assert runner.container.outputs_folder == expected_folder
Example 23
def test_regression_test(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the file comparison for regression tests is actually called in the workflow.
    """
    container = DummyContainerWithModel()
    container.local_dataset = test_output_dirs.root_dir
    container.regression_test_folder = Path(str(uuid.uuid4().hex))
    runner = MLRunner(container=container)
    runner.setup()
    with pytest.raises(ValueError) as ex:
        runner.run()
    assert "Folder with expected files does not exist" in str(ex)
Example 24
def test_register_model_invalid() -> None:
    ws = get_default_workspace()
    config = get_model_loader().create_model_config_from_name("Lung")
    with pytest.raises(Exception):
        ml_runner = MLRunner(config, None)
        ml_runner.register_segmentation_model(
            run=Run.get_context(),
            workspace=ws,
            best_epoch=0,
            best_epoch_dice=0,
            checkpoint_paths=checkpoint_paths,
            model_proc=ModelProcessing.DEFAULT)
    with pytest.raises(Exception):
        ml_runner = MLRunner(config, get_default_azure_config())
        ml_runner.register_segmentation_model(
            best_epoch=0,
            best_epoch_dice=0,
            checkpoint_paths=checkpoint_paths,
            model_proc=ModelProcessing.DEFAULT)
Example 25
def test_image_encoder_with_segmentation(
        test_output_dirs: OutputFolderForTests, encode_channels_jointly: bool,
        aggregation_type: AggregationType,
        imaging_feature_type: ImagingFeatureType) -> None:
    """
    Test if the image encoder networks can be trained on segmentations from HDF5.
    """
    logging_to_stdout()
    set_random_seed(0)
    scan_size = (6, 64, 60)
    dataset_contents = """subject,channel,path,label
    S1,week0,scan1.h5,
    S1,week1,scan2.h5,True
    S2,week0,scan3.h5,
    S2,week1,scan4.h5,False
    S3,week0,scan5.h5,
    S3,week1,scan6.h5,True
    S4,week0,scan7.h5,
    S4,week1,scan8.h5,True
    """
    config = ImageEncoder(encode_channels_jointly=encode_channels_jointly,
                          imaging_feature_type=imaging_feature_type,
                          should_validate=False,
                          aggregation_type=aggregation_type,
                          scan_size=scan_size)
    # This fails with 16bit precision, saying "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are
    # unsafe to autocast. Many models use a sigmoid layer right before the binary cross entropy layer. In this case,
    # combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits or
    # torch.nn.BCEWithLogitsLoss.  binary_cross_entropy_with_logits and BCEWithLogits are safe to autocast."
    config.use_mixed_precision = False
    config.set_output_to(test_output_dirs.root_dir)
    config.num_epochs = 1
    config.local_dataset = Path()
    config.dataset_data_frame = pd.read_csv(StringIO(dataset_contents),
                                            sep=",",
                                            dtype=str)
    # Patch the load_images function that will be called once we access a dataset item
    image_and_seg = ImageAndSegmentations[np.ndarray](
        images=np.zeros(scan_size, dtype=np.float32),
        segmentations=np.ones(scan_size, dtype=np.uint8))
    with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats',
                    return_value=image_and_seg):
        azure_config = get_default_azure_config()
        azure_config.train = True
        MLRunner(config, azure_config).run()
Example 26
def test_run_ml_with_segmentation_model(
        test_output_dirs: TestOutputDirectories) -> None:
    """
    Test training and testing of segmentation models, when both are started together via run_ml.
    """
    train_config = DummyModel()
    train_config.num_dataload_workers = 0
    train_config.restrict_subjects = "1"
    # Increasing the test crop size should not have any effect on the results.
    # This guards against a bug in an earlier version of the code, where the wrong execution mode was used to
    # compute the expected mask size at training time.
    train_config.test_crop_size = (75, 75, 75)
    train_config.perform_training_set_inference = False
    train_config.perform_validation_and_test_set_inference = True
    train_config.set_output_to(test_output_dirs.root_dir)
    azure_config = get_default_azure_config()
    azure_config.train = True
    MLRunner(train_config, azure_config).run()
Example 27
def test_image_encoder_with_segmentation(
        test_output_dirs: OutputFolderForTests, encode_channels_jointly: bool,
        aggregation_type: AggregationType,
        imaging_feature_type: ImagingFeatureType) -> None:
    """
    Test if the image encoder networks can be trained on segmentations from HDF5.
    """
    logging_to_stdout()
    set_random_seed(0)
    scan_size = (6, 64, 60)
    dataset_contents = """subject,channel,path,label
    S1,week0,scan1.h5,
    S1,week1,scan2.h5,True
    S2,week0,scan3.h5,
    S2,week1,scan4.h5,False
    S3,week0,scan5.h5,
    S3,week1,scan6.h5,True
    S4,week0,scan7.h5,
    S4,week1,scan8.h5,True
    """
    config = ImageEncoder(encode_channels_jointly=encode_channels_jointly,
                          imaging_feature_type=imaging_feature_type,
                          should_validate=False,
                          aggregation_type=aggregation_type,
                          scan_size=scan_size)
    config.use_mixed_precision = True
    config.set_output_to(test_output_dirs.root_dir)
    config.num_epochs = 1
    config.local_dataset = Path()
    config.dataset_data_frame = pd.read_csv(StringIO(dataset_contents),
                                            sep=",",
                                            dtype=str)
    # Patch the load_images function that will be called once we access a dataset item
    image_and_seg = ImageAndSegmentations[np.ndarray](
        images=np.zeros(scan_size, dtype=np.float32),
        segmentations=np.ones(scan_size, dtype=np.uint8))
    with mock.patch("InnerEye.ML.run_ml.is_offline_run_context",
                    return_value=True):
        with mock.patch(
                'InnerEye.ML.utils.io_util.load_image_in_known_formats',
                return_value=image_and_seg):
            azure_config = get_default_azure_config()
            azure_config.train = True
            MLRunner(config, azure_config=azure_config).run()
Example 28
def test_run_ml_with_classification_model(
        test_output_dirs: TestOutputDirectories,
        number_of_offline_cross_validation_splits: int,
        number_of_cross_validation_splits_per_fold: int,
        model_name: str) -> None:
    """
    Test training and testing of classification models, when both are started together via run_ml.
    """
    logging_to_stdout()
    azure_config = get_default_azure_config()
    azure_config.train = True
    train_config: ScalarModelBase = ModelConfigLoader[ScalarModelBase]() \
        .create_model_config_from_name(model_name)
    train_config.number_of_cross_validation_splits = number_of_offline_cross_validation_splits
    train_config.number_of_cross_validation_splits_per_fold = number_of_cross_validation_splits_per_fold
    train_config.set_output_to(test_output_dirs.root_dir)
    if train_config.perform_sub_fold_cross_validation:
        train_config.local_dataset = full_ml_test_data_path(
            "classification_data_sub_fold_cv")
    MLRunner(train_config, azure_config).run()
    _check_offline_cross_validation_output_files(train_config)

    if train_config.is_regression_model:
        assert (train_config.outputs_folder / "0" /
                "error_plot_4.png").is_file()

    if train_config.perform_cross_validation:
        # Test that the result files can be correctly picked up by the cross validation routine.
        # For that, we point the downloader to the local results folder. The core download method
        # recognizes run_recovery_id == None as the signal to read from the local_run_results folder.
        config_and_files = get_config_and_results_for_offline_runs(
            train_config)
        result_files = config_and_files.files
        # One file for VAL and one for TRAIN for each child run
        assert len(
            result_files
        ) == train_config.get_total_number_of_cross_validation_runs() * 2
        for file in result_files:
            assert file.execution_mode != ModelExecutionMode.TEST
            assert file.dataset_csv_file is not None
            assert file.dataset_csv_file.exists()
            assert file.metrics_file is not None
            assert file.metrics_file.exists()
Example 29
def test_runner_restart(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that starting training in a folder whose checkpoints folder already contains recovery checkpoints is
    picked up as a recovery run. Also check that the start epoch in the config is updated at loading time.
    """
    model_config = DummyClassification()
    model_config.set_output_to(test_output_dirs.root_dir)
    model_config.num_epochs = FIXED_EPOCH + 2
    # We save all checkpoints: if recovery works as expected, we should get new checkpoints for
    # epochs FIXED_EPOCH and FIXED_EPOCH + 1.
    model_config.recovery_checkpoint_save_interval = 1
    model_config.recovery_checkpoints_save_last_k = -1
    runner = MLRunner(model_config=model_config)
    runner.setup(use_mount_or_download_dataset=False)
    # Epochs are 0 based for saving
    create_model_and_store_checkpoint(model_config,
                                      runner.container.checkpoint_folder /
                                      f"{RECOVERY_CHECKPOINT_FILE_NAME}_epoch="
                                      f"{FIXED_EPOCH - 1}{CHECKPOINT_SUFFIX}",
                                      weights_only=False)
    azure_config = get_default_azure_config()
    checkpoint_handler = CheckpointHandler(
        azure_config=azure_config,
        container=runner.container,
        project_root=test_output_dirs.root_dir)
    _, storing_logger = model_train(checkpoint_handler=checkpoint_handler,
                                    container=runner.container)
    # We expect to have 4 checkpoints: the initial recovery checkpoint for epoch FIXED_EPOCH - 1, new recovery
    # checkpoints for FIXED_EPOCH and FIXED_EPOCH + 1, and the best checkpoint.
    assert len(os.listdir(runner.container.checkpoint_folder)) == 4
    assert (runner.container.checkpoint_folder /
            f"{RECOVERY_CHECKPOINT_FILE_NAME}_epoch="
            f"{FIXED_EPOCH - 1}{CHECKPOINT_SUFFIX}").exists()
    assert (runner.container.checkpoint_folder /
            f"{RECOVERY_CHECKPOINT_FILE_NAME}_epoch="
            f"{FIXED_EPOCH}{CHECKPOINT_SUFFIX}").exists()
    assert (runner.container.checkpoint_folder /
            f"{RECOVERY_CHECKPOINT_FILE_NAME}_epoch="
            f"{FIXED_EPOCH + 1}{CHECKPOINT_SUFFIX}").exists()
    assert (runner.container.checkpoint_folder /
            BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).exists()
    # Check that we really restarted epoch from epoch FIXED_EPOCH.
    assert list(storing_logger.epochs) == [FIXED_EPOCH,
                                           FIXED_EPOCH + 1]  # type: ignore
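A small optional helper that builds the recovery checkpoint file name used repeatedly above; it relies only on constants that the test already imports:

def recovery_checkpoint_name(epoch: int) -> str:
    # For example "recovery_epoch=3.ckpt" if RECOVERY_CHECKPOINT_FILE_NAME is "recovery" and
    # CHECKPOINT_SUFFIX is ".ckpt" (the exact constant values are assumptions).
    return f"{RECOVERY_CHECKPOINT_FILE_NAME}_epoch={epoch}{CHECKPOINT_SUFFIX}"

assert (runner.container.checkpoint_folder / recovery_checkpoint_name(FIXED_EPOCH)).exists()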
Example 30
def test_run_ml_with_multi_label_sequence_in_crossval(test_output_dirs: TestOutputDirectories) -> None:
    """
    Test training and testing of sequence models that predict at multiple time points,
    including aggregation of cross validation results.
    """
    logging_to_stdout()
    config = ToyMultiLabelSequenceModel(should_validate=False)
    assert config.get_target_indices() == [1, 2, 3]
    expected_prediction_targets = ["Seq_pos 01", "Seq_pos 02", "Seq_pos 03"]
    target_indices = config.get_target_indices()
    assert target_indices
    assert len(target_indices) == len(expected_prediction_targets)
    config.set_output_to(test_output_dirs.root_dir)
    config.dataset_data_frame = _get_multi_label_sequence_dataframe()
    config.pre_process_dataset_dataframe()
    config.num_epochs = 1
    config.number_of_cross_validation_splits = 2
    azure_config = get_default_azure_config()
    azure_config.train = True
    MLRunner(config, azure_config).run()