def model_train_unittest(config: Optional[DeepLearningConfig],
                         dirs: OutputFolderForTests,
                         checkpoint_handler: Optional[CheckpointHandler] = None,
                         lightning_container: Optional[LightningContainer] = None) -> \
        Tuple[StoringLogger, CheckpointHandler]:
    """
    Convenience wrapper for running model training from within the unit test suite. An MLRunner is created
    for the given config/container and set up without mounting or downloading datasets, then training is run.
    If no checkpoint handler is supplied, one is built from the default Azure config, with its project root
    pointing at the test output folder given in dirs.

    :param config: The configuration of the model to train.
    :param dirs: The test fixture that provides an output folder for the test.
    :param checkpoint_handler: The checkpoint handler that should be used for training. If not provided,
        a CheckpointHandler is created from get_default_azure_config() and dirs.root_dir.
    :param lightning_container: An optional LightningContainer object that is passed through to the MLRunner.
    :return: A tuple of (storing logger returned by model_train, checkpoint handler that was used).
    """
    runner = MLRunner(model_config=config, container=lightning_container)
    # Setup sets the random seeds before model creation, and puts the model into the container. The container
    # that was initialized this way is what gets used for training below.
    # For all tests running in AzureML, the dataset download that would otherwise happen must be skipped,
    # because all unit test configs come with their own local dataset already.
    runner.setup(use_mount_or_download_dataset=False)
    container = runner.container
    if checkpoint_handler is None:
        checkpoint_handler = CheckpointHandler(azure_config=get_default_azure_config(),
                                               container=container,
                                               project_root=dirs.root_dir)
    # Only the second element of the training result (the storing logger) is of interest here.
    _unused, storing_logger = model_train(checkpoint_handler=checkpoint_handler, container=container)
    return storing_logger, checkpoint_handler  # type: ignore
def test_non_image_encoder(test_output_dirs: OutputFolderForTests,
                           hidden_layer_num_feature_channels: Optional[int]) -> None:
    """
    Test if we can build a simple MLP model that only feeds off non-image features.
    A fake dataset CSV is written to a sub-folder of the test output directory, the model is trained on it,
    and inference is then run on the checkpoints produced by training.

    :param test_output_dirs: The test fixture that provides an output folder for the test.
    :param hidden_layer_num_feature_channels: Passed through to the NonImageEncoder config.
        NOTE(review): presumably supplied via test parametrization defined outside this block - confirm.
    """
    dataset_folder = Path(test_output_dirs.make_sub_dir("dataset"))
    dataset_contents = _get_fake_dataset_contents()
    (dataset_folder / DATASET_CSV_FILE_NAME).write_text(dataset_contents)
    config = NonImageEncoder(should_validate=False,
                             hidden_layer_num_feature_channels=hidden_layer_num_feature_channels)
    config.local_dataset = dataset_folder
    config.set_output_to(test_output_dirs.root_dir)
    config.max_batch_grad_cam = 1
    # Validation was skipped in the constructor (should_validate=False), run it once all fields are set.
    config.validate()
    # run model training. The output folder parameter of model_train_unittest is called "dirs"; passing it
    # under any other keyword raises a TypeError.
    _, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
    # run model inference
    runner = MLRunner(config)
    runner.setup()
    runner.model_inference_train_and_test(checkpoint_paths=checkpoint_handler.get_checkpoints_to_test())
    assert config.get_total_number_of_non_imaging_features() == 18
def test_model_name_is_set(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that, after runner setup, the container's model name equals the container class name, and that
    this name appears in the path of the container's outputs folder.
    """
    expected_name = "DummyContainerWithModel"
    dummy_container = DummyContainerWithModel()
    dummy_container.local_dataset = test_output_dirs.root_dir
    runner = MLRunner(model_config=None, container=dummy_container)
    runner.setup()
    container_after_setup = runner.container
    assert container_after_setup._model_name == expected_name
    assert expected_name in str(container_after_setup.outputs_folder)
def _create_container(extra_local_dataset_paths: Optional[List[Path]] = None,
                      extra_azure_dataset_ids: Optional[List[str]] = None) -> LightningContainer:
    """
    Create a DummyContainerWithModel with the given extra datasets, run the MLRunner setup on it, and
    return the container held by the runner afterwards.

    :param extra_local_dataset_paths: Extra local dataset folders to set on the container. If omitted,
        a new empty list is used.
    :param extra_azure_dataset_ids: Extra Azure dataset IDs to set on the container. If omitted,
        a new empty list is used.
    :return: The container of the runner after setup.
    """
    # The defaults are None rather than []: a list literal as default is created only once, and because the
    # argument is stored on the container, all containers created with defaults would share one list object.
    local_paths = [] if extra_local_dataset_paths is None else extra_local_dataset_paths
    azure_ids = [] if extra_azure_dataset_ids is None else extra_azure_dataset_ids
    container = DummyContainerWithModel()
    # NOTE(review): test_output_dirs is not a parameter of this function, it is read from the surrounding
    # scope - presumably an enclosing test function that is outside of this block.
    container.local_dataset = test_output_dirs.root_dir
    container.extra_local_dataset_paths = local_paths  # type: ignore
    container.extra_azure_dataset_ids = azure_ids
    runner = MLRunner(model_config=None, container=container)
    runner.setup()
    return runner.container
def test_optim_params1(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if the optimizer parameters are read correctly for InnerEye configs: the first parameter group
    of the first optimizer must have a learning rate of 1e-3.
    """
    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    runner = MLRunner(model_config=config)
    runner.setup()
    # configure_optimizers returns a pair, of which only the first element (the optimizers) is checked.
    optimizers, _ = runner.container.model.configure_optimizers()
    first_param_group = optimizers[0].param_groups[0]
    assert first_param_group["lr"] == 1e-3
def test_regression_test(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that the file comparison for regression tests is actually called in the workflow.
    The container is pointed to a regression test folder with a random name, and running the workflow
    is expected to raise a ValueError that complains about the missing folder.
    """
    container = DummyContainerWithModel()
    container.local_dataset = test_output_dirs.root_dir
    # uuid4().hex is already a string: a random folder name that should not exist on disk.
    container.regression_test_folder = Path(uuid.uuid4().hex)
    runner = MLRunner(container=container)
    runner.setup()
    with pytest.raises(ValueError) as ex:
        runner.run()
    # Check the message of the exception itself (ex.value). "ex" is pytest's ExceptionInfo wrapper, and its
    # string representation is not the exception message.
    assert "Folder with expected files does not exist" in str(ex.value)
def test_optim_params2(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if the optimizer parameters are read correctly for containers: both the container's l_rate field
    and the first parameter group of the first optimizer must have a learning rate of 1e-1.
    """
    expected_lr = 1e-1
    dummy_container = DummyContainerWithModel()
    dummy_container.local_dataset = test_output_dirs.root_dir
    runner = MLRunner(model_config=None, container=dummy_container)
    runner.setup()
    # configure_optimizers returns a pair, of which only the first element (the optimizers) is checked.
    optimizers, _ = runner.container.model.configure_optimizers()
    assert dummy_container.l_rate == expected_lr
    first_param_group = optimizers[0].param_groups[0]
    assert first_param_group["lr"] == expected_lr
def test_container_hooks(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if the hooks before training are called at the right place and in the right order.
    """
    hook_container = DummyContainerWithHooks()
    hook_container.local_dataset = test_output_dirs.root_dir
    runner = MLRunner(model_config=None, container=hook_container)
    runner.setup()
    runner.run()
    # The hooks in DummyContainerWithHooks itself check that the hooks are called in the right order. Here,
    # only check that they have all been called, by looking for the files in the outputs folder.
    expected_files = ("global_rank_zero.txt", "local_rank_zero.txt", "all_ranks.txt")
    for file in expected_files:
        hook_file = runner.container.outputs_folder / file
        assert hook_file.is_file(), f"Missing file: {file}"
def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test the dataset handling of MLRunner.mount_or_download_dataset for the cases: no dataset specified
    at all, a local dataset folder that does not exist, a local dataset folder that exists, and a dataset
    that is specified by its Azure dataset ID and hence needs to be downloaded.
    """
    dataset_name = "test-dataset"
    config = DummyModel()
    config.local_dataset = None
    config.azure_dataset_id = ""
    azure_config = get_default_azure_config()
    runner = MLRunner(config, azure_config=azure_config)
    # If the model has neither local_dataset or azure_dataset_id, mount_or_download_dataset should fail.
    # This mounting call must happen before any other operations on the container, because already the model
    # creation may need access to the dataset.
    with pytest.raises(ValueError) as ex:
        runner.setup()
    assert ex.value.args[0] == "The model must contain either local_dataset or azure_dataset_id."
    runner.project_root = test_output_dirs.root_dir

    # Pointing the model to a dataset folder that does not exist should raise an Exception
    fake_folder = runner.project_root / "foo"
    runner.container.local_dataset = fake_folder
    with pytest.raises(FileNotFoundError):
        runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)

    # If the local dataset folder exists, mount_or_download_dataset should not do anything.
    fake_folder.mkdir()
    local_dataset = runner.mount_or_download_dataset(runner.container.azure_dataset_id,
                                                     runner.container.local_dataset)
    assert local_dataset == fake_folder

    # Pointing the model to a dataset in Azure should trigger a download
    runner.container.local_dataset = None
    runner.container.azure_dataset_id = dataset_name
    with logging_section("Starting download"):
        result_path = runner.mount_or_download_dataset(runner.container.azure_dataset_id,
                                                       runner.container.local_dataset)
    # Download goes into <project_root> / "datasets" / "test-dataset"
    expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
    assert result_path == expected_path
    assert result_path.is_dir()
    dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
    assert dataset_csv.is_file()
    # Check that each individual file in the dataset is present
    for folder in [1, *range(10, 20)]:
        sub_folder = result_path / str(folder)
        # The result of is_dir() must be asserted, a bare call would silently discard the check.
        assert sub_folder.is_dir()
        for file in ["ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
            f = (sub_folder / file).with_suffix(".nii.gz")
            assert f.is_file()
def test_file_system_with_subfolders(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if a subfolder can be created within the output folder structure, for use with cross validation.
    """
    config = DummyModel()
    config.set_output_to(test_output_dirs.root_dir)
    # File system should be copied from model config to container
    wrapping_container = InnerEyeContainer(config)
    assert wrapping_container.file_system_config == config.file_system_config

    # Without an output subfolder, the outputs folder must end with the model name.
    plain_runner = MLRunner(model_config=config)
    plain_runner.setup()
    plain_outputs_folder = plain_runner.container.outputs_folder
    assert str(plain_outputs_folder).endswith(config.model_name)

    # With an output subfolder, the outputs folder must be that subfolder inside the plain outputs folder.
    output_subfolder = "foo"
    expected_folder = plain_outputs_folder / output_subfolder
    subfolder_runner = MLRunner(model_config=config, output_subfolder=output_subfolder)
    subfolder_runner.setup()
    assert subfolder_runner.container.outputs_folder == expected_folder
def test_model_inference_on_single_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test inference on the model of a previous training run: the metrics files written by the training run
    are downloaded, inference is run locally with a HelloContainer (azure_config.train is False), and the
    single value in each inference metrics file must match the value in the training run's file.
    """
    fallback_run_id = FALLBACK_HELLO_CONTAINER_RUN
    files_to_check = ["test_mse.txt", "test_mae.txt"]

    # The training run must have produced all the files that are compared later.
    training_run = get_most_recent_run(fallback_run_id_for_local_execution=fallback_run_id)
    all_training_files = training_run.get_file_names()
    for file in files_to_check:
        assert f"outputs/{file}" in all_training_files, f"{file} is missing"

    # Download the metrics files of the training run into a separate folder.
    training_folder = test_output_dirs.root_dir / "training"
    training_folder.mkdir()
    training_files = []
    for file in files_to_check:
        download_path = training_folder / file
        training_run.download_file(f"outputs/{file}", output_file_path=str(download_path))
        training_files.append(download_path)

    # Run inference only, on the most recent model.
    container = HelloContainer()
    container.set_output_to(test_output_dirs.root_dir)
    container.model_id = get_most_recent_model_id(fallback_run_id_for_local_execution=fallback_run_id)
    azure_config = get_default_azure_config()
    azure_config.train = False
    ml_runner = MLRunner(container=container, azure_config=azure_config, project_root=test_output_dirs.root_dir)
    ml_runner.setup()
    ml_runner.run()

    inference_files = [container.outputs_folder / file for file in files_to_check]
    for inference_file in inference_files:
        assert inference_file.exists(), f"{inference_file} is missing"

    for training_file, inference_file in zip(training_files, inference_files):
        training_lines = training_file.read_text().splitlines()
        inference_lines = inference_file.read_text().splitlines()
        # We expect all the files we are reading to have a single float value
        assert len(training_lines) == 1
        (training_line,) = training_lines
        train_value = float(training_line.strip())
        assert len(inference_lines) == 1
        (inference_line,) = inference_lines
        inference_value = float(inference_line.strip())
        assert inference_value == pytest.approx(train_value, 1e-6)
def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests,
                                        is_offline_run: bool,
                                        local_dataset: Optional[Path],
                                        azure_dataset: str,
                                        is_lightning_model: bool) -> LightningContainer:
    """
    Helper that runs the MLRunner setup with the dataset download and dataset mounting functions mocked out,
    and returns the container of the runner afterwards.

    :param test_output_dirs: The test fixture that provides an output folder for the test.
    :param is_offline_run: The value that MLRunner.is_offline_run is patched to.
    :param local_dataset: The local dataset folder to set on the container or model config.
    :param azure_dataset: The Azure dataset ID to set on the container or model config.
    :param is_lightning_model: If True, use a DummyContainerWithDatasets. If False, use a DummyModel config,
        and copy the test data into all folders that could be used as the dataset folder.
    :return: The container of the runner after setup.
    """
    config: Optional[DeepLearningConfig] = None
    container: Optional[LightningContainer] = None
    if is_lightning_model:
        container = DummyContainerWithDatasets()
        container.azure_dataset_id = azure_dataset
        container.local_dataset = local_dataset
    else:
        config = DummyModel()
        config.azure_dataset_id = azure_dataset
        config.local_dataset = local_dataset

    # These folders are what the mocked download and mount functions return.
    download_path = test_output_dirs.root_dir / "downloaded"
    mount_path = test_output_dirs.root_dir / "mounted"
    if not is_lightning_model:
        # The legacy InnerEye models require an existing dataset_csv file present in the dataset folder.
        # Create that in every folder that can end up being the dataset folder.
        train_and_test_data = "train_and_test_data"
        for dataset_root in (download_path, mount_path, test_output_dirs.root_dir):
            data_destination = dataset_root / train_and_test_data
            # If destination folder exists, delete content to ensure consistency and avoid 'FileExistsError'
            if data_destination.is_dir():
                shutil.rmtree(data_destination)
            # Creates directory structure and copy data
            shutil.copytree(full_ml_test_data_path(train_and_test_data), data_destination)
            # Copy 'data.csv' file
            shutil.copy(full_ml_test_data_path(DATASET_CSV_FILE_NAME), dataset_root / DATASET_CSV_FILE_NAME)

    # The patches are entered left to right, exactly like three nested "with" statements.
    with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run), \
            mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path), \
            mock.patch("InnerEye.ML.run_ml.try_to_mount_input_dataset", return_value=mount_path):
        runner = MLRunner(config,
                          container=container,
                          azure_config=None,
                          project_root=test_output_dirs.root_dir)
        runner.setup()
        return runner.container