def test_get_recovery_path_train(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    assert checkpoint_handler.get_recovery_or_checkpoint_path_train() is None
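# For orientation: `get_default_checkpoint_handler` is a shared test helper used throughout this
# module. A minimal sketch of what it is assumed to do - build a checkpoint handler from a default
# AzureConfig and the given model config. The constructor call below is an assumption for
# illustration, not the repository's actual API:
#
#     def get_default_checkpoint_handler(model_config, project_root: Path) -> CheckpointHandler:
#         azure_config = get_default_azure_config()
#         azure_config.run_recovery_id = ""
#         return CheckpointHandler(azure_config=azure_config,
#                                  model_config=model_config,
#                                  project_root=project_root)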
def test_use_local_weights_file(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    # No checkpoint handling options set.
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert not checkpoint_handler.run_recovery
    assert not checkpoint_handler.local_weights_path
    # Weights from local_weights_path and weights_url are modified if needed and stored at this location.
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE
    # Set a weights_url to download from.
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.local_weights_path == expected_path
    assert checkpoint_handler.local_weights_path.is_file()
    # Set a local_weights_path.
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    create_checkpoint_file(local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.local_weights_path == expected_path
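# `create_checkpoint_file` (used above and in several tests below) is a shared test helper. A
# minimal sketch of what it is assumed to do - write a loadable PyTorch checkpoint to the given
# path; the real helper may store a richer state dict:
#
#     import torch
#
#     def create_checkpoint_file(path: Path) -> None:
#         # An empty state dict is enough for tests that only check file existence and paths.
#         torch.save({"state_dict": {}}, str(path))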
def test_download_checkpoints(test_output_dirs: OutputFolderForTests,
                              is_ensemble: bool,
                              runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    assert get_results_blob_path("some_run_id") == "azureml/ExperimentRun/dcid.some_run_id"
    # Any recent run ID from a PR build will do. Use a PR build because the checkpoint files are small there.
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID if is_ensemble else DEFAULT_RUN_RECOVERY_ID
    run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config)
    run_to_recover = fetch_run(workspace=runner_config.get_workspace(),
                               run_recovery_id=runner_config.run_recovery_id)
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    if is_ensemble:
        child_runs = fetch_child_runs(run_to_recover)
        expected_files = [config.checkpoint_folder
                          / OTHER_RUNS_SUBDIR_NAME
                          / str(x.get_tags()['cross_validation_split_index'])
                          / expected_checkpoint_file
                          for x in child_runs]
    else:
        expected_files = [config.checkpoint_folder / run_to_recover.id / expected_checkpoint_file]
    checkpoint_paths = run_recovery.get_checkpoint_paths(1)
    if is_ensemble:
        assert len(run_recovery.checkpoints_roots) == len(expected_files)
        assert all(x in [y.parent for y in expected_files] for x in run_recovery.checkpoints_roots)
        assert len(checkpoint_paths) == len(expected_files)
        assert all(x in expected_files for x in checkpoint_paths)
    else:
        assert len(checkpoint_paths) == 1
        assert checkpoint_paths[0] == expected_files[0]
    assert all(expected_file.exists() for expected_file in expected_files)
def test_get_checkpoints_to_test(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    # Set a local_weights_path to get a checkpoint from. The model has not trained and no run recovery is
    # provided, so the local weights should be used, ignoring any epochs to test.
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    create_checkpoint_file(local_weights_path)
    checkpoint_handler.container.local_weights_path = [local_weights_path]
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    checkpoint_and_paths = checkpoint_handler.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 1
    assert checkpoint_and_paths[0] == local_weights_path

    checkpoint_handler.additional_training_done()
    checkpoint_handler.container.checkpoint_folder.mkdir(parents=True)
    # Create a checkpoint file to make it seem like training has happened.
    expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    expected_checkpoint.touch()
    checkpoint_and_paths = checkpoint_handler.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 1
    assert checkpoint_and_paths[0] == expected_checkpoint
def test_get_local_weights_path_or_download(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    # If the model has neither local_weights_path nor weights_url set, this should fail.
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_local_checkpoints_path_or_download()
    assert "none of model_id, local_weights_path or weights_url is set in the model config." in ex.value.args[0]
    # If the local_weights_path file exists, get_local_checkpoints_path_or_download should not do anything.
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    create_checkpoint_file(local_weights_path)
    checkpoint_handler.container.local_weights_path = [local_weights_path]
    returned_weights_path = checkpoint_handler.get_local_checkpoints_path_or_download()
    assert local_weights_path == returned_weights_path[0]
    # Pointing the model to a URL should trigger a download.
    checkpoint_handler.container.local_weights_path = []
    checkpoint_handler.container.weights_url = [EXTERNAL_WEIGHTS_URL_EXAMPLE]
    downloaded_weights = checkpoint_handler.get_local_checkpoints_path_or_download()
    expected_path = checkpoint_handler.output_params.checkpoint_folder / MODEL_WEIGHTS_DIR_NAME / \
                    os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
    assert len(downloaded_weights) == 1
    assert downloaded_weights[0].is_file()
    assert expected_path == downloaded_weights[0]
    # Try again: the file should not be re-downloaded.
    modified_time = downloaded_weights[0].stat().st_mtime
    downloaded_weights_new = checkpoint_handler.get_local_checkpoints_path_or_download()
    assert len(downloaded_weights_new) == 1
    assert downloaded_weights_new[0].stat().st_mtime == modified_time
def test_get_recovery_path_train(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    assert checkpoint_handler.get_recovery_path_train() is None
    # Weights from local_weights_path and weights_url are modified if needed and stored at this location.
    expected_path = checkpoint_handler.output_params.outputs_folder / WEIGHTS_FILE
    # Set a weights_url to get the checkpoint from.
    checkpoint_handler.azure_config.run_recovery_id = ""
    checkpoint_handler.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.local_weights_path == expected_path
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Set a local_weights_path to get the checkpoint from.
    checkpoint_handler.container.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    create_checkpoint_file(local_weights_path)
    checkpoint_handler.container.local_weights_path = local_weights_path
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.local_weights_path == expected_path
    assert checkpoint_handler.get_recovery_path_train() == expected_path
def test_use_checkpoint_paths_or_urls(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    # No checkpoint handling options set.
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert not checkpoint_handler.run_recovery
    assert not checkpoint_handler.trained_weights_paths
    # Weights from local_weights_path and weights_url are modified if needed and stored locally.
    # Set a weights_url to download from.
    checkpoint_handler.azure_config.run_recovery_id = ""
    checkpoint_handler.container.weights_url = [EXTERNAL_WEIGHTS_URL_EXAMPLE]
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    expected_download_path = checkpoint_handler.output_params.checkpoint_folder / MODEL_WEIGHTS_DIR_NAME / \
                             os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
    assert checkpoint_handler.trained_weights_paths[0] == expected_download_path
    assert checkpoint_handler.trained_weights_paths[0].is_file()
    # Set a local_weights_path.
    checkpoint_handler.container.weights_url = []
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    create_checkpoint_file(local_weights_path)
    checkpoint_handler.container.local_weights_path = [local_weights_path]
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.trained_weights_paths[0] == local_weights_path
    assert checkpoint_handler.trained_weights_paths[0].is_file()
def test_download_checkpoints_from_single_run(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    # No checkpoint handling options set.
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    # Set a run recovery object - non-ensemble.
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.run_recovery
    expected_checkpoint_root = config.checkpoint_folder / run_recovery_id.split(":")[1]
    expected_paths = [create_recovery_checkpoint_path(path=expected_checkpoint_root),
                      expected_checkpoint_root / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
    assert checkpoint_handler.run_recovery.checkpoints_roots == [expected_checkpoint_root]
    for path in expected_paths:
        assert path.is_file()
def test_get_checkpoints_to_test_single_run(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    # Set a run recovery object and mark training as done, so that the handler looks for the
    # checkpoint written by the current training run.
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.additional_training_done()
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    checkpoint_and_paths = checkpoint_handler.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 1
    assert checkpoint_and_paths[0] == config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX

    # Create a checkpoint file to make it seem like training has happened.
    expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    expected_checkpoint.touch()
    checkpoint_and_paths = checkpoint_handler.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 1
    assert checkpoint_and_paths[0] == expected_checkpoint
def test_get_checkpoints_to_test(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    manage_recovery = get_default_checkpoint_handler(model_config=config,
                                                     project_root=test_output_dirs.root_dir)
    # Set a local_weights_path to get a checkpoint from. The model has not trained and no run recovery is
    # provided, so the local weights should be used, ignoring any epochs to test.
    config.epochs_to_test = [1, 2]
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 1
    assert checkpoint_and_paths[0].epoch == 0
    assert checkpoint_and_paths[0].checkpoint_paths == [manage_recovery.model_config.outputs_folder / WEIGHTS_FILE]
    # Now set a run recovery object and set the start epoch to 1, so we get one epoch from
    # run recovery and one from the training checkpoints.
    manage_recovery.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    config.start_epoch = 1
    manage_recovery.additional_training_done()
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    # Copy a checkpoint to make it seem like training has happened.
    stored_checkpoint = create_checkpoint_path(path=full_ml_test_data_path("checkpoints"), epoch=1)
    expected_checkpoint = create_checkpoint_path(path=config.checkpoint_folder, epoch=2)
    shutil.copyfile(str(stored_checkpoint), str(expected_checkpoint))
    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 2
    assert checkpoint_and_paths[0].epoch == 1
    assert checkpoint_and_paths[0].checkpoint_paths == [
        create_checkpoint_path(path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1], epoch=1)]
    assert checkpoint_and_paths[1].epoch == 2
    assert checkpoint_and_paths[1].checkpoint_paths == [create_checkpoint_path(path=config.checkpoint_folder, epoch=2)]
    # This epoch does not exist.
    config.epochs_to_test = [3]
    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
    assert checkpoint_and_paths is None
def test_copied_properties() -> None:
    config = ModelConfigBase(should_validate=False)
    # This field lives in DatasetParams.
    config.azure_dataset_id = "foo"
    # This field lives in WorkflowParams.
    config.number_of_cross_validation_splits = 5
    assert config.perform_cross_validation
    container = InnerEyeContainer(config)
    assert container.azure_dataset_id == "foo"
    assert container.perform_cross_validation
def test_download_recovery_checkpoints_from_ensemble_run(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert "has child runs" in str(ex.value)
def test_download_recovery_single_run(test_output_dirs: OutputFolderForTests,
                                      runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    run_recovery = RunRecovery.download_all_checkpoints_from_run(config, run)
    # This fails if there is no recovery checkpoint.
    check_single_checkpoint(run_recovery.get_recovery_checkpoint_paths())
    check_single_checkpoint(run_recovery.get_best_checkpoint_paths())
def test_config_non_overridable_params() -> None:
    """
    Check that an error is raised when attempting to override non-overridable parameters.
    """
    non_overridable_params = {k: v.default for k, v in ModelConfigBase.params().items()
                              if k not in ModelConfigBase.get_overridable_parameters()}
    with pytest.raises(ValueError) as ex:
        ModelConfigBase(should_validate=False, **non_overridable_params)
    assert "The following parameters cannot be overriden" in ex.value.args[0]
def test_download_checkpoints_from_hyperdrive_child_runs(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    hyperdrive_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    checkpoint_handler.download_checkpoints_from_hyperdrive_child_runs(hyperdrive_run)
    expected_checkpoints = [config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / str(i)
                            / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
                            for i in range(2)]
    checkpoint_paths = checkpoint_handler.get_best_checkpoints()
    assert checkpoint_paths
    assert len(checkpoint_paths) == 2
    assert set(expected_checkpoints) == set(checkpoint_paths)
def test_get_best_checkpoint_single_run(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    # We have not set a run_recovery, nor have we trained, so this should fail to get a checkpoint.
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_best_checkpoint()
    assert "no run recovery object provided and no training has been done in this run" in ex.value.args[0]
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    # Now that a run_recovery_id is set, this should work: all checkpoints that are available in the run
    # are downloaded into a subfolder of the checkpoint folder.
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    expected_checkpoint = config.checkpoint_folder / run_recovery_id.split(":")[1] \
                          / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint_paths[0]
    # From now on, the checkpoint handler will think that the run was started from epoch 1. We should pick up
    # the best checkpoint from the current run, or from the run recovery if the best checkpoint is there
    # and no checkpoints have been written in the resumed run.
    checkpoint_handler.additional_training_done()
    # Go back to a non-ensemble run recovery.
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    config.start_epoch = 1
    # There is no checkpoint in the current run - use the one from run_recovery.
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    expected_checkpoint = config.checkpoint_folder / run_recovery_id.split(":")[1] \
                          / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert checkpoint_paths[0] == expected_checkpoint
    # Create a checkpoint file to make it look like training has happened and a better checkpoint was written.
    expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    expected_checkpoint.touch()
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint_paths[0]
def test_download_checkpoints_hyperdrive_run(test_output_dirs: OutputFolderForTests,
                                             runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    child_runs = fetch_child_runs(run=fetch_run(runner_config.get_workspace(), DEFAULT_ENSEMBLE_RUN_RECOVERY_ID))
    # Recover the child runs separately as well, to test the hyperdrive child run recovery functionality.
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    for child in child_runs:
        expected_files = [config.checkpoint_folder / child.id / expected_checkpoint_file]
        run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config, child)
        assert all(x in expected_files for x in run_recovery.get_checkpoint_paths(epoch=1))
        assert all(expected_file.exists() for expected_file in expected_files)
def test_innereye_container_init() -> None:
    """
    Test if the constructor of the InnerEye container copies attributes as expected.
    """
    # The constructor should copy all fields that belong to either WorkflowParams or DatasetParams from the
    # config object to the container.
    for (attrib, type_) in [("weights_url", WorkflowParams),
                            ("extra_dataset_mountpoints", DatasetParams)]:
        config = ModelConfigBase(should_validate=False)
        assert hasattr(type_, attrib)
        assert hasattr(config, attrib)
        setattr(config, attrib, ["foo"])
        container = InnerEyeContainer(config)
        assert getattr(container, attrib) == ["foo"]
def test_get_recovery_path_train_single_run(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    # Run recovery should succeed and return the recovery checkpoint path in the checkpoint folder.
    expected_path = get_recovery_checkpoint_path(path=config.checkpoint_folder)
    assert checkpoint_handler.get_recovery_or_checkpoint_path_train() == expected_path
def test_download_best_checkpoints_ensemble_run(test_output_dirs: OutputFolderForTests,
                                                runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    run_recovery = RunRecovery.download_best_checkpoints_from_child_runs(config, run)
    other_runs_folder = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME
    assert other_runs_folder.is_dir()
    for child in ["0", "1"]:
        assert (other_runs_folder / child).is_dir(), "Child run folder does not exist"
    for checkpoint in run_recovery.get_best_checkpoint_paths():
        assert checkpoint.is_file(), f"File {checkpoint} does not exist"
def __init__(self,
             project_root: Path,
             yaml_config_file: Path,
             post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
             model_deployment_hook: Optional[ModelDeploymentHookSignature] = None,
             command_line_args: Optional[List[str]] = None):
    self.project_root = project_root
    self.yaml_config_file = yaml_config_file
    self.post_cross_validation_hook = post_cross_validation_hook
    self.model_deployment_hook = model_deployment_hook
    self.command_line_args = command_line_args
    # model_config and azure_config are placeholders for now, and are set properly when the command line
    # arguments are parsed.
    self.model_config: ModelConfigBase = ModelConfigBase(azure_dataset_id="")
    self.azure_config: AzureConfig = AzureConfig()
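# Hedged usage sketch for the constructor above, assuming it belongs to the top-level Runner
# class (the yaml path below is illustrative, not the repository's actual settings file):
#
#     runner = Runner(project_root=Path.cwd(),
#                     yaml_config_file=Path.cwd() / "settings.yml")
#
# model_config and azure_config start out as placeholder instances and are replaced once the
# command line arguments have been parsed.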
def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> None:
    dataset_name = "test-dataset"
    config = ModelConfigBase(should_validate=False)
    azure_config = get_default_azure_config()
    runner = MLRunner(config, azure_config)
    runner.project_root = test_output_dirs.root_dir
    # If the model has neither local_dataset nor azure_dataset_id set, mount_or_download_dataset should fail.
    with pytest.raises(ValueError):
        runner.mount_or_download_dataset()
    # Pointing the model to a dataset folder that does not exist should raise an exception.
    fake_folder = runner.project_root / "foo"
    runner.model_config.local_dataset = fake_folder
    with pytest.raises(FileNotFoundError):
        runner.mount_or_download_dataset()
    # If the local dataset folder exists, mount_or_download_dataset should not do anything.
    fake_folder.mkdir()
    local_dataset = runner.mount_or_download_dataset()
    assert local_dataset == fake_folder
    # Pointing the model to a dataset in Azure should trigger a download.
    runner.model_config.local_dataset = None
    runner.model_config.azure_dataset_id = dataset_name
    with logging_section("Starting download"):
        result_path = runner.mount_or_download_dataset()
    # The download goes into <project_root> / "datasets" / "test-dataset".
    expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
    assert result_path == expected_path
    assert result_path.is_dir()
    dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
    assert dataset_csv.is_file()
    # Check that each individual file in the dataset is present.
    for folder in [1, *range(10, 20)]:
        sub_folder = result_path / str(folder)
        assert sub_folder.is_dir()
        for file in ["ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
            f = (sub_folder / file).with_suffix(".nii.gz")
            assert f.is_file()
def test_get_child_paths(is_ensemble: bool, extra_code_directory: str) -> None:
    checkpoints = checkpoint_paths * 2 if is_ensemble else checkpoint_paths
    path_to_root = tests_root_directory().parent
    azure_config = AzureConfig(extra_code_directory=extra_code_directory)
    fake_model = ModelConfigBase(azure_dataset_id="fake_dataset_id")
    ml_runner = MLRunner(model_config=fake_model, azure_config=azure_config, project_root=path_to_root)
    child_paths = ml_runner.get_child_paths(checkpoints)
    assert fixed_paths.ENVIRONMENT_YAML_FILE_NAME in child_paths
    assert fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME in child_paths
    assert str(Path("InnerEye/ML/runner.py")) in child_paths
    assert str(Path("InnerEye/ML/model_testing.py")) in child_paths
    assert str(Path("InnerEye/Common/fixed_paths.py")) in child_paths
    assert str(Path("InnerEye/Common/common_util.py")) in child_paths
    trm = str(Path("TestsOutsidePackage/test_register_model.py"))
    if extra_code_directory:
        assert trm in child_paths
    else:
        assert trm not in child_paths
    assert all(x.relative_to(path_to_root) for x in checkpoints)
def test_get_recovery_path_train(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    assert checkpoint_handler.get_recovery_path_train() is None
    # Weights from local_weights_path and weights_url are modified if needed and stored at this location.
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE
    # Set a weights_url to get the checkpoint from.
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Resuming training from an external checkpoint is not possible.
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert ex.value.args[0] == "Start epoch is > 0, but no run recovery object has been provided to resume training."
    # Set a local_weights_path to get the checkpoint from.
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    create_checkpoint_file(local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Resuming training from an external checkpoint is not possible.
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert ex.value.args[0] == "Start epoch is > 0, but no run recovery object has been provided to resume training."
def test_download_model_from_ensemble_run(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    # No checkpoint handling options set.
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    model_id = get_most_recent_model_id(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    # Set a registered model to download the weights from.
    checkpoint_handler.container.model_id = model_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.trained_weights_paths
    expected_model_root = config.checkpoint_folder / MODEL_WEIGHTS_DIR_NAME / FINAL_ENSEMBLE_MODEL_FOLDER
    model_inference_config = read_model_inference_config(expected_model_root / MODEL_INFERENCE_JSON_FILE_NAME)
    expected_paths = [expected_model_root / x for x in model_inference_config.checkpoint_paths]
    assert len(checkpoint_handler.trained_weights_paths) == len(expected_paths)
    assert set(checkpoint_handler.trained_weights_paths) == set(expected_paths)
    for path in expected_paths:
        assert path.is_file()
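# For context: MODEL_INFERENCE_JSON_FILE_NAME is assumed to contain a small JSON document that
# lists the checkpoints packaged with the registered model, along the lines of (illustrative,
# not the exact schema):
#
#     {"model_name": "...", "checkpoint_paths": ["checkpoints/best_checkpoint.ckpt"]}
#
# read_model_inference_config parses it into an object whose `checkpoint_paths` attribute the
# assertions above rely on.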
def test_get_recovery_path_train_single_run(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    # We have not set a start_epoch, but we are trying to use run_recovery: this should fail.
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Run recovery set, but start epoch is 0" in ex.value.args[0]
    # Run recovery with a start epoch provided should succeed.
    config.start_epoch = 20
    expected_path = create_recovery_checkpoint_path(path=config.checkpoint_folder / run_recovery_id.split(":")[1])
    assert checkpoint_handler.get_recovery_path_train() == expected_path
def test_get_local_weights_path_or_download(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    manage_recovery = get_default_checkpoint_handler(model_config=config,
                                                     project_root=test_output_dirs.root_dir)
    # If the model has neither local_weights_path nor weights_url set, this should fail.
    with pytest.raises(ValueError) as ex:
        manage_recovery.get_local_weights_path_or_download()
    assert "neither local_weights_path nor weights_url is set in the model config" in ex.value.args[0]
    # If the local_weights_path file exists, get_local_weights_path_or_download should not do anything.
    local_weights_path = manage_recovery.project_root / "exist.pth"
    local_weights_path.touch()
    manage_recovery.model_config.local_weights_path = local_weights_path
    returned_weights_path = manage_recovery.get_local_weights_path_or_download()
    assert local_weights_path == returned_weights_path
    # Pointing the model to a URL should trigger a download.
    config.local_weights_path = None
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    downloaded_weights = manage_recovery.get_local_weights_path_or_download()
    # The download goes into <project_root> / "modelweights" / "resnet18-5c106cde.pth".
    expected_path = manage_recovery.project_root / MODEL_WEIGHTS_DIR_NAME / \
                    os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
    assert downloaded_weights
    assert downloaded_weights.is_file()
    assert expected_path == downloaded_weights
    # Try again: the file should not be re-downloaded.
    modified_time = downloaded_weights.stat().st_mtime
    downloaded_weights_new = manage_recovery.get_local_weights_path_or_download()
    assert downloaded_weights_new
    assert downloaded_weights_new.stat().st_mtime == modified_time
def test_config_with_typo() -> None:
    with pytest.raises(ValueError) as ex:
        ModelConfigBase(num_epochsi=100)
    assert "The following parameters do not exist: ['num_epochsi']" in ex.value.args[0]
def test_get_total_number_of_cross_validation_runs() -> None:
    config = ModelConfigBase(should_validate=False)
    config.number_of_cross_validation_splits = 2
    assert config.perform_cross_validation
    assert config.get_total_number_of_cross_validation_runs() == config.number_of_cross_validation_splits
def test_get_recovery_path_train(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    assert checkpoint_handler.get_recovery_path_train() is None

    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    # We have not set a start_epoch, but we are trying to use run_recovery: this should fail.
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Run recovery set, but start epoch is 0" in ex.value.args[0]
    # Run recovery with a start epoch provided should succeed.
    config.start_epoch = 20
    expected_path = create_checkpoint_path(path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1],
                                           epoch=config.start_epoch)
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Set an ensemble run as recovery - not supported.
    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Found more than one checkpoint for epoch" in ex.value.args[0]
    # Weights from local_weights_path and weights_url are modified if needed and stored at this location.
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE
    # Set a weights_url to get the checkpoint from.
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Resuming training from an external checkpoint is not possible.
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert ex.value.args[0] == "Start epoch is > 0, but no run recovery object has been provided to resume training."
    # Set a local_weights_path to get the checkpoint from.
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Resuming training from an external checkpoint is not possible.
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert ex.value.args[0] == "Start epoch is > 0, but no run recovery object has been provided to resume training."