def test_get_recovery_path_train(test_output_dirs: OutputFolderForTests) -> None:
    """
    A checkpoint handler created from a default config, with nothing else set on it, must report
    no recovery or checkpoint path for training.
    """
    config = ModelConfigBase(should_validate=False)
    root_dir = test_output_dirs.root_dir
    config.set_output_to(root_dir)
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)

    training_start_path = handler.get_recovery_or_checkpoint_path_train()
    assert training_start_path is None
def test_use_local_weights_file(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check how the checkpoint handler resolves model weights: nothing is found when no option is set,
    and both a weights URL and a local weights file are expected to end up as the same file in
    the outputs folder.
    """
    config = ModelConfigBase(should_validate=False)
    root_dir = test_output_dirs.root_dir
    config.set_output_to(root_dir)
    config.outputs_folder.mkdir()

    # Start without any checkpoint handling options: neither run recovery nor weights are expected.
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)
    handler.download_recovery_checkpoints_or_weights()
    assert not handler.run_recovery
    assert not handler.local_weights_path

    # Both weight sources exercised below are expected to resolve to this single file.
    weights_target = handler.model_config.outputs_folder / WEIGHTS_FILE

    # Case 1: weights are given via weights_url.
    handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    handler.download_recovery_checkpoints_or_weights()
    assert handler.local_weights_path == weights_target
    assert handler.local_weights_path.is_file()

    # Case 2: weights are given via local_weights_path, with the URL cleared again.
    config.weights_url = ""
    local_file = root_dir / "exist.pth"
    create_checkpoint_file(local_file)
    config.local_weights_path = local_file
    handler.download_recovery_checkpoints_or_weights()
    assert handler.local_weights_path == weights_target
Example #3
0
def test_download_checkpoints(test_output_dirs: OutputFolderForTests, is_ensemble: bool,
                              runner_config: AzureConfig) -> None:
    """
    Download checkpoints from a recovery run (single or ensemble, chosen by `is_ensemble`) and check
    that the epoch-1 checkpoint paths reported by the RunRecovery object match the expected
    locations below the checkpoint folder, and that those files exist.
    """
    root_dir = test_output_dirs.root_dir
    assert get_results_blob_path("some_run_id") == "azureml/ExperimentRun/dcid.some_run_id"
    # Any recent run ID from a PR build will do. Use a PR build because the checkpoint files are small there.
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)

    if is_ensemble:
        runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    else:
        runner_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config)
    run_to_recover = fetch_run(workspace=runner_config.get_workspace(),
                               run_recovery_id=runner_config.run_recovery_id)
    checkpoint_name = "1" + CHECKPOINT_FILE_SUFFIX

    # Ensemble runs: one checkpoint per child run, in a folder named after the cross validation split index.
    # Single runs: one checkpoint in a folder named after the run ID.
    if is_ensemble:
        expected_files = []
        for child_run in fetch_child_runs(run_to_recover):
            split_index = str(child_run.get_tags()['cross_validation_split_index'])
            expected_files.append(
                config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / split_index / checkpoint_name)
    else:
        expected_files = [config.checkpoint_folder / run_to_recover.id / checkpoint_name]

    checkpoint_paths = run_recovery.get_checkpoint_paths(1)
    if is_ensemble:
        expected_roots = [expected_file.parent for expected_file in expected_files]
        assert len(run_recovery.checkpoints_roots) == len(expected_files)
        for root in run_recovery.checkpoints_roots:
            assert root in expected_roots
        assert len(checkpoint_paths) == len(expected_files)
        for checkpoint_path in checkpoint_paths:
            assert checkpoint_path in expected_files
    else:
        assert len(checkpoint_paths) == 1
        assert checkpoint_paths[0] == expected_files[0]

    for expected_file in expected_files:
        assert expected_file.exists()
def test_get_checkpoints_to_test(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check which checkpoints are selected for testing: first the local weights file while no training
    has happened, then the last checkpoint in the checkpoint folder once training is flagged as done.
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)

    # No training and no run recovery: the local weights file is the only candidate.
    weights_file = root_dir / "exist.pth"
    create_checkpoint_file(weights_file)
    handler.container.local_weights_path = [weights_file]
    handler.download_recovery_checkpoints_or_weights()
    selected = handler.get_checkpoints_to_test()
    assert selected
    assert len(selected) == 1
    assert selected[0] == weights_file

    # Pretend that training has happened by flagging it and creating an empty last-checkpoint file.
    handler.additional_training_done()
    handler.container.checkpoint_folder.mkdir(parents=True)
    last_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    last_checkpoint.touch()
    selected = handler.get_checkpoints_to_test()

    assert selected
    assert len(selected) == 1
    assert selected[0] == last_checkpoint
def test_get_local_weights_path_or_download(test_output_dirs: OutputFolderForTests) -> None:
    """
    Exercise `get_local_checkpoints_path_or_download` in four situations: nothing configured (must raise),
    an existing local weights file, a weights URL, and a repeated call for the same URL.
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)

    # Without any weights source the call has to fail.
    with pytest.raises(ValueError) as ex:
        handler.get_local_checkpoints_path_or_download()
    assert "none of model_id, local_weights_path or weights_url is set in the model config." in ex.value.args[0]

    # An existing local file is expected to be returned as it is.
    existing_file = root_dir / "exist.pth"
    create_checkpoint_file(existing_file)
    handler.container.local_weights_path = [existing_file]
    returned_paths = handler.get_local_checkpoints_path_or_download()
    assert existing_file == returned_paths[0]

    # A URL is expected to produce a file named after the last component of the URL path.
    handler.container.local_weights_path = []
    handler.container.weights_url = [EXTERNAL_WEIGHTS_URL_EXAMPLE]
    first_download = handler.get_local_checkpoints_path_or_download()
    url_file_name = os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
    expected_path = handler.output_params.checkpoint_folder / MODEL_WEIGHTS_DIR_NAME / url_file_name
    assert len(first_download) == 1
    assert first_download[0].is_file()
    assert expected_path == first_download[0]

    # A second call must leave the file's modification time unchanged, i.e. no re-download.
    mtime_before = first_download[0].stat().st_mtime
    second_download = handler.get_local_checkpoints_path_or_download()
    assert len(second_download) == 1
    assert second_download[0].stat().st_mtime == mtime_before
Example #6
0
def test_get_recovery_path_train(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check the recovery path used for training: None while nothing is configured, and the weights file
    in the outputs folder once weights were provided via a URL or via a local file.
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)
    config.outputs_folder.mkdir()
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)

    assert handler.get_recovery_path_train() is None

    # Both weight sources exercised below are expected to resolve to this single file.
    weights_target = handler.output_params.outputs_folder / WEIGHTS_FILE

    # Case 1: weights come from weights_url.
    handler.azure_config.run_recovery_id = ""
    handler.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    handler.download_recovery_checkpoints_or_weights()
    assert handler.local_weights_path == weights_target
    assert handler.get_recovery_path_train() == weights_target

    # Case 2: weights come from local_weights_path, with the URL cleared again.
    handler.container.weights_url = ""
    local_file = root_dir / "exist.pth"
    create_checkpoint_file(local_file)
    handler.container.local_weights_path = local_file
    handler.download_recovery_checkpoints_or_weights()
    assert handler.local_weights_path == weights_target
    assert handler.get_recovery_path_train() == weights_target
def test_use_checkpoint_paths_or_urls(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check `trained_weights_paths` of the checkpoint handler: empty when nothing is configured, a file
    below the checkpoint folder for a weights URL, and the given file itself for a local weights path.
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)

    # Start without any checkpoint handling options.
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)
    handler.download_recovery_checkpoints_or_weights()
    assert not handler.run_recovery
    assert not handler.trained_weights_paths

    # Case 1: weights come from weights_url; the file name is taken from the last URL path component.
    handler.azure_config.run_recovery_id = ""
    handler.container.weights_url = [EXTERNAL_WEIGHTS_URL_EXAMPLE]
    handler.download_recovery_checkpoints_or_weights()
    url_file_name = os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
    expected_download_path = handler.output_params.checkpoint_folder / MODEL_WEIGHTS_DIR_NAME / url_file_name
    assert handler.trained_weights_paths[0] == expected_download_path
    assert handler.trained_weights_paths[0].is_file()

    # Case 2: weights come from local_weights_path, with the URL list cleared again.
    handler.container.weights_url = []
    local_file = root_dir / "exist.pth"
    create_checkpoint_file(local_file)
    handler.container.local_weights_path = [local_file]
    handler.download_recovery_checkpoints_or_weights()
    assert handler.trained_weights_paths[0] == local_file
    assert handler.trained_weights_paths[0].is_file()
def test_download_checkpoints_from_single_run(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Set a run recovery ID for a single (non-ensemble) run on the checkpoint handler and check that the
    recovery checkpoint and the best checkpoint are present in a subfolder of the checkpoint folder.
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)

    # The handler starts without any checkpoint handling options.
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)

    handler.azure_config.run_recovery_id = run_recovery_id
    handler.download_recovery_checkpoints_or_weights()
    assert handler.run_recovery

    # The subfolder is named after the part of the recovery ID that follows the first colon.
    run_folder_name = run_recovery_id.split(":")[1]
    recovered_root = config.checkpoint_folder / run_folder_name
    recovery_checkpoint = create_recovery_checkpoint_path(path=recovered_root)
    best_checkpoint = recovered_root / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    assert handler.run_recovery.checkpoints_roots == [recovered_root]
    for expected_file in (recovery_checkpoint, best_checkpoint):
        assert expected_file.is_file()
def test_get_checkpoints_to_test_single_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    With a run recovery ID set and training flagged as done, the checkpoint to test must be the last
    checkpoint of the current run's checkpoint folder, both before and after that file is created.
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)

    handler.azure_config.run_recovery_id = get_most_recent_run_id(
        fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)

    handler.additional_training_done()
    handler.download_recovery_checkpoints_or_weights()

    selected = handler.get_checkpoints_to_test()
    last_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX

    assert selected
    assert len(selected) == 1
    assert selected[0] == last_checkpoint

    # Create an empty checkpoint file to make it seem like training has happened, then ask again.
    last_checkpoint.touch()
    selected = handler.get_checkpoints_to_test()

    assert selected
    assert len(selected) == 1
    assert selected[0] == last_checkpoint
def test_get_checkpoints_to_test(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check the epoch/checkpoint pairs chosen for testing in three situations: local weights only,
    run recovery combined with a checkpoint of the current run, and a requested epoch without checkpoint.
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)
    config.outputs_folder.mkdir()
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)

    # Situation 1: only a local weights file is available. No training has happened and no run
    # recovery is given, so the local weights should be used, ignoring epochs_to_test.
    config.epochs_to_test = [1, 2]
    weights_file = root_dir / "exist.pth"
    source_checkpoint = create_checkpoint_path(full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(source_checkpoint), weights_file)
    config.local_weights_path = weights_file
    handler.discover_and_download_checkpoints_from_previous_runs()
    selected = handler.get_checkpoints_to_test()
    assert selected
    assert len(selected) == 1
    assert selected[0].epoch == 0
    assert selected[0].checkpoint_paths == [handler.model_config.outputs_folder / WEIGHTS_FILE]

    # Situation 2: run recovery with start epoch 1, so that one epoch comes from the recovered run
    # and one from the checkpoints of the current run.
    handler.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    config.start_epoch = 1
    handler.additional_training_done()
    handler.discover_and_download_checkpoints_from_previous_runs()
    # Copy a stored checkpoint into place as epoch 2 to make it seem like training has happened.
    source_checkpoint = create_checkpoint_path(path=full_ml_test_data_path("checkpoints"), epoch=1)
    current_run_checkpoint = create_checkpoint_path(path=config.checkpoint_folder, epoch=2)
    shutil.copyfile(str(source_checkpoint), str(current_run_checkpoint))

    selected = handler.get_checkpoints_to_test()

    assert selected
    assert len(selected) == 2
    first, second = selected[0], selected[1]
    assert first.epoch == 1
    recovered_folder = config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1]
    assert first.checkpoint_paths == [create_checkpoint_path(path=recovered_folder, epoch=1)]
    assert second.epoch == 2
    assert second.checkpoint_paths == [create_checkpoint_path(path=config.checkpoint_folder, epoch=2)]

    # Situation 3: there is no checkpoint for the requested epoch.
    config.epochs_to_test = [3]
    selected = handler.get_checkpoints_to_test()
    assert selected is None
Example #11
0
def test_copied_properties() -> None:
    """
    Values set on the model config must be visible on an InnerEyeContainer built from that config,
    both for a plain field and for the derived `perform_cross_validation` flag.
    """
    model_config = ModelConfigBase(should_validate=False)
    # azure_dataset_id is a field of DatasetParams
    model_config.azure_dataset_id = "foo"
    # number_of_cross_validation_splits is a field of WorkflowParams
    model_config.number_of_cross_validation_splits = 5
    assert model_config.perform_cross_validation

    wrapped = InnerEyeContainer(model_config)
    assert wrapped.azure_dataset_id == "foo"
    assert wrapped.perform_cross_validation
def test_download_recovery_checkpoints_from_ensemble_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    Using the ID of an ensemble run as the run recovery ID must be rejected with a ValueError
    whose message mentions that the run has child runs.
    """
    config = ModelConfigBase(should_validate=False)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)

    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.download_recovery_checkpoints_or_weights()
    # Match against the raised exception itself: `ex` is pytest's ExceptionInfo wrapper, and its string
    # form is a representation of the wrapper, not the exception message.
    assert "has child runs" in str(ex.value)
def test_download_recovery_single_run(test_output_dirs: OutputFolderForTests,
                                      runner_config: AzureConfig) -> None:
    """
    Download all checkpoints of a single run and check both the recovery checkpoint paths and the
    best checkpoint paths via `check_single_checkpoint`.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    single_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    run_recovery = RunRecovery.download_all_checkpoints_from_run(config, single_run)

    # This fails if there is no recovery checkpoint
    recovery_paths = run_recovery.get_recovery_checkpoint_paths()
    check_single_checkpoint(recovery_paths)
    best_paths = run_recovery.get_best_checkpoint_paths()
    check_single_checkpoint(best_paths)
Example #14
0
def test_config_non_overridable_params() -> None:
    """
    Check error raised if attempt to override non overridable configs
    """
    # Collect every parameter of ModelConfigBase that is not overridable, with its default value.
    overridable = ModelConfigBase.get_overridable_parameters()
    non_overridable_params = {k: v.default for k, v in ModelConfigBase.params().items()
                              if k not in overridable}
    with pytest.raises(ValueError) as ex:
        ModelConfigBase(
            should_validate=False,
            **non_overridable_params
        )
    # The message check must sit outside the `with` block: statements after the raising call inside
    # the block are never executed. The spelling "overriden" is the text the test has always matched.
    assert "The following parameters cannot be overriden" in ex.value.args[0]
def test_download_checkpoints_from_hyperdrive_child_runs(test_output_dirs: OutputFolderForTests) -> None:
    """
    Download checkpoints from the child runs of an ensemble run and check that the best checkpoints
    reported by the handler are exactly the last-checkpoint files in the child folders "0" and "1".
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)
    parent_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    handler.download_checkpoints_from_hyperdrive_child_runs(parent_run)

    expected_checkpoints = []
    for child_index in range(2):
        child_folder = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / str(child_index)
        expected_checkpoints.append(child_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX)

    best_checkpoints = handler.get_best_checkpoints()
    assert best_checkpoints
    assert len(best_checkpoints) == 2
    assert set(expected_checkpoints) == set(best_checkpoints)
def test_get_best_checkpoint_single_run(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check `get_best_checkpoint` in four situations: nothing available (must raise), run recovery only,
    run recovery plus training flagged as done without a new checkpoint, and finally with a best
    checkpoint present in the current run's checkpoint folder.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    # We have not set a run_recovery, nor have we trained, so this should fail to get a checkpoint
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_best_checkpoint()
    # The message check must sit outside the `with` block, otherwise it is never executed.
    assert "no run recovery object provided and no training has been done in this run" in ex.value.args[0]

    run_recovery_id = get_most_recent_run_id(
        fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    # Folder below the checkpoint folder that is expected to hold the recovered run's checkpoints.
    recovered_folder_name = run_recovery_id.split(":")[1]

    # We have set a run_recovery_id now, so this should work: Should download all checkpoints that are available
    # in the run, into a subfolder of the checkpoint folder
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    expected_checkpoint = config.checkpoint_folder / recovered_folder_name / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint_paths[0]

    # From now on, the checkpoint handler will think that the run was started from epoch 1. We should pick up
    # the best checkpoint from the current run, or from the run recovery if the best checkpoint is there
    # and so no checkpoints have been written in the resumed run.
    checkpoint_handler.additional_training_done()
    # go back to non ensemble run recovery
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()

    config.start_epoch = 1
    # There is no checkpoint in the current run - use the one from run_recovery
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    expected_checkpoint = config.checkpoint_folder / recovered_folder_name / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert checkpoint_paths[0] == expected_checkpoint

    # Create an empty best checkpoint file to make it look like training has happened and a better
    # checkpoint was written
    expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    expected_checkpoint.touch()
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint_paths[0]
Example #17
0
def test_download_checkpoints_hyperdrive_run(test_output_dirs: OutputFolderForTests,
                                             runner_config: AzureConfig) -> None:
    """
    Recover every child run of the default ensemble run separately and check that its epoch-1
    checkpoint is reported at, and exists in, a folder named after the child run's ID.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    ensemble_run = fetch_run(runner_config.get_workspace(), DEFAULT_ENSEMBLE_RUN_RECOVERY_ID)
    checkpoint_name = "1" + CHECKPOINT_FILE_SUFFIX
    # recover child runs separately also to test hyperdrive child run recovery functionality
    for child_run in fetch_child_runs(run=ensemble_run):
        expected_file = config.checkpoint_folder / child_run.id / checkpoint_name
        run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config, child_run)
        for checkpoint_path in run_recovery.get_checkpoint_paths(epoch=1):
            assert checkpoint_path == expected_file
        assert expected_file.exists()
Example #18
0
def test_innereye_container_init() -> None:
    """
    Test if the constructor of the InnerEye container copies attributes as expected.
    """
    # One sample field per parameter class: the constructor should copy all fields that belong to
    # either WorkflowParams or DatasetParams from the config object to the container.
    samples = [("weights_url", WorkflowParams),
               ("extra_dataset_mountpoints", DatasetParams)]
    for field_name, owner_class in samples:
        config = ModelConfigBase(should_validate=False)
        assert hasattr(owner_class, field_name)
        assert hasattr(config, field_name)
        setattr(config, field_name, ["foo"])
        container = InnerEyeContainer(config)
        copied_value = getattr(container, field_name)
        assert copied_value == ["foo"]
def test_get_recovery_path_train_single_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    After downloading the checkpoints of a single recovery run, the path to resume training from
    must be the recovery checkpoint path built for the checkpoint folder.
    """
    root_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(root_dir)
    handler = get_default_checkpoint_handler(model_config=config, project_root=root_dir)

    handler.azure_config.run_recovery_id = get_most_recent_run_id(
        fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    handler.download_recovery_checkpoints_or_weights()

    recovery_checkpoint = get_recovery_checkpoint_path(path=config.checkpoint_folder)
    assert handler.get_recovery_or_checkpoint_path_train() == recovery_checkpoint
def test_download_best_checkpoints_ensemble_run(test_output_dirs: OutputFolderForTests,
                                                runner_config: AzureConfig) -> None:
    """
    Download the best checkpoints from the child runs of an ensemble run and check that the folders
    for child runs "0" and "1" exist, and that every reported best checkpoint is a file.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)

    ensemble_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    run_recovery = RunRecovery.download_best_checkpoints_from_child_runs(config, ensemble_run)
    child_runs_root = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME
    assert child_runs_root.is_dir()
    for child_name in ("0", "1"):
        child_folder = child_runs_root / child_name
        assert child_folder.is_dir(), "Child run folder does not exist"
    for checkpoint in run_recovery.get_best_checkpoint_paths():
        assert checkpoint.is_file(), f"File {checkpoint} does not exist"
Example #21
0
 def __init__(self,
              project_root: Path,
              yaml_config_file: Path,
              post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
              model_deployment_hook: Optional[ModelDeploymentHookSignature] = None,
              command_line_args: Optional[List[str]] = None):
     """
     Store all constructor arguments on the instance and create placeholder config objects.

     :param project_root: Stored unchanged as `self.project_root`.
     :param yaml_config_file: Stored unchanged as `self.yaml_config_file`.
     :param post_cross_validation_hook: Optional hook, stored unchanged.
     :param model_deployment_hook: Optional hook, stored unchanged.
     :param command_line_args: Optional list of command line arguments, stored unchanged.
     """
     # File system locations
     self.project_root = project_root
     self.yaml_config_file = yaml_config_file
     # Optional hooks and command line arguments
     self.post_cross_validation_hook = post_cross_validation_hook
     self.model_deployment_hook = model_deployment_hook
     self.command_line_args = command_line_args
     # Both config objects are only placeholders at this point. They are set properly when
     # command line args are parsed.
     self.model_config: ModelConfigBase = ModelConfigBase(azure_dataset_id="")
     self.azure_config: AzureConfig = AzureConfig()
Example #22
0
def test_download_azureml_dataset(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Exercise `MLRunner.mount_or_download_dataset` for: no dataset configured (must raise), a missing
    local dataset folder (must raise), an existing local folder, and a dataset given by its Azure ID.
    """
    dataset_name = "test-dataset"
    config = ModelConfigBase(should_validate=False)
    azure_config = get_default_azure_config()
    runner = MLRunner(config, azure_config)
    runner.project_root = test_output_dirs.root_dir

    # If the model has neither local_dataset or azure_dataset_id, mount_or_download_dataset should fail.
    with pytest.raises(ValueError):
        runner.mount_or_download_dataset()

    # Pointing the model to a dataset folder that does not exist should raise an Exception
    fake_folder = runner.project_root / "foo"
    runner.model_config.local_dataset = fake_folder
    with pytest.raises(FileNotFoundError):
        runner.mount_or_download_dataset()

    # If the local dataset folder exists, mount_or_download_dataset should not do anything.
    fake_folder.mkdir()
    local_dataset = runner.mount_or_download_dataset()
    assert local_dataset == fake_folder

    # Pointing the model to a dataset in Azure should trigger a download
    runner.model_config.local_dataset = None
    runner.model_config.azure_dataset_id = dataset_name
    with logging_section("Starting download"):
        result_path = runner.mount_or_download_dataset()
    # Download goes into <project_root> / <datasets dir> / "test-dataset"
    expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
    assert result_path == expected_path
    assert result_path.is_dir()
    dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
    assert dataset_csv.is_file()
    # Check that each individual file in the dataset is present
    for folder in [1, *range(10, 20)]:
        sub_folder = result_path / str(folder)
        # This check needs the `assert`: a bare `is_dir()` call discards its result and verifies nothing.
        assert sub_folder.is_dir()
        for file in [
                "ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"
        ]:
            f = (sub_folder / file).with_suffix(".nii.gz")
            assert f.is_file()
def test_get_child_paths(is_ensemble: bool, extra_code_directory: str) -> None:
    """
    Check that `MLRunner.get_child_paths` returns the expected project files, and that the file from
    the extra code directory is included only when `extra_code_directory` is set.
    """
    if is_ensemble:
        checkpoints = checkpoint_paths * 2
    else:
        checkpoints = checkpoint_paths
    path_to_root = tests_root_directory().parent
    azure_config = AzureConfig(extra_code_directory=extra_code_directory)
    fake_model = ModelConfigBase(azure_dataset_id="fake_dataset_id")
    ml_runner = MLRunner(model_config=fake_model, azure_config=azure_config, project_root=path_to_root)
    child_paths = ml_runner.get_child_paths(checkpoints)

    # Files that must always be part of the result.
    assert fixed_paths.ENVIRONMENT_YAML_FILE_NAME in child_paths
    assert fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME in child_paths
    for source_file in ("InnerEye/ML/runner.py",
                        "InnerEye/ML/model_testing.py",
                        "InnerEye/Common/fixed_paths.py",
                        "InnerEye/Common/common_util.py"):
        assert str(Path(source_file)) in child_paths

    # This file is only expected when an extra code directory is given.
    trm = str(Path("TestsOutsidePackage/test_register_model.py"))
    if extra_code_directory:
        assert trm in child_paths
    else:
        assert trm not in child_paths
    # relative_to raises if a checkpoint does not live below the root folder.
    assert all([checkpoint.relative_to(path_to_root) for checkpoint in checkpoints])
def test_get_recovery_path_train(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check the recovery path used for training: None while nothing is configured, the weights file in
    the outputs folder when weights come from a URL or a local file and start_epoch is 0, and a
    ValueError when start_epoch is greater than 0 without a run recovery object.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    assert checkpoint_handler.get_recovery_path_train() is None

    # weights from local_weights_path and weights_url will be modified if needed and stored at this location
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE
    resume_error = "Start epoch is > 0, but no run recovery object has been provided to resume training."

    # Set a weights_url to get checkpoint from
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Can't resume training from an external checkpoint
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    # The check sits outside the `with` block so that it actually runs, and it inspects the message
    # text: `ex.value.args` is a tuple and can never be equal to a string.
    assert resume_error in str(ex.value)

    # Set a local_weights_path to get checkpoint from
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    create_checkpoint_file(local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Can't resume training from an external checkpoint
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert resume_error in str(ex.value)
def test_download_model_from_ensemble_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    Set a model id on the checkpoint handler's container, trigger the download, and check that the handler's
    trained_weights_paths are exactly the checkpoint files listed in the downloaded model inference config.

    :param test_output_dirs: Test fixture; its root_dir is used as output folder and project root.
    """
    output_root = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_root)

    # The handler starts out with no checkpoint handling options set.
    handler = get_default_checkpoint_handler(model_config=config, project_root=output_root)

    # Point the handler at a model id (the fallback for local execution is FALLBACK_ENSEMBLE_RUN).
    handler.container.model_id = get_most_recent_model_id(
        fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    handler.download_recovery_checkpoints_or_weights()
    downloaded = handler.trained_weights_paths
    assert downloaded

    # The expected checkpoint files are listed in the inference config stored next to the model.
    model_root = config.checkpoint_folder / MODEL_WEIGHTS_DIR_NAME / FINAL_ENSEMBLE_MODEL_FOLDER
    inference_config = read_model_inference_config(model_root / MODEL_INFERENCE_JSON_FILE_NAME)
    expected = [model_root / relative for relative in inference_config.checkpoint_paths]

    assert len(downloaded) == len(expected)
    assert set(downloaded) == set(expected)
    not_on_disk = [checkpoint for checkpoint in expected if not checkpoint.is_file()]
    assert not not_on_disk
def test_get_recovery_path_train_single_run(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check get_recovery_path_train() when a run recovery id is set: it must raise while start_epoch is unset,
    and return the recovery checkpoint path once a start epoch is provided.

    :param test_output_dirs: Test fixture; its root_dir is used as output folder and project root.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    run_recovery_id = get_most_recent_run_id(
        fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)

    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()

    # We have not set a start_epoch but we are trying to use run_recovery, this should fail
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    # The message check must live outside the `with` block: anything placed after the raising call
    # inside the block is skipped when the exception propagates.
    assert "Run recovery set, but start epoch is 0" in ex.value.args[0]

    # Run recovery with start epoch provided should succeed
    config.start_epoch = 20
    expected_path = create_recovery_checkpoint_path(
        path=config.checkpoint_folder / run_recovery_id.split(":")[1])
    assert checkpoint_handler.get_recovery_path_train() == expected_path
def test_get_local_weights_path_or_download(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Check get_local_weights_path_or_download() for three configurations: no weights source set (must raise),
    an existing local weights file (returned as is), and a weights URL (downloaded once, not re-downloaded).

    :param test_output_dirs: Test fixture; its root_dir is used as the project root.
    """
    config = ModelConfigBase(should_validate=False)
    manage_recovery = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    # If the model has neither local_weights_path or weights_url set, should fail.
    with pytest.raises(ValueError) as ex:
        manage_recovery.get_local_weights_path_or_download()
    # The message check must live outside the `with` block: anything placed after the raising call
    # inside the block is skipped when the exception propagates.
    assert "neither local_weights_path nor weights_url is set in the model config" in ex.value.args[0]

    # If local_weights_path folder exists, get_local_weights_path_or_download should not do anything.
    local_weights_path = manage_recovery.project_root / "exist.pth"
    local_weights_path.touch()
    manage_recovery.model_config.local_weights_path = local_weights_path
    returned_weights_path = manage_recovery.get_local_weights_path_or_download()
    assert local_weights_path == returned_weights_path

    # Pointing the model to a URL should trigger a download
    config.local_weights_path = None
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    downloaded_weights = manage_recovery.get_local_weights_path_or_download()
    # Download goes into <project_root> / MODEL_WEIGHTS_DIR_NAME / <file name taken from the URL path>
    expected_path = manage_recovery.project_root / MODEL_WEIGHTS_DIR_NAME / \
                    os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
    assert downloaded_weights
    assert downloaded_weights.is_file()
    assert expected_path == downloaded_weights

    # try again, should not re-download: an unchanged modification time means the file was not rewritten
    modified_time = downloaded_weights.stat().st_mtime
    downloaded_weights_new = manage_recovery.get_local_weights_path_or_download()
    assert downloaded_weights_new
    assert downloaded_weights_new.stat().st_mtime == modified_time
Example #28
0
def test_config_with_typo() -> None:
    """
    Constructing a ModelConfigBase with a misspelled parameter name must raise a ValueError
    whose message names the unknown parameter.
    """
    with pytest.raises(ValueError) as raised:
        ModelConfigBase(num_epochsi=100)
    error_message = raised.value.args[0]
    assert "The following parameters do not exist: ['num_epochsi']" in error_message
Example #29
0
def test_get_total_number_of_cross_validation_runs() -> None:
    """
    With 2 cross validation splits configured, cross validation must be reported as enabled and the
    total number of cross validation runs must equal the configured number of splits.
    """
    config = ModelConfigBase(should_validate=False)
    config.number_of_cross_validation_splits = 2
    assert config.perform_cross_validation
    total_runs = config.get_total_number_of_cross_validation_runs()
    assert total_runs == config.number_of_cross_validation_splits
def test_get_recovery_path_train(
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Exercise get_recovery_path_train() with no recovery source, with a run recovery id, with an ensemble
    run recovery id (expected to raise), and with weights taken from a URL or from a local file.

    :param test_output_dirs: Test fixture; its root_dir is used as output folder and project root.
    """
    # Message expected when start_epoch > 0 is requested without a run recovery object.
    no_run_recovery_message = "Start epoch is > 0, but no run recovery object has been provided to resume training."

    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    assert checkpoint_handler.get_recovery_path_train() is None

    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()

    # We have not set a start_epoch but we are trying to use run_recovery, this should fail
    # NOTE: all message checks in this test sit outside the `with pytest.raises` blocks, because statements
    # placed after the raising call inside such a block are skipped when the exception propagates.
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Run recovery set, but start epoch is 0" in ex.value.args[0]

    # Run recovery with start epoch provided should succeed
    config.start_epoch = 20
    expected_path = create_checkpoint_path(
        path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1],
        epoch=config.start_epoch)
    assert checkpoint_handler.get_recovery_path_train() == expected_path

    # set an ensemble run as recovery - not supported
    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Found more than one checkpoint for epoch" in ex.value.args[0]

    # weights from local_weights_path and weights_url will be modified if needed and stored at this location
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE

    # Set a weights_url to get checkpoint from
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Can't resume training from an external checkpoint
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    # Compare against args[0]: `args` itself is a tuple and never equals a string.
    assert no_run_recovery_message in ex.value.args[0]

    # Set a local_weights_path to get checkpoint from
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(
        full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Can't resume training from an external checkpoint
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert no_run_recovery_message in ex.value.args[0]