def test_get_checkpoints_to_test(
        test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    manage_recovery = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    # Set a local_weights_path to get a checkpoint from. The model has not been trained and no run recovery is
    # provided, so the local weights should be used, ignoring any epochs to test.
    config.epochs_to_test = [1, 2]
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(
        full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 1
    assert checkpoint_and_paths[0].epoch == 0
    assert checkpoint_and_paths[0].checkpoint_paths == [
        manage_recovery.model_config.outputs_folder / WEIGHTS_FILE
    ]

    # Now set a run recovery object and set the start epoch to 1, so we get one epoch from
    # run recovery and one from the training checkpoints
    manage_recovery.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    config.start_epoch = 1
    manage_recovery.additional_training_done()
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    # Copy checkpoint to make it seem like training has happened
    stored_checkpoint = create_checkpoint_path(
        path=full_ml_test_data_path("checkpoints"), epoch=1)
    expected_checkpoint = create_checkpoint_path(path=config.checkpoint_folder,
                                                 epoch=2)
    shutil.copyfile(str(stored_checkpoint), str(expected_checkpoint))

    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()

    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 2
    assert checkpoint_and_paths[0].epoch == 1
    assert checkpoint_and_paths[0].checkpoint_paths == [
        create_checkpoint_path(path=config.checkpoint_folder /
                               DEFAULT_RUN_RECOVERY_ID.split(":")[1],
                               epoch=1)
    ]
    assert checkpoint_and_paths[1].epoch == 2
    assert checkpoint_and_paths[1].checkpoint_paths == [
        create_checkpoint_path(path=config.checkpoint_folder, epoch=2)
    ]

    # This epoch does not exist
    config.epochs_to_test = [3]
    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
    assert checkpoint_and_paths is None
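
The objects returned by get_checkpoints_to_test (and by get_checkpoint_from_epoch further below) are only exercised through two attributes: epoch and checkpoint_paths. A minimal sketch of such a container, written as a plain dataclass; the class name and the example path are illustrative assumptions, not the actual InnerEye type:

from dataclasses import dataclass
from pathlib import Path
from typing import List


@dataclass
class CheckpointAndPaths:
    """Illustrative container: one epoch plus the checkpoint files to test for it."""
    epoch: int
    checkpoint_paths: List[Path]


# Hypothetical usage mirroring the assertions above:
result = CheckpointAndPaths(epoch=2,
                            checkpoint_paths=[Path("outputs/checkpoints/epoch_2.ckpt")])
assert result.epoch == 2
assert len(result.checkpoint_paths) == 1
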
Example #2
 def get_path_to_checkpoint(self, epoch: int) -> Path:
     """
     Returns full path to a checkpoint given an epoch
     :param epoch: the epoch number
     :return: path to a checkpoint given an epoch
     """
     return create_checkpoint_path(self.checkpoint_folder, epoch=epoch)
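
All of these examples build expected paths through create_checkpoint_path. For orientation, a hedged sketch of what a helper with this signature could look like; the file-name pattern and the mean-teacher prefix below are assumptions for illustration only, not the actual InnerEye implementation:

from pathlib import Path


def create_checkpoint_path_sketch(path: Path, epoch: int,
                                  for_mean_teacher_model: bool = False) -> Path:
    # Assumed naming scheme: "<epoch>_checkpoint.pth.tar", optionally prefixed for
    # the mean teacher model. The real helper may use a different file name.
    prefix = "mean_teacher_" if for_mean_teacher_model else ""
    return Path(path) / f"{prefix}{epoch}_checkpoint.pth.tar"
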
Example #3
 def get_checkpoint_paths(
         self,
         epoch: int,
         for_mean_teacher_model: bool = False) -> List[Path]:
     return [
         create_checkpoint_path(x, epoch, for_mean_teacher_model)
         for x in self.checkpoints_roots
     ]
Example #4
 def get_path_to_checkpoint(self, epoch: int) -> Path:
     """
     Returns full path to a checkpoint given an epoch
     :param epoch: the epoch number
     :return: path to a checkpoint given an epoch
     """
     return create_checkpoint_path(
         path=fixed_paths.repository_root_directory() /
         self.checkpoint_folder,
         epoch=epoch)
Example #5
def test_create_inference_pipeline(config: ModelConfigBase,
                                   checkpoint_folder: str,
                                   inference_type: type,
                                   ensemble_type: type,
                                   test_output_dirs: OutputFolderForTests) -> None:
    config.set_output_to(test_output_dirs.root_dir)
    # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder.
    stored_checkpoints = full_ml_test_data_path(checkpoint_folder)
    shutil.copytree(str(stored_checkpoints), str(config.checkpoint_folder))

    checkpoint_path = create_checkpoint_path(stored_checkpoints, epoch=1)

    assert isinstance(create_inference_pipeline(config, [checkpoint_path]), inference_type)
    assert isinstance(create_inference_pipeline(config, [checkpoint_path] * 2), ensemble_type)
Example #6
 def get_checkpoint_paths(self, epoch: int) -> List[Path]:
     return [create_checkpoint_path(x, epoch) for x in self.checkpoints_roots]
Example #7
def test_recover_testing_from_run_recovery(
        mean_teacher_model: bool,
        test_output_dirs: OutputFolderForTests) -> None:
    """
    Checks that inference results are the same whether from a checkpoint in the same run, from a run recovery or from a
    local_weights_path param.
    """
    # Train for 4 epochs
    config = DummyClassification()
    if mean_teacher_model:
        config.mean_teacher_alpha = 0.999
    config.set_output_to(test_output_dirs.root_dir / "original")
    os.makedirs(str(config.outputs_folder))
    config.save_start_epoch = 2
    config.save_step_epochs = 2

    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)
    train_results = model_train(config, checkpoint_handler=checkpoint_handler)
    assert len(train_results.learning_rates_per_epoch) == config.num_epochs

    # Run inference on this
    test_results = model_test(config=config,
                              data_split=ModelExecutionMode.TEST,
                              checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)
    assert list(test_results.epochs.keys()) == [config.num_epochs]

    # Mimic using a run recovery and see if it is the same
    config_run_recovery = DummyClassification()
    if mean_teacher_model:
        config_run_recovery.mean_teacher_alpha = 0.999
    config_run_recovery.set_output_to(test_output_dirs.root_dir /
                                      "run_recovery")
    os.makedirs(str(config_run_recovery.outputs_folder))

    checkpoint_handler_run_recovery = get_default_checkpoint_handler(
        model_config=config_run_recovery,
        project_root=test_output_dirs.root_dir)
    # make it seem like run recovery objects have been downloaded
    checkpoint_root = config_run_recovery.checkpoint_folder / "recovered"
    shutil.copytree(str(config.checkpoint_folder), str(checkpoint_root))
    checkpoint_handler_run_recovery.run_recovery = RunRecovery(
        [checkpoint_root])
    test_results_run_recovery = model_test(
        config_run_recovery,
        data_split=ModelExecutionMode.TEST,
        checkpoint_handler=checkpoint_handler_run_recovery)
    assert isinstance(test_results_run_recovery,
                      InferenceMetricsForClassification)
    assert list(test_results_run_recovery.epochs.keys()) == [config.num_epochs]
    assert test_results.epochs[config.num_epochs].values()[MetricType.CROSS_ENTROPY.value] == \
           test_results_run_recovery.epochs[config.num_epochs].values()[MetricType.CROSS_ENTROPY.value]

    # Run inference with the local checkpoints
    config_local_weights = DummyClassification()
    if mean_teacher_model:
        config_local_weights.mean_teacher_alpha = 0.999
    config_local_weights.set_output_to(test_output_dirs.root_dir /
                                       "local_weights_path")
    os.makedirs(str(config_local_weights.outputs_folder))

    local_weights_path = test_output_dirs.root_dir / "local_weights_file.pth"
    shutil.copyfile(
        str(
            create_checkpoint_path(config.checkpoint_folder,
                                   epoch=config.num_epochs)),
        local_weights_path)
    config_local_weights.local_weights_path = local_weights_path

    checkpoint_handler_local_weights = get_default_checkpoint_handler(
        model_config=config_local_weights,
        project_root=test_output_dirs.root_dir)
    checkpoint_handler_local_weights.discover_and_download_checkpoints_from_previous_runs()
    test_results_local_weights = model_test(
        config_local_weights,
        data_split=ModelExecutionMode.TEST,
        checkpoint_handler=checkpoint_handler_local_weights)
    assert isinstance(test_results_local_weights,
                      InferenceMetricsForClassification)
    assert list(test_results_local_weights.epochs.keys()) == [0]
    assert test_results.epochs[config.num_epochs].values()[MetricType.CROSS_ENTROPY.value] == \
           test_results_local_weights.epochs[0].values()[MetricType.CROSS_ENTROPY.value]

Example #8
def test_get_recovery_path_train(
        test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    assert checkpoint_handler.get_recovery_path_train() is None

    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()

    # We have not set a start_epoch, but we are trying to use run_recovery, so this should fail
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Run recovery set, but start epoch is 0" in ex.value.args[0]

    # Run recovery with start epoch provided should succeed
    config.start_epoch = 20
    expected_path = create_checkpoint_path(
        path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1],
        epoch=config.start_epoch)
    assert checkpoint_handler.get_recovery_path_train() == expected_path

    # set an ensemble run as recovery - not supported
    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Found more than one checkpoint for epoch" in ex.value.args[0]

    # weights from local_weights_path and weights_url will be modified if needed and stored at this location
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE

    # Set a weights_url to get checkpoint from
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Can't resume training from an external checkpoint
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Start epoch is > 0, but no run recovery object has been provided to resume training." in ex.value.args[0]

    # Set a local_weights_path to get checkpoint from
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(
        full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Can't resume training from an external checkpoint
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Start epoch is > 0, but no run recovery object has been provided to resume training." in ex.value.args[0]
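
A note on the pytest.raises pattern used in this test: ex.value is only populated once the with block has exited, so assertions on the exception message must come after the block, not inside it. A minimal self-contained illustration with generic names (not tied to the checkpoint handler):

import pytest


def raise_start_epoch_error() -> None:
    raise ValueError("Start epoch is > 0, but no run recovery object has been provided to resume training.")


def test_error_message_is_checked() -> None:
    with pytest.raises(ValueError) as ex:
        raise_start_epoch_error()
    # ex.value becomes available only after the with block exits.
    assert "no run recovery object" in ex.value.args[0]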

Example #9
def test_discover_and_download_checkpoints_from_previous_runs(
        test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()

    # No checkpoint handling options set.
    checkpoint_handler = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert not checkpoint_handler.run_recovery
    assert not checkpoint_handler.local_weights_path

    # Set a run recovery object - non-ensemble
    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()

    expected_checkpoint_root = config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(
        ":")[1]
    expected_paths = [
        create_checkpoint_path(path=expected_checkpoint_root, epoch=epoch)
        for epoch in [1, 2, 3, 4, 20]
    ]
    assert checkpoint_handler.run_recovery
    assert checkpoint_handler.run_recovery.checkpoints_roots == [
        expected_checkpoint_root
    ]
    for path in expected_paths:
        assert path.is_file()

    # Set a run recovery object - ensemble
    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()

    expected_checkpoint_roots = [
        config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / str(i)
        for i in range(3)
    ]
    expected_path_lists = [[
        create_checkpoint_path(path=expected_checkpoint_root, epoch=epoch)
        for epoch in [1, 2]
    ] for expected_checkpoint_root in expected_checkpoint_roots]
    assert set(checkpoint_handler.run_recovery.checkpoints_roots) == set(
        expected_checkpoint_roots)
    for path_list in expected_path_lists:
        for path in path_list:
            assert path.is_file()

    # weights from local_weights_path and weights_url will be modified if needed and stored at this location
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE

    # Set a weights_path
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    assert checkpoint_handler.local_weights_path.is_file()

    # set a local_weights_path
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(
        path=full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
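
Taken together, the assertions above imply how the checkpoint folder is laid out after discovery: a single-run recovery lands in a subfolder named after the run part of the recovery id, while an ensemble recovery creates one numbered subfolder per child run. A short sketch of that path arithmetic with placeholder values (the real test derives these from the model config and the constants it imports):

from pathlib import Path

checkpoint_folder = Path("outputs/checkpoints")   # placeholder
run_recovery_id = "experiment_name:run_id"        # placeholder "<experiment>:<run>"
OTHER_RUNS_SUBDIR_NAME = "OTHER_RUNS"             # assumed value of the constant

# Single-run recovery: checkpoints go under a folder named after the run id part.
single_run_root = checkpoint_folder / run_recovery_id.split(":")[1]

# Ensemble recovery: one numbered subfolder per child run.
ensemble_roots = [checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / str(i) for i in range(3)]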

Example #10
def test_get_checkpoint_from_epoch(
        test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    manage_recovery = get_default_checkpoint_handler(
        model_config=config, project_root=test_output_dirs.root_dir)

    # We have not set a run_recovery, nor have we trained, so this should fail to get a checkpoint
    with pytest.raises(ValueError) as ex:
        manage_recovery.get_checkpoint_from_epoch(1)
    assert "no run recovery object provided and no training has been done in this run" in ex.value.args[0]

    # We have set a run_recovery_id now, so this should work
    manage_recovery.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    expected_checkpoint = create_checkpoint_path(
        path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1],
        epoch=1)
    checkpoint = manage_recovery.get_checkpoint_from_epoch(1)
    assert checkpoint
    assert len(checkpoint.checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint.checkpoint_paths[0]
    assert checkpoint.epoch == 1

    # ensemble run recovery
    manage_recovery.azure_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    expected_checkpoints = [
        create_checkpoint_path(path=config.checkpoint_folder /
                               OTHER_RUNS_SUBDIR_NAME / str(i),
                               epoch=1) for i in range(3)
    ]
    checkpoint = manage_recovery.get_checkpoint_from_epoch(1)
    assert checkpoint
    assert len(checkpoint.checkpoint_paths) == 3
    assert set(expected_checkpoints) == set(checkpoint.checkpoint_paths)
    assert checkpoint.epoch == 1

    # From now on, the checkpoint handler will think that the run was started from epoch 1, i.e. we should use the
    # run recovery checkpoint for epoch 1 and the training run checkpoint for epoch 2
    manage_recovery.additional_training_done()
    # Go back to non-ensemble run recovery
    manage_recovery.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()

    config.start_epoch = 1
    # We haven't actually done a training run, so the checkpoint for epoch 2 is missing - and we should not use the one
    # from run recovery
    assert manage_recovery.get_checkpoint_from_epoch(2) is None

    # Should work for epoch 1
    checkpoint = manage_recovery.get_checkpoint_from_epoch(1)
    expected_checkpoint = create_checkpoint_path(
        path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1],
        epoch=1)
    assert checkpoint
    assert len(checkpoint.checkpoint_paths) == 1
    assert checkpoint.checkpoint_paths[0] == expected_checkpoint
    assert checkpoint.epoch == 1

    # Copy over checkpoints to make it look like training has happened
    stored_checkpoint = create_checkpoint_path(
        path=full_ml_test_data_path("checkpoints"), epoch=1)
    expected_checkpoint = create_checkpoint_path(path=config.checkpoint_folder,
                                                 epoch=2)
    shutil.copyfile(str(stored_checkpoint), str(expected_checkpoint))

    # Should now work for epoch 2
    checkpoint = manage_recovery.get_checkpoint_from_epoch(2)
    assert checkpoint
    assert len(checkpoint.checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint.checkpoint_paths[0]
    assert checkpoint.epoch == 2