def test_config_comparison() -> PlotCrossValidationConfig:
    """
    Create a cross validation plotting config that compares child run 0 of the most
    recent run against its sibling child run 1.

    :return: A PlotCrossValidationConfig for a Segmentation model at epoch 1.
    """
    # Fetch the run ID once and derive both child run IDs from it: the original called
    # get_most_recent_run_id() twice (it may hit AzureML), and test_add_comparison_data
    # already uses the single-fetch pattern.
    most_recent_run = get_most_recent_run_id()
    return PlotCrossValidationConfig(
        run_recovery_id=most_recent_run + "_0",
        epoch=1,
        comparison_run_recovery_ids=[most_recent_run + "_1"],
        model_category=ModelCategory.Segmentation)
def test_get_checkpoints_to_test_single_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    Checkpoints to test for a single run: with only a run recovery available, the handler
    points at the (to-be-downloaded) last checkpoint; once a local last checkpoint exists,
    that same path is returned.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    # Attach a run recovery ID and mark the handler as having trained in this run.
    checkpoint_handler.azure_config.run_recovery_id = \
        get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    checkpoint_handler.additional_training_done()
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    last_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test()
    assert checkpoints_to_test
    assert len(checkpoints_to_test) == 1
    assert checkpoints_to_test[0] == last_checkpoint
    # Create the checkpoint file so it looks as if training has produced one; the handler
    # must still return exactly that path.
    last_checkpoint.touch()
    checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test()
    assert checkpoints_to_test
    assert len(checkpoints_to_test) == 1
    assert checkpoints_to_test[0] == last_checkpoint
def test_download_checkpoints_from_single_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    Recovery checkpoints for a single (non-ensemble) run are downloaded into a subfolder
    of the checkpoint folder named after the run.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    # Handler starts with no checkpoint handling options set.
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    # Point the handler at a non-ensemble run and trigger the download.
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    assert checkpoint_handler.run_recovery
    # The subfolder is named after the part of the recovery ID following the colon.
    recovery_root = config.checkpoint_folder / run_recovery_id.split(":")[1]
    assert checkpoint_handler.run_recovery.checkpoints_roots == [recovery_root]
    # Both the recovery checkpoint and the best checkpoint must have been downloaded.
    for downloaded in [create_recovery_checkpoint_path(path=recovery_root),
                       recovery_root / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]:
        assert downloaded.is_file()
def test_is_completed_single_run() -> None:
    """
    Test if we can correctly check run status for a single run.
    :return:
    """
    logging_to_stdout()
    workspace = get_default_workspace()
    run_id = get_most_recent_run_id()
    get_run_and_check(run_id, True, workspace)
def test_is_completed_ensemble_run() -> None:
    """
    Test if we can correctly check run status and status of child runs for an ensemble run.
    :return:
    """
    logging_to_stdout()
    workspace = get_default_workspace()
    get_run_and_check(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN),
                      True,
                      workspace)
def test_get_comparison_data(test_output_dirs: OutputFolderForTests) -> None:
    """
    A single (name, path) comparison entry pointing at the most recent run's TEST outputs
    must yield exactly one baseline carrying that name.
    """
    azure_config = get_default_azure_config()
    baseline_name = "DefaultName"
    # Path inside the run: <run_id>/<upload dir>/<best epoch folder>/Test
    baseline_path = (f"{get_most_recent_run_id()}/{DEFAULT_AML_UPLOAD_DIR}/"
                     f"{BEST_EPOCH_FOLDER_NAME}/{ModelExecutionMode.TEST.value}")
    baselines = get_comparison_baselines(test_output_dirs.root_dir,
                                         azure_config,
                                         [(baseline_name, baseline_path)])
    assert len(baselines) == 1
    assert baselines[0].name == baseline_name
def test_download_recovery_checkpoints_from_ensemble_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    Downloading recovery checkpoints via a run recovery ID that points at an ensemble
    (Hyperdrive) run must be rejected with a ValueError mentioning the child runs.
    """
    config = ModelConfigBase(should_validate=False)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.download_recovery_checkpoints_or_weights()
    # Fix: check the exception instance (ex.value), not the ExceptionInfo wrapper. str(ex)
    # stringifies pytest's ExceptionInfo object, which is not guaranteed to contain the message.
    assert "has child runs" in str(ex.value)
def test_get_best_checkpoint_single_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    Exercise get_best_checkpoint through three stages for a single (non-ensemble) run:
    1) no run recovery and no training -> must raise;
    2) run recovery set -> best checkpoint comes from the downloaded run-recovery subfolder;
    3) after training has notionally happened and a local best checkpoint exists -> the local
       checkpoint wins over the run-recovery one.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    # We have not set a run_recovery, nor have we trained, so this should fail to get a checkpoint
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_best_checkpoint()
    assert "no run recovery object provided and no training has been done in this run" in ex.value.args[0]
    run_recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    # We have set a run_recovery_id now, so this should work: Should download all checkpoints that are available
    # in the run, into a subfolder of the checkpoint folder
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    # The downloaded best checkpoint lives in a subfolder named after the run (part after the colon).
    expected_checkpoint = config.checkpoint_folder / run_recovery_id.split(":")[1] \
                          / f"{BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX}"
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint_paths[0]
    # From now on, the checkpoint handler will think that the run was started from epoch 1. We should pick up
    # the best checkpoint from the current run, or from the run recovery if the best checkpoint is there
    # and so no checkpoints have been written in the resumed run.
    checkpoint_handler.additional_training_done()
    # go back to non ensemble run recovery
    checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    config.start_epoch = 1
    # There is no checkpoint in the current run - use the one from run_recovery
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    expected_checkpoint = config.checkpoint_folder / run_recovery_id.split(":")[1] \
                          / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert checkpoint_paths[0] == expected_checkpoint
    # Copy over checkpoints to make it look like training has happened and a better checkpoint written
    expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
    expected_checkpoint.touch()
    checkpoint_paths = checkpoint_handler.get_best_checkpoint()
    assert checkpoint_paths
    assert len(checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint_paths[0]
def test_get_recovery_path_train_single_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    After downloading recovery checkpoints for a single run, the path used to resume
    training is the recovery checkpoint inside the config's checkpoint folder.
    """
    # NOTE(review): a second function with this exact name is defined later in this file;
    # the later definition shadows this one at import time, so only one of the two is
    # collected by pytest. Confirm which version is current and rename/remove the other.
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    handler = get_default_checkpoint_handler(model_config=config,
                                             project_root=test_output_dirs.root_dir)
    handler.azure_config.run_recovery_id = \
        get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    handler.download_recovery_checkpoints_or_weights()
    # Run recovery with start epoch provided should succeed
    recovery_path = get_recovery_checkpoint_path(path=config.checkpoint_folder)
    assert recovery_path == handler.get_recovery_or_checkpoint_path_train()
def test_add_comparison_data() -> None:
    """
    add_comparison_data on metrics from two sibling child runs must report the focus run's
    split as the only focus split, and the merged metrics must contain exactly the focus
    and comparison splits.
    """
    base_run = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    crossval_config = PlotCrossValidationConfig(
        run_recovery_id=base_run + "_0",
        epoch=1,
        comparison_run_recovery_ids=[base_run + "_1"],
        model_category=ModelCategory.Segmentation)
    crossval_config.epoch = 2
    metrics_by_mode, _ = download_metrics(crossval_config)
    combined_metrics = pd.concat(list(metrics_by_mode.values()))
    all_metrics, focus_splits = add_comparison_data(crossval_config, combined_metrics)
    assert focus_splits == [crossval_config.run_recovery_id]
    assert set(all_metrics.split) == {crossval_config.run_recovery_id,
                                      crossval_config.comparison_run_recovery_ids[0]}
def test_get_recovery_path_train_single_run(test_output_dirs: OutputFolderForTests) -> None:
    """
    Resuming training from a single run's recovery checkpoints: without a start epoch the
    handler must raise; with a start epoch it returns the recovery checkpoint inside the
    downloaded per-run subfolder.
    """
    # NOTE(review): another function with this exact name appears earlier in this file; this
    # later definition shadows the earlier one at import time, so only one of the two is
    # collected by pytest. Confirm which version is current and rename/remove the other.
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    handler = get_default_checkpoint_handler(model_config=config,
                                             project_root=test_output_dirs.root_dir)
    recovery_id = get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    handler.azure_config.run_recovery_id = recovery_id
    handler.download_recovery_checkpoints_or_weights()
    # We have not set a start_epoch but we are trying to use run_recovery, this should fail
    with pytest.raises(ValueError) as ex:
        handler.get_recovery_path_train()
    assert "Run recovery set, but start epoch is 0" in ex.value.args[0]
    # Run recovery with start epoch provided should succeed
    config.start_epoch = 20
    expected_path = create_recovery_checkpoint_path(
        path=config.checkpoint_folder / recovery_id.split(":")[1])
    assert expected_path == handler.get_recovery_path_train()
def test_config() -> PlotCrossValidationConfig:
    """
    Create a cross validation plotting config for a Segmentation model, pointing at the
    most recent run at epoch 1.
    """
    return PlotCrossValidationConfig(
        run_recovery_id=get_most_recent_run_id(),
        epoch=1,
        model_category=ModelCategory.Segmentation)