def download_checkpoints_from_hyperdrive_child_runs(self, hyperdrive_parent_run: Run) -> None:
    """
    Download the best checkpoints from every child run of a Hyperdrive parent run.
    The results are gathered for ensemble creation, and the resulting recovery object
    is stored in self.run_recovery.

    :param hyperdrive_parent_run: The Hyperdrive parent run whose child runs provide the checkpoints.
    :raises NotADirectoryError: If any of the downloaded checkpoint roots is not a directory.
    """
    recovery = RunRecovery.download_best_checkpoints_from_child_runs(
        self.model_config,
        hyperdrive_parent_run,
    )
    self.run_recovery = recovery
    # Sanity check: every checkpoint root reported by the recovery object must be a directory.
    for path in self.run_recovery.checkpoints_roots:
        if path.is_dir():
            continue
        raise NotADirectoryError(f"Does not exist or is not a directory: {path}")
def test_download_best_checkpoints_ensemble_run(test_output_dirs: OutputFolderForTests,
                                                runner_config: AzureConfig) -> None:
    """
    Check that downloading the best checkpoints from the child runs of an ensemble run
    creates the folder for other runs with sub folders "0" and "1", and that every best
    checkpoint path returned by the recovery object is an existing file.
    """
    root_dir = test_output_dirs.root_dir
    model_config = ModelConfigBase(should_validate=False)
    model_config.set_output_to(root_dir)
    parent_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    run_recovery = RunRecovery.download_best_checkpoints_from_child_runs(model_config, parent_run)

    # Checkpoints of the child runs are expected below the dedicated sub folder of the checkpoint folder.
    child_runs_root = model_config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME
    assert child_runs_root.is_dir()
    for child_index in ("0", "1"):
        child_folder = child_runs_root / child_index
        assert child_folder.is_dir(), "Child run folder does not exist"

    for checkpoint in run_recovery.get_best_checkpoint_paths():
        assert checkpoint.is_file(), f"File {checkpoint} does not exist"