Code example #1
def test_download_checkpoints(test_output_dirs: OutputFolderForTests, is_ensemble: bool,
                              runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    assert get_results_blob_path("some_run_id") == "azureml/ExperimentRun/dcid.some_run_id"
    # Any recent run ID from a PR build will do. Use a PR build because the checkpoint files are small there.
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)

    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID if is_ensemble else DEFAULT_RUN_RECOVERY_ID
    run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config)
    run_to_recover = fetch_run(workspace=runner_config.get_workspace(), run_recovery_id=runner_config.run_recovery_id)
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    if is_ensemble:
        child_runs = fetch_child_runs(run_to_recover)
        expected_files = [config.checkpoint_folder
                          / OTHER_RUNS_SUBDIR_NAME
                          / str(x.get_tags()['cross_validation_split_index']) / expected_checkpoint_file
                          for x in child_runs]
    else:
        expected_files = [config.checkpoint_folder / run_to_recover.id / expected_checkpoint_file]

    checkpoint_paths = run_recovery.get_checkpoint_paths(1)
    if is_ensemble:
        assert len(run_recovery.checkpoints_roots) == len(expected_files)
        assert all([(x in [y.parent for y in expected_files]) for x in run_recovery.checkpoints_roots])
        assert len(checkpoint_paths) == len(expected_files)
        assert all([x in expected_files for x in checkpoint_paths])
    else:
        assert len(checkpoint_paths) == 1
        assert checkpoint_paths[0] == expected_files[0]

    assert all([expected_file.exists() for expected_file in expected_files])
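
The assertion on get_results_blob_path at the top of the test pins down the blob layout that AzureML uses for run outputs. A minimal sketch of a function satisfying that assertion (everything beyond the asserted string is an assumption):

def get_results_blob_path(run_id: str) -> str:
    # The test above fixes the expected layout: "azureml/ExperimentRun/dcid.<run_id>"
    return f"azureml/ExperimentRun/dcid.{run_id}"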
Code example #2
    def download_checkpoints_from_recovery_run(azure_config: AzureConfig,
                                               config: DeepLearningConfig,
                                               run_context: Optional[Run] = None) -> RunRecovery:
        """
        Downloads the checkpoints of the run corresponding to the run_recovery_id in azure_config, and the
        checkpoints of any child runs if they exist.

        :param azure_config: Azure related configs.
        :param config: Model related configs.
        :param run_context: Context of the current run (will be used to find the target AML workspace)
        :return: run recovery information
        """
        run_context = run_context or RUN_CONTEXT
        workspace = azure_config.get_workspace()

        # Find the run to recover in AML workspace
        if not azure_config.run_recovery_id:
            raise ValueError("A valid run_recovery_id is required to download recovery checkpoints, found None")

        run_to_recover = fetch_run(workspace, azure_config.run_recovery_id.strip())
        # Handle recovery of a HyperDrive cross validation run (from within a successor HyperDrive run,
        # not in ensemble creation). In this case, run_recovery_id refers to the parent prior run, so we
        # need to set run_to_recover to the child of that run whose split index is the same as that of
        # the current (child) run.
        if is_cross_validation_child_run(run_context):
            run_to_recover = next(x for x in fetch_child_runs(run_to_recover) if
                                  get_cross_validation_split_index(x) == get_cross_validation_split_index(run_context))

        return RunRecovery.download_checkpoints_from_run(config, run_to_recover)
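
A hedged usage sketch of the method above, assuming the InnerEye modules from these snippets are importable; the run recovery id is a placeholder:

# Sketch only: AzureConfig is normally populated from settings files.
azure_config = AzureConfig()
azure_config.run_recovery_id = "my_experiment:my_run_id"  # placeholder id
model_config = ModelConfigBase(should_validate=False)
run_recovery = RunRecovery.download_checkpoints_from_recovery_run(azure_config, model_config)
print(run_recovery.get_checkpoint_paths(1))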
Code example #3
def download_all_checkpoints_from_run(config: OutputParams, run: Run,
                                      subfolder: Optional[str] = None,
                                      only_return_path: bool = False) -> RunRecovery:
    """
    Downloads all checkpoints of the provided run inside the checkpoints folder.
    :param config: Model related configs.
    :param run: Run whose checkpoints should be recovered
    :param subfolder: Optional subfolder name. If provided, the checkpoints are downloaded to
    CHECKPOINT_FOLDER / subfolder. If None, the checkpoints are downloaded to the CHECKPOINT_FOLDER of the current run.
    :param only_return_path: If True, return a RunRecovery object with the checkpoint path without actually
    downloading the checkpoints. This is useful to avoid duplicate downloads when running on multiple
    nodes. If False, download the checkpoints to disk and return the RunRecovery object.
    :return: run recovery information
    """
    if fetch_child_runs(run):
        raise ValueError(f"AzureML run {run.id} has child runs, this method does not support those.")

    destination_folder = config.checkpoint_folder / subfolder if subfolder else config.checkpoint_folder

    if not only_return_path:
        download_run_outputs_by_prefix(
            blobs_prefix=Path(CHECKPOINT_FOLDER),
            destination=destination_folder,
            run=run
        )
    time.sleep(60)  # Needed because AML is not fast enough to download
    return RunRecovery(checkpoints_roots=[destination_folder])
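
The only_return_path flag exists so that, in multi-node training, only one process actually performs the download. A sketch of the intended call pattern; how the rank is obtained is an assumption, and `config` and `run` are assumed to be in scope as in the function above:

import os

global_rank = int(os.environ.get("RANK", "0"))  # hypothetical: rank from the distributed launcher
run_recovery = download_all_checkpoints_from_run(
    config, run, only_return_path=(global_rank != 0))
# Every rank ends up with the same checkpoints_roots; only rank 0 touched the network.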
Code example #4
def download_best_checkpoints_from_child_runs(config: OutputParams, run: Run) -> RunRecovery:
    """
    Downloads the best checkpoints from all child runs of the provided Hyperdrive parent run.
    The checkpoints for the sibling runs will go into folder 'OTHER_RUNS/<cross_validation_split>'
    in the checkpoint folder. The child run that is equal to the present AzureML run is treated specially:
    its checkpoints are read from the checkpoint folder as-is, without downloading.
    :param config: Model related configs.
    :param run: The Hyperdrive parent run to download from.
    :return: run recovery information
    """
    child_runs: List[Run] = fetch_child_runs(run)
    if not child_runs:
        raise ValueError(f"AzureML run {run.id} does not have any child runs.")
    logging.info(f"Run {run.id} has {len(child_runs)} child runs: {', '.join(c.id for c in child_runs)}")
    tag_to_use = 'cross_validation_split_index'
    can_use_split_indices = tag_values_all_distinct(child_runs, tag_to_use)
    # download checkpoints for the child runs in the root of the parent
    child_runs_checkpoints_roots: List[Path] = []
    for child in child_runs:
        if child.id == RUN_CONTEXT.id:
            # We expect to find the file(s) we need in config.checkpoint_folder
            child_dst = config.checkpoint_folder
        else:
            subdir = str(child.tags[tag_to_use] if can_use_split_indices else child.number)
            child_dst = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / subdir
            download_run_output_file(
                blob_path=Path(CHECKPOINT_FOLDER) / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
                destination=child_dst,
                run=child
            )
        child_runs_checkpoints_roots.append(child_dst)
    return RunRecovery(checkpoints_roots=child_runs_checkpoints_roots)
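
tag_values_all_distinct guards the use of the cross_validation_split_index tags as folder names. A minimal sketch of what such a helper could look like, assuming only Run.get_tags() from the AzureML SDK (the real implementation may differ):

from typing import List
from azureml.core import Run

def tag_values_all_distinct(runs: List[Run], tag: str) -> bool:
    # True only if every run carries the tag and no two runs share the same value.
    values = [run.get_tags().get(tag) for run in runs]
    return None not in values and len(set(values)) == len(values)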
Code example #5
 def wait_until_cross_val_splits_are_ready_for_aggregation(self) -> bool:
     """
     Checks if all child runs (except the current run) of the current run's parent are completed or failed.
     If this is the case, then we can aggregate the results of the other runs before terminating this run.
     :return: whether we need to wait, i.e. whether some runs are still pending.
     """
     if (not self.model_config.is_offline_run) \
             and (azure_util.is_cross_validation_child_run(RUN_CONTEXT)):
         n_splits = self.model_config.get_total_number_of_cross_validation_runs(
         )
         child_runs = azure_util.fetch_child_runs(
             PARENT_RUN_CONTEXT,
             expected_number_cross_validation_splits=n_splits)
         pending_runs = [
             x.id for x in child_runs if (x.id != RUN_CONTEXT.id) and
             (x.get_status() not in [RunStatus.COMPLETED, RunStatus.FAILED])
         ]
         should_wait = len(pending_runs) > 0
         if should_wait:
             logging.info(
                 f"Waiting for sibling run(s) to finish: {pending_runs}")
         return should_wait
     else:
         raise NotImplementedError(
             "cross_val_splits_are_ready_for_aggregation is implemented for online "
             "cross validation runs only")
Code example #6
 def are_sibling_runs_finished(self) -> bool:
     """
     Checks if all child runs (except the current run) of the current run's parent are completed or failed.
     :return: True if all sibling runs of the current run have finished (they either completed successfully,
     or failed). False if any of them is still pending (running or queued).
     """
     if (not self.model_config.is_offline_run) \
             and (azure_util.is_cross_validation_child_run(RUN_CONTEXT)):
         n_splits = self.model_config.get_total_number_of_cross_validation_runs(
         )
         child_runs = azure_util.fetch_child_runs(
             PARENT_RUN_CONTEXT,
             expected_number_cross_validation_splits=n_splits)
         pending_runs = [
             x.id for x in child_runs if (x.id != RUN_CONTEXT.id) and
             (x.get_status() not in [RunStatus.COMPLETED, RunStatus.FAILED])
         ]
         all_runs_finished = len(pending_runs) == 0
         if not all_runs_finished:
             logging.info(
                 f"Waiting for sibling run(s) to finish: {pending_runs}")
         return all_runs_finished
     else:
         raise NotImplementedError(
             "are_sibling_runs_finished only works for cross validation runs in AzureML."
         )
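
A sketch of how a caller could poll this method before aggregating results; the `runner` instance and the 30-second interval are illustrative:

import time

while not runner.are_sibling_runs_finished():  # hypothetical runner instance
    time.sleep(30)  # arbitrary polling interval
# All sibling runs have now completed or failed; safe to aggregate their results.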
Code example #7
 def download_checkpoints_from_run(
         azure_config: AzureConfig,
         config: ModelConfigBase,
         run: Run,
         output_subdir_name: Optional[str] = None) -> RunRecovery:
     """
     Downloads checkpoints of the provided run or, if applicable, its children.
     :param azure_config: Azure related configs.
     :param config: Model related configs.
      :param run: Run whose checkpoints should be recovered
      :param output_subdir_name: Optional folder name. If provided, checkpoints are downloaded into a sibling
      folder of the checkpoints folder with this name, with one subfolder per child run.
      :return: run recovery information
     """
     child_runs: List[Run] = fetch_child_runs(run)
     logging.debug(f"Run has ID {run.id} and initial child runs are:")
     for child_run in child_runs:
         logging.debug(f"     {child_run.id}")
     checkpoint_subdir_name: Optional[str]
     if output_subdir_name:
         # From e.g. parent_dir/checkpoints we want parent_dir/output_subdir_name, to which we will
         # append split_index / checkpoints below to create child_dst.
         checkpoint_path = Path(config.checkpoint_folder)
         parent_path = checkpoint_path.parent
         checkpoint_subdir_name = checkpoint_path.name
         root_output_dir = parent_path / output_subdir_name
     else:
         root_output_dir = Path(config.checkpoint_folder) / run.id
         checkpoint_subdir_name = None
     # download checkpoints for the run
     download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                               destination=root_output_dir,
                               run=run)
     if len(child_runs) > 0:
         tag_to_use = 'cross_validation_split_index'
         can_use_split_indices = tag_values_all_distinct(
             child_runs, tag_to_use)
         # download checkpoints for the child runs in the root of the parent
         child_runs_checkpoints_roots: List[Path] = []
         for child in child_runs:
             if child.id == RUN_CONTEXT.id:
                 # We expect to find the file(s) we need in config.checkpoint_folder
                 child_dst = Path(config.checkpoint_folder)
             else:
                 subdir = str(child.tags[tag_to_use]
                              if can_use_split_indices else child.number)
                 if checkpoint_subdir_name:
                     child_dst = root_output_dir / subdir / checkpoint_subdir_name
                 else:
                     child_dst = root_output_dir / subdir
                 download_outputs_from_run(
                     blobs_path=Path(CHECKPOINT_FOLDER),
                     destination=child_dst,
                     run=child)
             child_runs_checkpoints_roots.append(child_dst)
         return RunRecovery(checkpoints_roots=child_runs_checkpoints_roots)
     else:
         return RunRecovery(checkpoints_roots=[root_output_dir])
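
To make the path arithmetic above concrete, a worked example with illustrative values for config.checkpoint_folder and output_subdir_name:

from pathlib import Path

checkpoint_path = Path("outputs/checkpoints")             # illustrative checkpoint folder
root_output_dir = checkpoint_path.parent / "recovered"    # output_subdir_name="recovered"
child_dst = root_output_dir / "0" / checkpoint_path.name  # child run with split index 0
assert child_dst.as_posix() == "outputs/recovered/0/checkpoints"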
Code example #8
def get_first_child_run(azure_config: AzureConfig) -> Run:
    """
    Fetches the first completed child run of the run given by azure_config.run_recovery_id, so that data
    can be downloaded from it.
    :param azure_config: Azure related configs.
    :return: first child run
    """
    if not azure_config.run_recovery_id:
        raise ValueError("azure_config.run_recovery_id is not provided.")
    hyperdrive_run = azure_config.fetch_run(azure_config.run_recovery_id)
    child_runs = fetch_child_runs(hyperdrive_run, status=RunStatus.COMPLETED)
    return child_runs[0]
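
A sketch of pairing the helper with download_run_output_file from the earlier snippets; the blob path and destination are placeholders:

from pathlib import Path

first_child = get_first_child_run(azure_config)
download_run_output_file(
    blob_path=Path("dataset.csv"),   # placeholder blob name
    destination=Path("downloaded"),  # placeholder local folder
    run=first_child)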
Code example #9
def test_is_cross_validation_child_run_ensemble_run() -> None:
    """
    Test that cross validation child runs are identified correctly.
    """
    # check for offline run
    assert not is_cross_validation_child_run(Run.get_context())
    # check for online runs
    run = get_most_recent_run(
        fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    assert not is_cross_validation_child_run(run)
    assert all(
        [is_cross_validation_child_run(x) for x in fetch_child_runs(run)])
Code example #10
def test_download_checkpoints_hyperdrive_run(test_output_dirs: OutputFolderForTests,
                                             runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    child_runs = fetch_child_runs(run=fetch_run(runner_config.get_workspace(), DEFAULT_ENSEMBLE_RUN_RECOVERY_ID))
    # recover child runs separately also to test hyperdrive child run recovery functionality
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    for child in child_runs:
        expected_files = [config.checkpoint_folder / child.id / expected_checkpoint_file]
        run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config, child)
        assert all([x in expected_files for x in run_recovery.get_checkpoint_paths(epoch=1)])
        assert all([expected_file.exists() for expected_file in expected_files])
Code example #11
def test_get_cross_validation_split_index_ensemble_run() -> None:
    """
    Test that retrieved cross validation split index is as expected, for ensembles.
    """
    # check for offline run
    assert get_cross_validation_split_index(
        Run.get_context()) == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX
    # check for online runs
    run = get_most_recent_run(
        fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    assert get_cross_validation_split_index(
        run) == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX
    assert all([
        get_cross_validation_split_index(x) >
        DEFAULT_CROSS_VALIDATION_SPLIT_INDEX for x in fetch_child_runs(run)
    ])
Code example #12
def test_is_cross_validation_child_run(is_ensemble: bool,
                                       is_numeric: bool) -> None:
    """
    Test that cross validation child runs are identified correctly.
    """
    if is_ensemble:
        rid = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID_NUMERIC if is_numeric else DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    else:
        rid = DEFAULT_RUN_RECOVERY_ID_NUMERIC if is_numeric else DEFAULT_RUN_RECOVERY_ID
    run = fetch_run(workspace=get_default_workspace(), run_recovery_id=rid)
    # check for offline run
    assert not is_cross_validation_child_run(Run.get_context())
    # check for online runs
    assert not is_cross_validation_child_run(run)
    if is_ensemble:
        assert all(
            [is_cross_validation_child_run(x) for x in fetch_child_runs(run)])
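
The is_ensemble and is_numeric arguments would normally be supplied by pytest parametrization; a sketch of the decorators such a test typically carries (an assumption, not shown in the snippet):

import pytest

@pytest.mark.parametrize("is_ensemble", [True, False])
@pytest.mark.parametrize("is_numeric", [True, False])
def test_is_cross_validation_child_run(is_ensemble: bool, is_numeric: bool) -> None:
    ...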
Code example #13
def test_get_cross_validation_split_index(is_ensemble: bool) -> None:
    """
    Test that retrieved cross validation split index is as expected, for single runs and ensembles.
    """
    run = fetch_run(workspace=get_default_workspace(),
                    run_recovery_id=DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
                    if is_ensemble else DEFAULT_RUN_RECOVERY_ID)
    # check for offline run
    assert get_cross_validation_split_index(
        Run.get_context()) == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX
    # check for online runs
    assert get_cross_validation_split_index(
        run) == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX
    if is_ensemble:
        assert all([
            get_cross_validation_split_index(x) >
            DEFAULT_CROSS_VALIDATION_SPLIT_INDEX for x in fetch_child_runs(run)
        ])
Code example #14
 def download_checkpoints_from_run(config: DeepLearningConfig,
                                   run: Run) -> RunRecovery:
     """
     Downloads checkpoints of the provided run or, if applicable, its children.
     When downloading from a run that does not have sibling runs, a single folder inside the checkpoints folder
     will be created that contains the downloaded checkpoints.
     When downloading from a run that has sibling runs, the checkpoints for the sibling runs will go into
     folder 'OTHER_RUNS/<cross_validation_split>'
     :param config: Model related configs.
     :param run: Run whose checkpoints should be recovered
     :return: run recovery information
     """
      # TODO antonsc: Clarify how we handle the case of multiple checkpoints being downloaded.
     child_runs: List[Run] = fetch_child_runs(run)
     if child_runs:
         logging.info(f"Run has ID {run.id}, child runs: {', '.join(c.id for c in child_runs)}")
         tag_to_use = 'cross_validation_split_index'
         can_use_split_indices = tag_values_all_distinct(child_runs, tag_to_use)
         # download checkpoints for the child runs in the root of the parent
         child_runs_checkpoints_roots: List[Path] = []
         for child in child_runs:
             if child.id == RUN_CONTEXT.id:
                 # We expect to find the file(s) we need in config.checkpoint_folder
                 child_dst = config.checkpoint_folder
             else:
                 subdir = str(child.tags[tag_to_use] if can_use_split_indices else child.number)
                 child_dst = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / subdir
                 download_outputs_from_run(
                     blobs_path=Path(CHECKPOINT_FOLDER),
                     destination=child_dst,
                     run=child
                 )
             child_runs_checkpoints_roots.append(child_dst)
         return RunRecovery(checkpoints_roots=child_runs_checkpoints_roots)
     else:
         logging.info(f"Run with ID {run.id} has no child runs")
         root_output_dir = config.checkpoint_folder / run.id
         # download checkpoints for the run
         download_outputs_from_run(
             blobs_path=Path(CHECKPOINT_FOLDER),
             destination=root_output_dir,
             run=run
         )
         return RunRecovery(checkpoints_roots=[root_output_dir])
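
The two branches above produce the following on-disk layouts (run and split ids are placeholders):

# Run without child runs:
#     <checkpoint_folder>/<run_id>/<checkpoint files>
# HyperDrive run with children, viewed from outside any child run:
#     <checkpoint_folder>/OTHER_RUNS/<split_index>/<checkpoint files>
# The child run equal to the current run contributes <checkpoint_folder> itself, with no download.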
Code example #15
    def download_all_checkpoints_from_run(config: DeepLearningConfig,
                                          run: Run) -> RunRecovery:
        """
        Downloads all checkpoints of the provided run: The best checkpoint and the recovery checkpoint.
        A single folder inside the checkpoints folder will be created that contains the downloaded checkpoints.
        :param config: Model related configs.
        :param run: Run whose checkpoints should be recovered
        :return: run recovery information
        """
        if fetch_child_runs(run):
            raise ValueError(
                f"AzureML run {run.id} has child runs, this method does not support those."
            )

        root_output_dir = config.checkpoint_folder / run.id
        download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                                  destination=root_output_dir,
                                  run=run)
        return RunRecovery(checkpoints_roots=[root_output_dir])
Code example #16
    def download_all_checkpoints_from_run(
            config: OutputParams,
            run: Run,
            subfolder: Optional[str] = None) -> RunRecovery:
        """
        Downloads all checkpoints of the provided run inside the checkpoints folder.
        :param config: Model related configs.
        :param run: Run whose checkpoints should be recovered
        :param subfolder: Optional subfolder name. If provided, the checkpoints are downloaded to
        CHECKPOINT_FOLDER / subfolder. If None, the checkpoints are downloaded to the CHECKPOINT_FOLDER of the current run.
        :return: run recovery information
        """
        if fetch_child_runs(run):
            raise ValueError(
                f"AzureML run {run.id} has child runs, this method does not support those."
            )

        destination_folder = config.checkpoint_folder / subfolder if subfolder else config.checkpoint_folder

        download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                                  destination=destination_folder,
                                  run=run)
        time.sleep(60)  # Needed because AML is not fast enough to download
        return RunRecovery(checkpoints_roots=[destination_folder])
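
A usage sketch of the subfolder argument, to keep recovered checkpoints apart from those the current run writes itself; this assumes, like the other snippets, that the method is static on RunRecovery, and the folder name is illustrative:

run_recovery = RunRecovery.download_all_checkpoints_from_run(
    config, run, subfolder="recovered")
# The checkpoints land in config.checkpoint_folder / "recovered".
print(run_recovery.checkpoints_roots)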
Code example #17
def download_crossval_result_files(config: PlotCrossValidationConfig,
                                   run_recovery_id: Optional[str] = None,
                                   epoch: Optional[int] = None,
                                   download_to_folder: Optional[Path] = None,
                                   splits_to_evaluate: Optional[List[str]] = None) -> Tuple[List[RunResultFiles], Path]:
    """
    Given an AzureML run, downloads all files that are necessary for doing an analysis of cross validation runs.
    It downloads the metrics.csv file for each dataset split (Test, Val) and for all of the run's children.
    When running in segmentation mode, it also downloads the dataset.csv and adds the institutionId and seriesId
    information for each subject found in the metrics files.
    :param config: PlotCrossValidationConfig
    :param run_recovery_id: run recovery ID, if different from the one in config
    :param epoch: epoch, if different from the one in config
    :param download_to_folder: The root folder in which all downloaded files should be stored. Point to an existing
    folder with downloaded files for use in unit tests. If not provided, the files will be downloaded to a new folder
    inside the config.outputs_directory, with the name taken from the run ID.
    :param splits_to_evaluate: If supplied, use these values as the split indices to download. Use only for
    unit testing.
    :return: A list of RunResultFiles with all of the downloaded results, grouped by execution mode (Test or Val),
     and the directory where the epoch results were downloaded to.
    """
    splits_to_evaluate = splits_to_evaluate or []
    if run_recovery_id is None:
        run_recovery_id = config.run_recovery_id
    if epoch is None:
        epoch = config.epoch
    if run_recovery_id:
        workspace = config.azure_config.get_workspace()
        parent = fetch_run(workspace, run_recovery_id)
        runs_to_evaluate = fetch_child_runs(
            run=parent, expected_number_cross_validation_splits=config.number_of_cross_validation_splits)
        logging.info("Adding parent run to the list of runs to evaluate.")
        runs_to_evaluate.append(parent)
        logging.info(f"Will evaluate results for runs: {[x.id for x in runs_to_evaluate]}")
    else:
        runs_to_evaluate = []
    # create the root path to store the outputs
    if not download_to_folder:
        download_to_folder = Path(config.outputs_directory) / CROSSVAL_RESULTS_FOLDER
        # Make the folder if it doesn't exist, but preserve any existing contents.
        download_to_folder.mkdir(parents=True, exist_ok=True)
    start_time = time.time()
    logging.info(f"Starting to download files for cross validation analysis to: {download_to_folder}")
    assert download_to_folder is not None
    result: List[RunResultFiles] = []
    loop_over: List[Tuple[Optional[Run], str, str, Optional[str]]]
    if splits_to_evaluate:
        loop_over = [(None, split, split, "") for split in splits_to_evaluate]
    else:
        loop_over = []
        for run in runs_to_evaluate:
            tags = run.get_tags()
            if is_parent_run(run):
                split_index = ENSEMBLE_SPLIT_NAME
            else:
                split_index = get_split_id(tags, config.is_zero_index)
            split_suffix = split_index
            # Value to put in the "Split" column in the result.
            run_recovery_id = tags[RUN_RECOVERY_ID_KEY]
            loop_over.append((run, split_index, split_suffix, run_recovery_id))

    for run, split_index, split_suffix, run_recovery_id in loop_over:
        if run is not None:
            config.get_short_name(run)
        config.local_run_result_split_suffix = split_suffix
        # When run is the parent run, we need to look on the local disc.
        # If (as expected) dataset.csv is not already present, we copy it from the top of the outputs directory.
        folder_for_run = download_to_folder / split_suffix
        dataset_file: Optional[Path]
        if is_parent_run(run):
            folder_for_run.mkdir(parents=True, exist_ok=True)
            dataset_file = folder_for_run / DATASET_CSV_FILE_NAME
            # Copy the run-0 dataset.csv, which should be the same, as the parent run won't have one.
            shutil.copy(str(Path(config.outputs_directory) / DATASET_CSV_FILE_NAME), str(dataset_file))
        else:
            dataset_file = config.download_or_get_local_file(run, DATASET_CSV_FILE_NAME, folder_for_run)
        if config.model_category == ModelCategory.Segmentation and not dataset_file:
            raise ValueError(f"Dataset file must be present for segmentation models, but is missing for run {run.id}")
        # Get metrics files.
        for mode in config.execution_modes_to_download():
            # download metrics.csv file for each split. metrics_file can be None if the file does not exist
            # (for example, if no output was written for execution mode Test)
            metrics_file = download_metrics_file(config, run, folder_for_run, epoch, mode)
            if metrics_file:
                result.append(RunResultFiles(execution_mode=mode,
                                             dataset_csv_file=dataset_file,
                                             metrics_file=metrics_file,
                                             run_recovery_id=run_recovery_id,
                                             split_index=split_index))
    elapsed = time.time() - start_time
    logging.info(f"Finished downloading files. Total time to download: {elapsed:0.2f}sec")
    return result, download_to_folder
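
A sketch of consuming the return value; the fields printed are exactly those that RunResultFiles is constructed with above:

result_files, root_folder = download_crossval_result_files(config)
for files in result_files:
    print(files.execution_mode, files.split_index, files.metrics_file)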