def download_checkpoints_from_run(azure_config: AzureConfig,
                                  config: ModelConfigBase,
                                  run: Run,
                                  output_subdir_name: Optional[str] = None) -> RunRecovery:
    """
    Downloads checkpoints of the provided run or, if applicable, its children.
    :param azure_config: Azure related configs.
    :param config: Model related configs.
    :param run: Run whose checkpoints should be recovered.
    :param output_subdir_name: If provided, checkpoints are downloaded into this sibling folder of the
    checkpoint folder (i.e. parent_dir/output_subdir_name), instead of into a subfolder of the checkpoint
    folder named after the run.
    :return: run recovery information
    """
    child_runs: List[Run] = fetch_child_runs(run)
    logging.debug(f"Run has ID {run.id} and initial child runs are:")
    for child_run in child_runs:
        logging.debug(f"   {child_run.id}")
    checkpoint_subdir_name: Optional[str]
    if output_subdir_name:
        # From e.g. parent_dir/checkpoints we want parent_dir/output_subdir_name, to which we will
        # append split_index / checkpoints below to create child_dst.
        checkpoint_path = Path(config.checkpoint_folder)
        parent_path = checkpoint_path.parent
        checkpoint_subdir_name = checkpoint_path.name
        root_output_dir = parent_path / output_subdir_name
    else:
        root_output_dir = Path(config.checkpoint_folder) / run.id
        checkpoint_subdir_name = None
    # Download checkpoints for the run itself.
    download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                              destination=root_output_dir,
                              run=run)
    if len(child_runs) > 0:
        tag_to_use = 'cross_validation_split_index'
        can_use_split_indices = tag_values_all_distinct(child_runs, tag_to_use)
        # Download checkpoints for the child runs into the root folder of the parent.
        child_runs_checkpoints_roots: List[Path] = []
        for child in child_runs:
            if child.id == RUN_CONTEXT.id:
                # We expect to find the file(s) we need in config.checkpoint_folder
                child_dst = Path(config.checkpoint_folder)
            else:
                subdir = str(child.tags[tag_to_use] if can_use_split_indices else child.number)
                if checkpoint_subdir_name:
                    child_dst = root_output_dir / subdir / checkpoint_subdir_name
                else:
                    child_dst = root_output_dir / subdir
            download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                                      destination=child_dst,
                                      run=child)
            child_runs_checkpoints_roots.append(child_dst)
        return RunRecovery(checkpoints_roots=child_runs_checkpoints_roots)
    else:
        return RunRecovery(checkpoints_roots=[root_output_dir])
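
# Usage sketch (not part of the original module; the run recovery ID and subfolder name
# below are hypothetical): recover the checkpoints of a cross-validation run so that
# each split lands in parent_dir/recovered/<split_index>/<checkpoint folder name>.
def _example_recover_cross_val_checkpoints(azure_config: AzureConfig,
                                           config: ModelConfigBase) -> RunRecovery:
    run = fetch_run(azure_config.get_workspace(), "HD_0123_abcd")  # hypothetical run recovery ID
    return download_checkpoints_from_run(azure_config, config, run, output_subdir_name="recovered")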
def get_comparison_baselines(outputs_folder: Path,
                             azure_config: AzureConfig,
                             comparison_blob_storage_paths: List[Tuple[str, str]]) -> List[ComparisonBaseline]:
    """
    Downloads the dataset.csv and metrics.csv files referenced by each of the given blob storage paths,
    and returns their contents as ComparisonBaseline objects.
    :param outputs_folder: Folder into which the downloaded files are written.
    :param azure_config: Azure related configs, used to get the AzureML workspace.
    :param comparison_blob_storage_paths: List of (name, path) pairs, where each path is
    "<run recovery ID>/<blob path>", optionally prefixed with an experiment name and a colon.
    :return: A list of ComparisonBaseline objects for all paths whose data could be downloaded.
    """
    workspace = azure_config.get_workspace()
    comparison_baselines: List[ComparisonBaseline] = []
    for (comparison_name, comparison_path) in comparison_blob_storage_paths:
        # Discard the experiment part of the run recovery ID, if any.
        comparison_path = comparison_path.split(":")[-1]
        run_rec_id, blob_path_str = comparison_path.split("/", 1)
        run_rec_id = strip_prefix(run_rec_id, AZUREML_RUN_FOLDER_PREFIX)
        blob_path = Path(strip_prefix(blob_path_str, DEFAULT_AML_UPLOAD_DIR + "/"))
        run = fetch_run(workspace, run_rec_id)
        # We usually find dataset.csv in the same directory as metrics.csv, but we sometimes
        # have to look higher up.
        comparison_dataset_path: Optional[Path] = None
        comparison_metrics_path: Optional[Path] = None
        destination_folder = outputs_folder / run_rec_id / blob_path
        # Look for dataset.csv inside epoch_NNN/Test, epoch_NNN/ and at top level
        for blob_path_parent in step_up_directories(blob_path):
            try:
                comparison_dataset_path = download_outputs_from_run(
                    blob_path_parent / DATASET_CSV_FILE_NAME, destination_folder, run, True)
                break
            except ValueError:
                logging.warning(f"cannot find {DATASET_CSV_FILE_NAME} at {blob_path_parent} in {run_rec_id}")
            except NotADirectoryError:
                logging.warning(f"{blob_path_parent} is not a directory")
                break
        if comparison_dataset_path is None:
            logging.warning(f"cannot find {DATASET_CSV_FILE_NAME} at or above {blob_path} in {run_rec_id}")
        # Look for epoch_NNN/Test/metrics.csv
        try:
            comparison_metrics_path = download_outputs_from_run(
                blob_path / METRICS_FILE_NAME, destination_folder, run, True)
        except ValueError:
            logging.warning(f"cannot find {METRICS_FILE_NAME} at {blob_path} in {run_rec_id}")
        # If both dataset.csv and metrics.csv were downloaded successfully, read their contents and
        # add a tuple to the comparison data.
        if comparison_dataset_path is not None and comparison_metrics_path is not None and \
                comparison_dataset_path.exists() and comparison_metrics_path.exists():
            comparison_baselines.append(ComparisonBaseline(
                comparison_name,
                pd.read_csv(comparison_dataset_path),
                pd.read_csv(comparison_metrics_path),
                run_rec_id))
        else:
            logging.warning(f"could not find comparison data for run {run_rec_id}")
            for key, path in ("dataset", comparison_dataset_path), ("metrics", comparison_metrics_path):
                logging.warning(f"path to {key} data is {path}")
                # noinspection PyUnresolvedReferences
                if path is not None and not path.exists():
                    logging.warning("  ... but it does not exist")
    return comparison_baselines
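
# Usage sketch (the run ID and blob path are hypothetical, and DEFAULT_AML_UPLOAD_DIR is
# assumed to be "outputs"): each entry of comparison_blob_storage_paths is a (name, path)
# pair whose path has the form "<experiment>:<run recovery ID>/outputs/<folder holding
# metrics.csv>"; the experiment prefix before the colon is optional and is discarded.
def _example_get_baselines(outputs_folder: Path, azure_config: AzureConfig) -> List[ComparisonBaseline]:
    paths = [("baseline", "my_experiment:my_run_1234/outputs/epoch_120/Test")]  # hypothetical
    return get_comparison_baselines(outputs_folder, azure_config, paths)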
def download_or_get_local_file(self,
                               run: Optional[Run],
                               blob_to_download: PathOrString,
                               destination: Path,
                               local_src_subdir: Optional[Path] = None) -> Optional[Path]:
    """
    Downloads a file from the results folder of an AzureML run, or copies it from a local results folder.
    Returns the path to the downloaded file if it exists, or None if the file was not found.
    If blob_to_download contains folders, the same folder structure will be created inside the destination
    folder. For example, downloading "foo.txt" to "/c/temp" will create "/c/temp/foo.txt". Downloading
    "foo/bar.txt" to "/c/temp" will create "/c/temp/foo/bar.txt".
    :param run: The AzureML run to download from.
    :param blob_to_download: The path of the data to download, relative to the run's output folder.
    :param destination: The directory to write to.
    :param local_src_subdir: If not None and copying from a local results folder, the source is
    self.outputs_directory/local_src_subdir/blob_to_download instead of
    self.outputs_directory/blob_to_download.
    :return: The path to the downloaded file, or None if the file was not found.
    """
    blob_path = Path(blob_to_download)
    blob_parent = blob_path.parent
    if blob_parent != Path("."):
        destination = destination / blob_parent
    downloaded_file = destination / blob_path.name
    # If we have already downloaded the data, leave it as it is.
    if downloaded_file.exists():
        logging.info(f"Download of '{blob_path}' to '{downloaded_file}': not needed, already exists")
        return downloaded_file
    logging.info(f"Download of '{blob_path}' to '{downloaded_file}': proceeding")
    if not destination.exists():
        destination.mkdir(parents=True)
    # If the provided run is the current run (or a local/offline run), there is nothing to download.
    # Just copy the file from the local outputs directory to the destination.
    if run is None or Run.get_context().id == run.id or is_parent_run(run) or is_offline_run_context(run):
        if run is None:
            assert self.local_run_results is not None, "Local run results must be set in unit testing"
            local_src = Path(self.local_run_results)
            if self.local_run_result_split_suffix:
                local_src = local_src / self.local_run_result_split_suffix
        else:
            local_src = Path(self.outputs_directory)
        if local_src_subdir is not None:
            local_src = local_src / local_src_subdir
        local_src = local_src / blob_path
        if local_src.exists():
            logging.info(f"Copying files from {local_src} to {destination}")
            return Path(shutil.copy(local_src, destination))
        return None
    else:
        try:
            return download_outputs_from_run(blobs_path=blob_path,
                                             destination=destination,
                                             run=run,
                                             is_file=True)
        except Exception as ex:
            logging.warning(f"File {blob_to_download} not found in output of run {run.id}: {ex}")
            return None
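
# Usage sketch (the blob name is hypothetical, and `handler` stands for an instance of the
# class that defines download_or_get_local_file): fetch a single metrics file, either copied
# from the current run's local outputs or downloaded from a completed AzureML run.
def _example_fetch_metrics_file(handler, run: Optional[Run], temp_dir: Path) -> Optional[Path]:
    return handler.download_or_get_local_file(run=run,
                                              blob_to_download="epoch_010/Test/metrics.csv",
                                              destination=temp_dir)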
def download_checkpoints_from_run(config: DeepLearningConfig, run: Run) -> RunRecovery:
    """
    Downloads checkpoints of the provided run or, if applicable, its children.
    When downloading from a run that does not have sibling runs, a single folder inside the checkpoints
    folder will be created that contains the downloaded checkpoints.
    When downloading from a run that has sibling runs, the checkpoints for the sibling runs will go into
    folder 'OTHER_RUNS/<cross_validation_split>'.
    :param config: Model related configs.
    :param run: Run whose checkpoints should be recovered.
    :return: run recovery information
    """
    # TODO antonsc: Clarify how we handle the case of multiple checkpoints being downloaded.
    child_runs: List[Run] = fetch_child_runs(run)
    if child_runs:
        logging.info(f"Run has ID {run.id}, child runs: {', '.join(c.id for c in child_runs)}")
        tag_to_use = 'cross_validation_split_index'
        can_use_split_indices = tag_values_all_distinct(child_runs, tag_to_use)
        # Download checkpoints for the child runs into the checkpoint folder of the parent.
        child_runs_checkpoints_roots: List[Path] = []
        for child in child_runs:
            if child.id == RUN_CONTEXT.id:
                # We expect to find the file(s) we need in config.checkpoint_folder
                child_dst = config.checkpoint_folder
            else:
                subdir = str(child.tags[tag_to_use] if can_use_split_indices else child.number)
                child_dst = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / subdir
            download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                                      destination=child_dst,
                                      run=child)
            child_runs_checkpoints_roots.append(child_dst)
        return RunRecovery(checkpoints_roots=child_runs_checkpoints_roots)
    else:
        logging.info(f"Run with ID {run.id} has no child runs")
        root_output_dir = config.checkpoint_folder / run.id
        # Download the checkpoints of the run itself.
        download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                                  destination=root_output_dir,
                                  run=run)
        return RunRecovery(checkpoints_roots=[root_output_dir])
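
# Usage sketch (no new names are assumed): download and then enumerate the recovered
# checkpoint roots. For a Hyperdrive run with children, the roots are
# config.checkpoint_folder/OTHER_RUNS/<split> (or config.checkpoint_folder itself for the
# child that equals the current run); for a plain run, a single root
# config.checkpoint_folder/<run id>.
def _example_list_recovered_roots(config: DeepLearningConfig, run: Run) -> None:
    recovery = download_checkpoints_from_run(config, run)
    for root in recovery.checkpoints_roots:
        logging.info(f"Recovered checkpoints in {root}")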
def download_all_checkpoints_from_run(config: DeepLearningConfig, run: Run) -> RunRecovery:
    """
    Downloads all checkpoints of the provided run: the best checkpoint and the recovery checkpoint.
    A single folder inside the checkpoints folder will be created that contains the downloaded checkpoints.
    :param config: Model related configs.
    :param run: Run whose checkpoints should be recovered.
    :return: run recovery information
    """
    if fetch_child_runs(run):
        raise ValueError(f"AzureML run {run.id} has child runs, this method does not support those.")
    root_output_dir = config.checkpoint_folder / run.id
    download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                              destination=root_output_dir,
                              run=run)
    return RunRecovery(checkpoints_roots=[root_output_dir])
def download_best_checkpoints_from_child_runs(config: DeepLearningConfig, run: Run) -> RunRecovery:
    """
    Downloads the best checkpoints from all child runs of the provided Hyperdrive parent run.
    The checkpoints for the sibling runs will go into folder 'OTHER_RUNS/<cross_validation_split>'
    in the checkpoint folder. There is special treatment for the child run that is equal to the present
    AzureML run: its checkpoints will be read off the checkpoint folder as-is.
    :param config: Model related configs.
    :param run: The Hyperdrive parent run to download from.
    :return: run recovery information
    """
    child_runs: List[Run] = fetch_child_runs(run)
    if not child_runs:
        raise ValueError(f"AzureML run {run.id} does not have any child runs.")
    logging.info(f"Run {run.id} has {len(child_runs)} child runs: {', '.join(c.id for c in child_runs)}")
    tag_to_use = 'cross_validation_split_index'
    can_use_split_indices = tag_values_all_distinct(child_runs, tag_to_use)
    # Download the best checkpoint of each child run into the checkpoint folder of the parent.
    child_runs_checkpoints_roots: List[Path] = []
    for child in child_runs:
        if child.id == RUN_CONTEXT.id:
            # We expect to find the file(s) we need in config.checkpoint_folder
            child_dst = config.checkpoint_folder
        else:
            subdir = str(child.tags[tag_to_use] if can_use_split_indices else child.number)
            child_dst = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / subdir
        download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER) / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
                                  destination=child_dst,
                                  run=child,
                                  is_file=True)
        child_runs_checkpoints_roots.append(child_dst)
    return RunRecovery(checkpoints_roots=child_runs_checkpoints_roots)
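
# Usage sketch (no new names are assumed): pick the right helper depending on whether the
# run is a Hyperdrive parent, using the two-argument download_all_checkpoints_from_run
# variant defined above for plain runs.
def _example_download_best_or_all(config: DeepLearningConfig, run: Run) -> RunRecovery:
    if fetch_child_runs(run):
        return download_best_checkpoints_from_child_runs(config, run)
    return download_all_checkpoints_from_run(config, run)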
def download_all_checkpoints_from_run(config: OutputParams, run: Run,
                                      subfolder: Optional[str] = None) -> RunRecovery:
    """
    Downloads all checkpoints of the provided run inside the checkpoints folder.
    :param config: Model related configs.
    :param run: Run whose checkpoints should be recovered.
    :param subfolder: Optional subfolder name. If provided, the checkpoints are downloaded to
    CHECKPOINT_FOLDER / subfolder; if None, they are downloaded to the CHECKPOINT_FOLDER of the current run.
    :return: run recovery information
    """
    if fetch_child_runs(run):
        raise ValueError(f"AzureML run {run.id} has child runs, this method does not support those.")
    destination_folder = config.checkpoint_folder / subfolder if subfolder else config.checkpoint_folder
    download_outputs_from_run(blobs_path=Path(CHECKPOINT_FOLDER),
                              destination=destination_folder,
                              run=run)
    time.sleep(60)  # Wait because AML may not have finished writing all downloaded files to disk yet.
    return RunRecovery(checkpoints_roots=[destination_folder])
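
# Usage sketch (the subfolder name "run_recovery" is hypothetical): download the checkpoints
# of a previous run into a dedicated subfolder of the checkpoint folder, keeping them
# separate from the checkpoints written by the current run.
def _example_download_into_subfolder(config: OutputParams, run: Run) -> RunRecovery:
    return download_all_checkpoints_from_run(config, run, subfolder="run_recovery")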
def download_pytest_result(run: Run, destination_folder: Path = Path.cwd()) -> Path:
    """
    Downloads the pytest result file that is stored in the output folder of the given AzureML run.
    If there is no pytest result file, raises a ValueError.
    :param run: The run from which the files should be read.
    :param destination_folder: The folder into which the pytest result file is downloaded.
    :return: The path (folder and filename) of the downloaded file.
    """
    logging.info(f"Downloading pytest result file: {PYTEST_RESULTS_FILE}")
    try:
        return download_outputs_from_run(PYTEST_RESULTS_FILE,
                                         destination=destination_folder,
                                         run=run,
                                         is_file=True)
    except Exception:
        raise ValueError(f"No pytest result file {PYTEST_RESULTS_FILE} was found for run {run.id}")
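
# Usage sketch (no new names are assumed): fetch the pytest results of a completed run into
# the current working directory and log where the file landed.
def _example_fetch_pytest_results(run: Run) -> None:
    result_file = download_pytest_result(run)
    logging.info(f"Pytest results are in {result_file}")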
def load_predictions(run_type: SurfaceDistanceRunType,
                     azure_config: AzureConfig,
                     model_config: SegmentationModelBase,
                     execution_mode: ModelExecutionMode,
                     extended_annotators: List[str],
                     outlier_range: float) -> List[Segmentation]:
    """
    For each run type (IOV or outliers), instantiate a list of predicted Segmentations and return them.
    :param run_type: Either SurfaceDistanceRunType.IOV or SurfaceDistanceRunType.OUTLIERS.
    :param azure_config: Azure related configs.
    :param model_config: Model related configs.
    :param execution_mode: ModelExecutionMode: Either Test, Train or Val.
    :param extended_annotators: List of annotators plus model_name to load segmentations for.
    :param outlier_range: The number of standard deviations below the mean that a Dice score must fall
    to be considered an outlier.
    :return: A list of Segmentation objects.
    """
    predictions = []
    if run_type == SurfaceDistanceRunType.OUTLIERS:
        first_child_run = sd_util.get_first_child_run(azure_config)
        output_dir = sd_util.get_run_output_dir(azure_config, model_config)
        metrics_path = sd_util.get_metrics_path(azure_config, model_config)

        # Load the downloaded metrics CSV as a dataframe and determine the worst performing outliers
        # for the given execution mode.
        df = load_csv(metrics_path, [MetricsFileColumns.Patient.value, MetricsFileColumns.Structure.value])
        test_run_df = df[df['mode'] == execution_mode.value]
        worst_performers = get_worst_performing_outliers(test_run_df,
                                                         outlier_range,
                                                         MetricsFileColumns.Dice.value,
                                                         max_n_outliers=-50)

        for (subject_id, structure_name, dice_score, _) in worst_performers:
            subject_prefix = sd_util.get_subject_prefix(model_config, execution_mode, subject_id)
            # If not already present, download the data for this subject.
            download_outputs_from_run(blobs_path=subject_prefix,
                                      destination=output_dir,
                                      run=first_child_run)
            # Check that it has been downloaded.
            segmentation_path = output_dir / subject_prefix / f"{structure_name}.nii.gz"
            predictions.append(Segmentation(structure_name=structure_name,
                                            subject_id=subject_id,
                                            segmentation_path=segmentation_path,
                                            dice_score=float(dice_score)))
    elif run_type == SurfaceDistanceRunType.IOV:
        subject_id = 0
        iov_dir = Path("outputs") / SurfaceDistanceRunType.IOV.value.lower()
        all_structs = model_config.class_and_index_with_background()
        structs_to_plot = [struct_name for struct_name in all_structs.keys()
                           if struct_name not in ['background', 'external']]
        for annotator in extended_annotators:
            for struct_name in structs_to_plot:
                segmentation_path = iov_dir / f"{struct_name + annotator}.nii.gz"
                if not segmentation_path.is_file():
                    logging.warning(f"No such file {segmentation_path}")
                    continue
                predictions.append(Segmentation(structure_name=struct_name,
                                                subject_id=subject_id,
                                                segmentation_path=segmentation_path,
                                                annotator=annotator))
    return predictions
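
# Usage sketch (argument values are illustrative, and ModelExecutionMode.TEST is assumed to
# be the Test member of the execution mode enum): load the worst-performing outlier
# segmentations for the Test set, flagging Dice scores more than 3 standard deviations
# below the mean.
def _example_load_outlier_predictions(azure_config: AzureConfig,
                                      model_config: SegmentationModelBase) -> List[Segmentation]:
    return load_predictions(run_type=SurfaceDistanceRunType.OUTLIERS,
                            azure_config=azure_config,
                            model_config=model_config,
                            execution_mode=ModelExecutionMode.TEST,
                            extended_annotators=[],
                            outlier_range=3.0)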