Example 1
def download_metrics_file(config: PlotCrossValidationConfig,
                          run: Run,
                          destination: Path,
                          epoch: Optional[int],
                          mode: ModelExecutionMode) -> Optional[Path]:
    """
    Downloads a metrics.csv file from an Azure run (or local results), and stores it in a local folder.
    The metrics.csv file will be written into a subfolder named after the model execution mode.
    :param config: The cross validation configuration.
    :param run: The AzureML run to download from.
    :param destination: The folder to download into.
    :param epoch: The epoch that plot_cross_validation is running for. This is mandatory for segmentation models,
    and ignored for classification models.
    :param mode: The dataset split to read from.
    :return: The path to the local file, or None if no metrics.csv file was found.
    """
    # Set up the appropriate paths and readers for the metrics.
    if config.model_category == ModelCategory.Segmentation:
        if epoch is None:
            raise ValueError("Epoch must be provided in segmentation runs")
        src = get_epoch_results_path(epoch, mode) / METRICS_FILE_NAME
    else:
        src = Path(mode.value) / METRICS_FILE_NAME

    # download (or copy from local disc) subject level metrics for the given epoch
    local_src_subdir = Path(OTHER_RUNS_SUBDIR_NAME) / ENSEMBLE_SPLIT_NAME if is_parent_run(run) else None
    return config.download_or_get_local_file(
        blob_to_download=src,
        destination=destination,
        run=run,
        local_src_subdir=local_src_subdir)
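
A minimal usage sketch for this version, assuming the same imports as the listing; the run recovery ID, epoch, and config arguments are illustrative assumptions, not values from the original:

# Hypothetical usage: fetch a run and download its validation metrics for epoch 10.
config = PlotCrossValidationConfig(run_recovery_id="experiment:run_id",
                                   model_category=ModelCategory.Segmentation,
                                   epoch=10)
run = fetch_run(config.azure_config.get_workspace(), "experiment:run_id")
metrics_file = download_metrics_file(config, run, Path("downloaded_metrics"),
                                     epoch=10, mode=ModelExecutionMode.VAL)
if metrics_file is None:
    print("No metrics.csv was found for this run and mode.")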
Example 2
def download_metrics_file(config: PlotCrossValidationConfig, run: Run,
                          destination: Path,
                          mode: ModelExecutionMode) -> Optional[Path]:
    """
    Downloads a metrics.csv file from an Azure run (or local results), and stores it in a local folder.
    The metrics.csv file will be written into a subfolder named after the model execution mode.
    :param config: The cross validation configuration.
    :param run: The AzureML run to download from.
    :param destination: The folder to download into.
    :param mode: The dataset split to read from.
    :return: The path to the local file, or None if no metrics.csv file was found.
    """
    # Set up the appropriate paths and readers for the metrics.
    # For classification models:
    #           For train / val: we save metrics during training for all epochs in output / mode folder.
    #           For test / ensemble: we save metrics in get_epoch_results_path(mode) after running inference on the
    #           best epoch.
    # For segmentation models: we save all metrics in get_epoch_results_path(mode) after running inference on the
    #                          best epoch.
    # For all models metrics are gathered in the CROSS_VAL_FOLDER / mode

    is_ensemble_run = is_parent_run(run)
    local_src = None
    if config.model_category == ModelCategory.Segmentation or is_ensemble_run or mode == ModelExecutionMode.TEST:
        local_src = get_best_epoch_results_path(
            mode,
            model_proc=ModelProcessing.ENSEMBLE_CREATION
            if is_ensemble_run else ModelProcessing.DEFAULT).parent
    logging.info(f"Local_src contains {local_src}")
    src = Path(mode.value) / SUBJECT_METRICS_FILE_NAME
    return config.download_or_get_local_file(blob_to_download=src,
                                             destination=destination,
                                             run=run,
                                             local_src_subdir=local_src)
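
The same call with the epoch-free signature above; again a hedged sketch, with config values as illustrative assumptions. For a classification model, VAL metrics come straight from the outputs/<mode> folder, while TEST metrics are read from the best-epoch results path:

# Hypothetical usage: download VAL and TEST metrics for a classification run.
config = PlotCrossValidationConfig(run_recovery_id="experiment:run_id",
                                   model_category=ModelCategory.Classification)
run = fetch_run(config.azure_config.get_workspace(), "experiment:run_id")
val_metrics = download_metrics_file(config, run, Path("downloaded_metrics"),
                                    mode=ModelExecutionMode.VAL)
test_metrics = download_metrics_file(config, run, Path("downloaded_metrics"),
                                     mode=ModelExecutionMode.TEST)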
Example 3
def download_or_get_local_file(self,
                               run: Optional[Run],
                               blob_to_download: PathOrString,
                               destination: Path,
                               local_src_subdir: Optional[Path] = None) -> Optional[Path]:
    """
    Downloads a file from the results folder of an AzureML run, or copies it from a local results folder.
    Returns the path to the downloaded file if it exists, or None if the file was not found.
    If blob_to_download contains folders, the same folder structure will be created inside the destination folder.
    For example, downloading "foo.txt" to "/c/temp" will create "/c/temp/foo.txt". Downloading "foo/bar.txt"
    to "/c/temp" will create "/c/temp/foo/bar.txt".
    :param blob_to_download: The path of the data to download within the run.
    :param destination: The directory to write to.
    :param run: The AzureML run to download from.
    :param local_src_subdir: If not None, then if we copy from a local results folder, that folder is
    self.outputs_directory/local_src_subdir/blob_to_download instead of self.outputs_directory/blob_to_download.
    :return: The path to the downloaded file, or None if the file was not found.
    """
    blob_path = Path(blob_to_download)
    blob_parent = blob_path.parent
    if blob_parent != Path("."):
        destination = destination / blob_parent
    downloaded_file = destination / blob_path.name
    # If we've already downloaded the data, leave it as it is.
    if downloaded_file.exists():
        logging.info(f"Download of '{blob_path}' to '{downloaded_file}': not needed, already exists")
        return downloaded_file
    logging.info(f"Download of '{blob_path}' to '{downloaded_file}': proceeding")
    if not destination.exists():
        destination.mkdir(parents=True)
    # Copy from the local outputs folder if there is no run to download from, if the provided run is
    # the current run or its parent, or if we are running outside of AzureML. Otherwise, download.
    if run is None or Run.get_context().id == run.id or is_parent_run(run) or is_offline_run_context(run):
        if run is None:
            assert self.local_run_results is not None, "Local run results must be set in unit testing"
            local_src = Path(self.local_run_results)
            if self.local_run_result_split_suffix:
                local_src = local_src / self.local_run_result_split_suffix
        else:
            local_src = Path(self.outputs_directory)
        if local_src_subdir is not None:
            local_src = local_src / local_src_subdir
        local_src = local_src / blob_path
        if local_src.exists():
            logging.info(f"Copying files from {local_src} to {destination}")
            return Path(shutil.copy(local_src, destination))
        return None
    else:
        try:
            return download_outputs_from_run(
                blobs_path=blob_path,
                destination=destination,
                run=run,
                is_file=True
            )
        except Exception as ex:
            logging.warning(f"File {blob_to_download} not found in output of run {run.id}: {ex}")
            return None
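
To illustrate the folder handling described in the docstring, a hedged sketch of the local-copy branch; the config values and paths are assumptions for illustration. With run=None the function asserts that local_run_results is set and copies from there instead of downloading:

# Hypothetical call that exercises the local-copy branch.
config = PlotCrossValidationConfig(run_recovery_id="experiment:run_id")
config.local_run_results = "outputs"
downloaded = config.download_or_get_local_file(run=None,
                                               blob_to_download="foo/bar.txt",
                                               destination=Path("/c/temp"))
# If outputs/foo/bar.txt exists, this returns Path("/c/temp/foo/bar.txt"), recreating
# the "foo" subfolder under the destination; otherwise it returns None.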
Example 4
def download_crossval_result_files(config: PlotCrossValidationConfig,
                                   run_recovery_id: Optional[str] = None,
                                   epoch: Optional[int] = None,
                                   download_to_folder: Optional[Path] = None,
                                   splits_to_evaluate: Optional[List[str]] = None) -> Tuple[List[RunResultFiles], Path]:
    """
    Given an AzureML run, downloads all files that are necessary for doing an analysis of cross validation runs.
    It will download the metrics.csv file for each dataset split (Test, Val) and for all of the run's children.
    When running in segmentation mode, it also downloads the dataset.csv and adds the institutionId and seriesId
    information for each subject found in the metrics files.
    :param config: The cross validation configuration.
    :param run_recovery_id: The run recovery ID, if different from the one in config.
    :param epoch: The epoch, if different from the one in config.
    :param download_to_folder: The root folder in which all downloaded files should be stored. Point to an existing
    folder with downloaded files for use in unit tests. If not provided, the files will be downloaded to a new folder
    inside the config.outputs_directory, with the name taken from the run ID.
    :param splits_to_evaluate: If supplied, use these values as the split indices to download. Use only for
    unit testing.
    :return: A list of the downloaded result files, grouped by execution mode (Test or Val), and the
     directory that the epoch results were downloaded to.
    """
    splits_to_evaluate = splits_to_evaluate or []
    if run_recovery_id is None:
        run_recovery_id = config.run_recovery_id
    if epoch is None:
        epoch = config.epoch
    if run_recovery_id:
        workspace = config.azure_config.get_workspace()
        parent = fetch_run(workspace, run_recovery_id)
        runs_to_evaluate = fetch_child_runs(
            run=parent, expected_number_cross_validation_splits=config.number_of_cross_validation_splits)
        logging.info("Adding parent run to the list of runs to evaluate.")
        runs_to_evaluate.append(parent)
        logging.info(f"Will evaluate results for runs: {[x.id for x in runs_to_evaluate]}")
    else:
        runs_to_evaluate = []
    # Create the root folder to store the outputs.
    if not download_to_folder:
        download_to_folder = Path(config.outputs_directory) / CROSSVAL_RESULTS_FOLDER
        # Make the folder if it doesn't exist, but preserve any existing contents.
        download_to_folder.mkdir(parents=True, exist_ok=True)
    start_time = time.time()
    logging.info(f"Starting to download files for cross validation analysis to: {download_to_folder}")
    assert download_to_folder is not None
    result: List[RunResultFiles] = []
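    # Each entry is a tuple of (run, split_index, split_suffix, run_recovery_id).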
    loop_over: List[Tuple[Optional[Run], str, str, Optional[str]]]
    if splits_to_evaluate:
        loop_over = [(None, split, split, "") for split in splits_to_evaluate]
    else:
        loop_over = []
        for run in runs_to_evaluate:
            tags = run.get_tags()
            if is_parent_run(run):
                split_index = ENSEMBLE_SPLIT_NAME
            else:
                split_index = get_split_id(tags, config.is_zero_index)
            split_suffix = split_index
            # Value to put in the "Split" column in the result.
            run_recovery_id = tags[RUN_RECOVERY_ID_KEY]
            loop_over.append((run, split_index, split_suffix, run_recovery_id))

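    # For each run/split, fetch dataset.csv plus the metrics file of every execution mode.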
    for run, split_index, split_suffix, run_recovery_id in loop_over:
        if run is not None:
            config.get_short_name(run)
        config.local_run_result_split_suffix = split_suffix
        # When run is the parent run, we need to look on the local disc.
        # If (as expected) dataset.csv is not already present, we copy it from the top of the outputs directory.
        folder_for_run = download_to_folder / split_suffix
        dataset_file: Optional[Path]
        if is_parent_run(run):
            folder_for_run.mkdir(parents=True, exist_ok=True)
            dataset_file = folder_for_run / DATASET_CSV_FILE_NAME
            # Copy the run-0 dataset.csv, which should be the same, as the parent run won't have one.
            shutil.copy(str(Path(config.outputs_directory) / DATASET_CSV_FILE_NAME), str(dataset_file))
        else:
            dataset_file = config.download_or_get_local_file(run, DATASET_CSV_FILE_NAME, folder_for_run)
        if config.model_category == ModelCategory.Segmentation and not dataset_file:
            raise ValueError(f"Dataset file must be present for segmentation models, but is missing for run {run.id}")
        # Get metrics files.
        for mode in config.execution_modes_to_download():
            # download metrics.csv file for each split. metrics_file can be None if the file does not exist
            # (for example, if no output was written for execution mode Test)
            metrics_file = download_metrics_file(config, run, folder_for_run, epoch, mode)
            if metrics_file:
                result.append(RunResultFiles(execution_mode=mode,
                                             dataset_csv_file=dataset_file,
                                             metrics_file=metrics_file,
                                             run_recovery_id=run_recovery_id,
                                             split_index=split_index))
    elapsed = time.time() - start_time
    logging.info(f"Finished downloading files. Total time to download: {elapsed:0.2f}sec")
    return result, download_to_folder
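
Putting the pieces together, a hedged end-to-end sketch; the run recovery ID, epoch, and config arguments are illustrative assumptions:

# Hypothetical usage: download everything needed for a cross validation analysis,
# then list the metrics file that was found for each split and execution mode.
config = PlotCrossValidationConfig(run_recovery_id="experiment:parent_run_id",
                                   model_category=ModelCategory.Segmentation,
                                   epoch=10)
result_files, root_folder = download_crossval_result_files(config)
for files in result_files:
    print(f"Split {files.split_index}, {files.execution_mode}: {files.metrics_file}")
print(f"All files are under {root_folder}")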