Example #1
def get_comparison_baselines(outputs_folder: Path, azure_config: AzureConfig,
                             comparison_blob_storage_paths: List[Tuple[str, str]]) -> \
        List[ComparisonBaseline]:
    workspace = azure_config.get_workspace()
    comparison_baselines = []
    for (comparison_name, comparison_path) in comparison_blob_storage_paths:
        # Discard the experiment part of the run rec ID, if any.
        comparison_path = comparison_path.split(":")[-1]
        run_rec_id, blob_path_str = comparison_path.split("/", 1)
        run_rec_id = strip_prefix(run_rec_id, AZUREML_RUN_FOLDER_PREFIX)
        blob_path = Path(strip_prefix(blob_path_str, DEFAULT_AML_UPLOAD_DIR + "/"))
        run = fetch_run(workspace, run_rec_id)
        # We usually find dataset.csv in the same directory as metrics.csv, but we sometimes
        # have to look higher up.
        comparison_dataset_path: Optional[Path] = None
        comparison_metrics_path: Optional[Path] = None
        destination_folder = outputs_folder / run_rec_id / blob_path
        # Look for dataset.csv inside epoch_NNN/Test, epoch_NNN/ and at top level
        for blob_path_parent in step_up_directories(blob_path):
            try:
                comparison_dataset_path = download_outputs_from_run(
                    blob_path_parent / DATASET_CSV_FILE_NAME, destination_folder, run, True)
                break
            except ValueError:
                logging.warning(f"cannot find {DATASET_CSV_FILE_NAME} at {blob_path_parent} in {run_rec_id}")
                pass
            except NotADirectoryError:
                logging.warning(f"{blob_path_parent} is not a directory")
                break
        if comparison_dataset_path is None:
            logging.warning(f"cannot find {DATASET_CSV_FILE_NAME} at or above {blob_path} in {run_rec_id}")
        # Look for epoch_NNN/Test/metrics.csv
        try:
            comparison_metrics_path = download_outputs_from_run(
                blob_path / METRICS_FILE_NAME, destination_folder, run, True)
        except ValueError:
            logging.warning(f"cannot find {METRICS_FILE_NAME} at {blob_path} in {run_rec_id}")
        # If both dataset.csv and metrics.csv were downloaded successfully, read their contents and
        # add a tuple to the comparison data.
        if comparison_dataset_path is not None and comparison_metrics_path is not None and \
                comparison_dataset_path.exists() and comparison_metrics_path.exists():
            comparison_baselines.append(ComparisonBaseline(
                comparison_name,
                pd.read_csv(comparison_dataset_path),
                pd.read_csv(comparison_metrics_path),
                run_rec_id))
        else:
            logging.warning(f"could not find comparison data for run {run_rec_id}")
            for key, path in ("dataset", comparison_dataset_path), ("metrics", comparison_metrics_path):
                logging.warning(f"path to {key} data is {path}")
                # noinspection PyUnresolvedReferences
                if path is not None and not path.exists():
                    logging.warning("    ... but it does not exist")
    return comparison_baselines
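
The search loop above assumes a helper that yields the blob path followed by each of its parents, so that dataset.csv is looked for in epoch_NNN/Test, then epoch_NNN, then at the top level. A minimal sketch of a generator with that behaviour is shown below; it illustrates the assumed contract, not the project's actual step_up_directories implementation.

from pathlib import Path
from typing import Iterator


def step_up_directories_sketch(blob_path: Path) -> Iterator[Path]:
    # Illustrative stand-in for step_up_directories: yield the path itself,
    # then each ancestor, finishing with the top-level directory ".".
    current = blob_path
    while True:
        yield current
        parent = current.parent
        if parent == current:
            break
        current = parent


# For Path("epoch_005/Test") this yields epoch_005/Test, then epoch_005, then ".".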

Example #2
def get_first_child_run(azure_config: AzureConfig) -> Run:
    """
    Fetches the first completed child run of the run specified by azure_config.run_recovery_id,
    so that its data can be downloaded.
    :param azure_config: An AzureConfig object holding the run_recovery_id and workspace settings.
    :return: The first completed child run.
    """
    if not azure_config.run_recovery_id:
        raise ValueError("azure_config.run_recovery_id is not provided.")
    workspace = azure_config.get_workspace()
    hyperdrive_run = fetch_run(workspace, azure_config.run_recovery_id)
    child_runs = fetch_child_runs(hyperdrive_run, status=RunStatus.COMPLETED)
    return child_runs[0]
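
For context, the status-filtered child lookup performed by fetch_child_runs(hyperdrive_run, status=RunStatus.COMPLETED) can be approximated with plain AzureML SDK calls. A minimal sketch, assuming the parent HyperDrive run has already been fetched:

from typing import List

from azureml.core import Run
from azureml.core.run import RunStatus


def completed_child_runs_sketch(parent_run: Run) -> List[Run]:
    # Illustrative equivalent of fetch_child_runs(parent_run, status=RunStatus.COMPLETED):
    # enumerate the direct children and keep only those that completed successfully.
    return [child for child in parent_run.get_children()
            if child.get_status() == RunStatus.COMPLETED]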

Example #3
def monitor(monitor_config: AMLTensorBoardMonitorConfig,
            azure_config: AzureConfig) -> None:
    """
    Starts TensorBoard monitoring as per the provided arguments.
    :param monitor_config: The config containing information on which runs need to be monitored.
    :param azure_config: An AzureConfig object with secrets/keys to access the workspace.
    """
    # Fetch AzureML workspace and the experiment runs in it
    workspace = azure_config.get_workspace()

    if monitor_config.run_ids is not None:
        if len(monitor_config.run_ids) == 0:
            print("At least one run_recovery_id must be given for monitoring.")
            exit(-1)
        exp_runs = [
            azure_util.fetch_run(workspace, run_id)
            for run_id in monitor_config.run_ids
        ]
    else:
        if monitor_config.experiment_name not in workspace.experiments:
            print(f"The experiment: {monitor_config.experiment_name} doesn't "
                  f"exist in the {monitor_config.workspace_name} workspace.")
            exit(-1)

        experiment = Experiment(workspace, monitor_config.experiment_name)
        filters = common_util.get_items_from_string(
            monitor_config.run_status) if monitor_config.run_status else []

        exp_runs = azure_util.fetch_runs(experiment, filters)

        if len(exp_runs) == 0:
            _msg = "No runs to monitor"
            if monitor_config.run_status:
                _msg += f" with status [{monitor_config.run_status}]."
            print(_msg)
            exit(-1)

    # Start TensorBoard on executing machine
    ts = Tensorboard(exp_runs,
                     local_root=str(monitor_config.local_root),
                     port=monitor_config.port)

    print(
        "=============================================================================="
    )
    for run in exp_runs:
        print(f"Run URL: {run.get_portal_url()}")
    print("TensorBoard URL: ")
    ts.start()
    print(
        "==============================================================================\n\n"
    )
    input("Press Enter to close TensorBoard...")
    ts.stop()
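
Stripped of the project-specific configuration handling, the TensorBoard part of monitor reduces to the azureml.tensorboard API. A minimal sketch, assuming a list of AzureML runs has already been fetched:

from azureml.tensorboard import Tensorboard


def run_tensorboard_sketch(runs, local_root: str = "tensorboard_logs", port: int = 6006) -> None:
    # Start a local TensorBoard instance that streams logs from the given runs,
    # then block until the user decides to shut it down.
    tb = Tensorboard(runs, local_root=local_root, port=port)
    url = tb.start()
    print(f"TensorBoard is running at: {url}")
    input("Press Enter to stop TensorBoard...")
    tb.stop()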

Example #4
def test_download_checkpoints_hyperdrive_run(test_output_dirs: OutputFolderForTests,
                                             runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    child_runs = fetch_child_runs(run=fetch_run(runner_config.get_workspace(), DEFAULT_ENSEMBLE_RUN_RECOVERY_ID))
    # Recover each child run separately as well, to test the HyperDrive child run recovery functionality.
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    for child in child_runs:
        expected_files = [config.checkpoint_folder / child.id / expected_checkpoint_file]
        run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config, child)
        assert all([x in expected_files for x in run_recovery.get_checkpoint_paths(epoch=1)])
        assert all([expected_file.exists() for expected_file in expected_files])

Example #5
def test_get_comparison_data(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that metrics.csv and dataset.csv are created after the second epoch, if running on Azure.
    """
    most_recent_run = get_most_recent_run()
    azure_config = AzureConfig.from_yaml(
        fixed_paths.SETTINGS_YAML_FILE,
        project_root=fixed_paths.repository_root_directory())
    workspace = azure_config.get_workspace()
    run = fetch_run(workspace, most_recent_run)
    blob_path = get_epoch_results_path(2, ModelExecutionMode.TEST)
    (comparison_dataset_path,
     comparison_metrics_path) = get_comparison_baseline_paths(
         test_output_dirs.root_dir, blob_path, run, DATASET_CSV_FILE_NAME)
    assert comparison_dataset_path is not None
    assert comparison_metrics_path is not None

Example #6
def test_download_or_get_local_blobs(
        is_current_run: bool, test_config: PlotCrossValidationConfig,
        test_output_dirs: OutputFolderForTests) -> None:
    azure_config = get_default_azure_config()
    azure_config.get_workspace()
    assert test_config.run_recovery_id is not None
    run = Run.get_context() if is_current_run else fetch_run(
        azure_config.get_workspace(), test_config.run_recovery_id)
    run_outputs_dir = full_ml_test_data_path() if is_current_run else Path(
        DEFAULT_AML_UPLOAD_DIR)
    test_config.outputs_directory = run_outputs_dir
    dst = test_config.download_or_get_local_file(
        blob_to_download="dataset.csv",
        destination=test_output_dirs.root_dir,
        run=run)
    assert dst is not None
    assert dst.exists()

Example #7
def test_is_cross_validation_child_run(is_ensemble: bool,
                                       is_numeric: bool) -> None:
    """
    Test that cross validation child runs are identified correctly.
    """
    if is_ensemble:
        rid = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID_NUMERIC if is_numeric else DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    else:
        rid = DEFAULT_RUN_RECOVERY_ID_NUMERIC if is_numeric else DEFAULT_RUN_RECOVERY_ID
    run = fetch_run(workspace=get_default_workspace(), run_recovery_id=rid)
    # check for offline run
    assert not is_cross_validation_child_run(Run.get_context())
    # check for online runs
    assert not is_cross_validation_child_run(run)
    if is_ensemble:
        assert all(
            [is_cross_validation_child_run(x) for x in fetch_child_runs(run)])

Example #8
def test_get_cross_validation_split_index(is_ensemble: bool) -> None:
    """
    Test that retrieved cross validation split index is as expected, for single runs and ensembles.
    """
    run = fetch_run(workspace=get_default_workspace(),
                    run_recovery_id=DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
                    if is_ensemble else DEFAULT_RUN_RECOVERY_ID)
    # check for offline run
    assert get_cross_validation_split_index(
        Run.get_context()) == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX
    # check for online runs
    assert get_cross_validation_split_index(
        run) == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX
    if is_ensemble:
        assert all([
            get_cross_validation_split_index(x) >
            DEFAULT_CROSS_VALIDATION_SPLIT_INDEX for x in fetch_child_runs(run)
        ])
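
The split index checks above presumably read the cross_validation_split_index tag that HyperDrive child runs carry; the same tag is used directly in Example #9 below. A minimal sketch of such a lookup, where the fallback value for non-child runs is an assumption:

from azureml.core import Run

# Assumed fallback for runs that are not cross validation children (e.g. offline runs).
DEFAULT_CROSS_VALIDATION_SPLIT_INDEX = -1


def get_cross_validation_split_index_sketch(run: Run) -> int:
    # Illustrative stand-in: read the split index from the run tags,
    # falling back to the default when the tag is missing.
    tags = run.get_tags()
    return int(tags.get("cross_validation_split_index", DEFAULT_CROSS_VALIDATION_SPLIT_INDEX))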

Example #9
def test_download_checkpoints(test_output_dirs: OutputFolderForTests,
                              is_ensemble: bool,
                              runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    assert get_results_blob_path(
        "some_run_id") == "azureml/ExperimentRun/dcid.some_run_id"
    # Any recent run ID from a PR build will do. Use a PR build because the checkpoint files are small there.
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)

    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID if is_ensemble else DEFAULT_RUN_RECOVERY_ID
    run_recovery = RunRecovery.download_checkpoints_from_recovery_run(
        runner_config, config)
    run_to_recover = fetch_run(workspace=runner_config.get_workspace(),
                               run_recovery_id=runner_config.run_recovery_id)
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    if is_ensemble:
        child_runs = fetch_child_runs(run_to_recover)
        expected_files = [
            config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME /
            str(x.get_tags()['cross_validation_split_index']) /
            expected_checkpoint_file for x in child_runs
        ]
    else:
        expected_files = [
            config.checkpoint_folder / run_to_recover.id /
            expected_checkpoint_file
        ]

    checkpoint_paths = run_recovery.get_checkpoint_paths(1)
    if is_ensemble:
        assert len(run_recovery.checkpoints_roots) == len(expected_files)
        assert all([(x in [y.parent for y in expected_files])
                    for x in run_recovery.checkpoints_roots])
        assert len(checkpoint_paths) == len(expected_files)
        assert all([x in expected_files for x in checkpoint_paths])
    else:
        assert len(checkpoint_paths) == 1
        assert checkpoint_paths[0] == expected_files[0]

    assert all([expected_file.exists() for expected_file in expected_files])
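
The assertion at the top of this test pins down the blob layout AzureML uses for run outputs. A helper consistent with that assertion could be as simple as the following one-liner (an illustrative sketch, not necessarily the project's get_results_blob_path):

def get_results_blob_path_sketch(run_id: str) -> str:
    # Run outputs live under azureml/ExperimentRun/dcid.<run_id> in blob storage,
    # matching the assertion in the test above.
    return f"azureml/ExperimentRun/dcid.{run_id}"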

Example #10
    def download_checkpoints_from_recovery_run(
            azure_config: AzureConfig,
            config: ModelConfigBase,
            run_context: Optional[Run] = None) -> RunRecovery:
        """
        Downloads the checkpoints of the run corresponding to the run_recovery_id in azure_config, and any
        checkpoints of its child runs if they exist.

        :param azure_config: Azure related configs.
        :param config: Model related configs.
        :param run_context: Context of the current run (will be used to find the target AML workspace)
        :return: A RunRecovery object pointing to the downloaded checkpoints.
        """
        run_context = run_context or RUN_CONTEXT
        workspace = azure_config.get_workspace()

        # Find the run to recover in AML workspace
        if not azure_config.run_recovery_id:
            raise ValueError(
                "A valid run_recovery_id is required to download recovery checkpoints, found None"
            )

        run_to_recover = fetch_run(workspace,
                                   azure_config.run_recovery_id.strip())
        # Handle recovery of a HyperDrive cross validation run (from within a successor HyperDrive run,
        # not in ensemble creation). In this case, run_recovery_id refers to the parent prior run, so we
        # need to set run_to_recover to the child of that run whose split index is the same as that of
        # the current (child) run.
        if is_cross_validation_child_run(run_context):
            run_to_recover = next(
                x for x in fetch_child_runs(run_to_recover)
                if get_cross_validation_split_index(
                    x) == get_cross_validation_split_index(run_context))

        return RunRecovery.download_checkpoints_from_run(
            azure_config, config, run_to_recover)

Example #11
def get_run_and_check(run_id: str, expected: bool,
                      workspace: Workspace) -> None:
    run = fetch_run(workspace, run_id)
    status = is_run_and_child_runs_completed(run)
    assert status == expected
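
is_run_and_child_runs_completed is expected to report True only when the run and all of its children finished successfully. A minimal sketch of an equivalent check using plain AzureML SDK calls (an assumption about the helper's semantics, not its actual code):

from azureml.core import Run
from azureml.core.run import RunStatus


def is_run_and_child_runs_completed_sketch(run: Run) -> bool:
    # The run itself and every one of its child runs must have reached the Completed status.
    runs_to_check = [run] + list(run.get_children())
    return all(r.get_status() == RunStatus.COMPLETED for r in runs_to_check)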

Example #12
def download_crossval_result_files(config: PlotCrossValidationConfig,
                                   run_recovery_id: Optional[str] = None,
                                   epoch: Optional[int] = None,
                                   download_to_folder: Optional[Path] = None,
                                   splits_to_evaluate: Optional[List[str]] = None) -> Tuple[List[RunResultFiles], Path]:
    """
    Given an AzureML run, downloads all files that are necessary for doing an analysis of cross validation runs.
    It will download the metrics.csv file for each dataset split (Test, Val) and for all of the run's children.
    When running in segmentation mode, it also downloads the dataset.csv and adds the institutionId and seriesId
    information for each subject found in the metrics files.
    :param config: The PlotCrossValidationConfig that provides the Azure settings, execution modes and output directory.
    :param run_recovery_id: run recovery ID, if different from the one in config
    :param epoch: epoch, if different from the one in config
    :param download_to_folder: The root folder in which all downloaded files should be stored. Point to an existing
    folder with downloaded files for use in unit tests. If not provided, the files will be downloaded to the
    cross validation results folder inside config.outputs_directory.
    :param splits_to_evaluate: If supplied, use these values as the split indices to download. Use only for
    unit testing.
    :return: A list of RunResultFiles, one per downloaded metrics file (covering the execution modes Test and Val),
     and the directory where the epoch results were downloaded to.
    """
    splits_to_evaluate = splits_to_evaluate or []
    if run_recovery_id is None:
        run_recovery_id = config.run_recovery_id
    if epoch is None:
        epoch = config.epoch
    if run_recovery_id:
        workspace = config.azure_config.get_workspace()
        parent = fetch_run(workspace, run_recovery_id)
        runs_to_evaluate = fetch_child_runs(
            run=parent, expected_number_cross_validation_splits=config.number_of_cross_validation_splits)
        logging.info("Adding parent run to the list of runs to evaluate.")
        runs_to_evaluate.append(parent)
        logging.info(f"Will evaluate results for runs: {[x.id for x in runs_to_evaluate]}")
    else:
        runs_to_evaluate = []
    # create the root path to store the outputs
    if not download_to_folder:
        download_to_folder = Path(config.outputs_directory) / CROSSVAL_RESULTS_FOLDER
        # Make the folder if it doesn't exist, but preserve any existing contents.
        download_to_folder.mkdir(parents=True, exist_ok=True)
    start_time = time.time()
    logging.info(f"Starting to download files for cross validation analysis to: {download_to_folder}")
    assert download_to_folder is not None
    result: List[RunResultFiles] = []
    loop_over: List[Tuple[Optional[Run], str, str, Optional[str]]]
    if splits_to_evaluate:
        loop_over = [(None, split, split, "") for split in splits_to_evaluate]
    else:
        loop_over = []
        for run in runs_to_evaluate:
            tags = run.get_tags()
            if is_parent_run(run):
                split_index = ENSEMBLE_SPLIT_NAME
            else:
                split_index = get_split_id(tags, config.is_zero_index)
            split_suffix = split_index
            # Value to put in the "Split" column in the result.
            run_recovery_id = tags[RUN_RECOVERY_ID_KEY]
            loop_over.append((run, split_index, split_suffix, run_recovery_id))

    for run, split_index, split_suffix, run_recovery_id in loop_over:
        if run is not None:
            config.get_short_name(run)
        config.local_run_result_split_suffix = split_suffix
        # When run is the parent run, we need to look on the local disc.
        # If (as expected) dataset.csv is not already present, we copy it from the top of the outputs directory.
        folder_for_run = download_to_folder / split_suffix
        dataset_file: Optional[Path]
        if is_parent_run(run):
            folder_for_run.mkdir(parents=True, exist_ok=True)
            dataset_file = folder_for_run / DATASET_CSV_FILE_NAME
            # Copy the run-0 dataset.csv, which should be the same, as the parent run won't have one.
            shutil.copy(str(Path(config.outputs_directory) / DATASET_CSV_FILE_NAME), str(dataset_file))
        else:
            dataset_file = config.download_or_get_local_file(run, DATASET_CSV_FILE_NAME, folder_for_run)
        if config.model_category == ModelCategory.Segmentation and not dataset_file:
            raise ValueError(f"Dataset file must be present for segmentation models, but is missing for run {run.id}")
        # Get metrics files.
        for mode in config.execution_modes_to_download():
            # download metrics.csv file for each split. metrics_file can be None if the file does not exist
            # (for example, if no output was written for execution mode Test)
            metrics_file = download_metrics_file(config, run, folder_for_run, epoch, mode)
            if metrics_file:
                result.append(RunResultFiles(execution_mode=mode,
                                             dataset_csv_file=dataset_file,
                                             metrics_file=metrics_file,
                                             run_recovery_id=run_recovery_id,
                                             split_index=split_index))
    elapsed = time.time() - start_time
    logging.info(f"Finished downloading files. Total time to download: {elapsed:0.2f}sec")
    return result, download_to_folder
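
The returned list can then be regrouped by execution mode for analysis. A small sketch, assuming RunResultFiles and ModelExecutionMode can be imported from the project as in the examples above:

from collections import defaultdict
from typing import Dict, List


def group_by_execution_mode(result_files: List[RunResultFiles]) -> Dict[ModelExecutionMode, List[RunResultFiles]]:
    # Bucket the downloaded files by their execution mode (e.g. Test or Val),
    # using the execution_mode field set when the result list is built above.
    grouped: Dict[ModelExecutionMode, List[RunResultFiles]] = defaultdict(list)
    for files in result_files:
        grouped[files.execution_mode].append(files)
    return dict(grouped)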