def download_and_compare_scores(
        outputs_folder: Path, azure_config: AzureConfig,
        comparison_blob_storage_paths: List[Tuple[str, str]],
        model_dataset_df: pd.DataFrame,
        model_metrics_df: pd.DataFrame) -> DiceScoreComparisonResult:
    """
    Obtain the comparison baselines, compare the current model's scores against them, and afterwards delete
    the per-baseline directories found under the outputs folder.

    :param outputs_folder: folder passed on to get_comparison_baselines; once the comparison is done, every
    existing entry outputs_folder/<run_recovery_id> belonging to one of the baselines is removed
    :param azure_config: Azure configuration to use for downloading data
    :param comparison_blob_storage_paths: list of paths to directories containing metrics.csv and dataset.csv
    files, each of the form run_recovery_id/rest_of_path
    :param model_dataset_df: dataframe containing contents of dataset.csv for the current model
    :param model_metrics_df: dataframe containing contents of metrics.csv for the current model
    :return: the result of perform_score_comparisons: a dataframe for all the data (current model and all
    baselines); whether any comparisons were done, i.e. whether a valid baseline was found; and the text lines
    to be written to the Wilcoxon results file.
    """
    baselines = get_comparison_baselines(
        outputs_folder, azure_config, comparison_blob_storage_paths)
    comparison_result = perform_score_comparisons(
        model_dataset_df, model_metrics_df, baselines)
    # The comparison is complete, so the per-baseline directories under outputs_folder can go.
    candidate_paths = (outputs_folder / entry.run_recovery_id for entry in baselines)
    for run_rec_path in candidate_paths:
        if not run_rec_path.exists():
            continue
        logging.info(f"Removing directory {run_rec_path}")
        remove_file_or_directory(run_rec_path)
    return comparison_result
예제 #2
0
    def create_ensemble_model(self) -> None:
        """
        Run the ML runner once more after the cross-validation models have been trained, this time in
        ensemble-creation mode: download the sibling checkpoints, run inference and register the model,
        produce the cross-validation plots and the report, and finally tidy up the temporary folders.
        """
        # Deferred import, in case of dependency issues in a reduced environment
        from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
        # Adjust parameters: no Hyperdrive, no cross-validation splits, not training
        self.azure_config.hyperdrive = False
        self.model_config.number_of_cross_validation_splits = 0
        self.model_config.is_train = False

        with logging_section("Downloading checkpoints from sibling runs"):
            checkpoint_handler = CheckpointHandler(
                model_config=self.model_config,
                azure_config=self.azure_config,
                project_root=self.project_root,
                run_context=PARENT_RUN_CONTEXT)
            checkpoint_handler.discover_and_download_checkpoint_from_sibling_runs(
                output_subdir_name=OTHER_RUNS_SUBDIR_NAME)

        ml_runner = self.create_ml_runner()
        best_epoch = ml_runner.run_inference_and_register_model(
            checkpoint_handler=checkpoint_handler,
            model_proc=ModelProcessing.ENSEMBLE_CREATION)

        crossval_dir = self.plot_cross_validation_and_upload_results()
        Runner.generate_report(
            self.model_config, best_epoch, ModelProcessing.ENSEMBLE_CREATION)
        # CrossValResults should already be on the parent run, so the local copy is not needed.
        remove_file_or_directory(crossval_dir)

        # OTHER_RUNS under the root only holds copies of files available elsewhere and can be deleted as
        # well, but the relevant parts of OTHER_RUNS/ENSEMBLE have to be uploaded beforehand.
        other_runs_dir = self.model_config.outputs_folder / OTHER_RUNS_SUBDIR_NAME
        other_runs_ensemble_dir = other_runs_dir / ENSEMBLE_SPLIT_NAME
        have_parent_run = PARENT_RUN_CONTEXT is not None
        if have_parent_run and not other_runs_ensemble_dir.exists():
            logging.warning(
                f"Directory not found for upload: {other_runs_ensemble_dir}"
            )
        elif have_parent_run:
            # Everything except the baseline Wilcoxon results, scatterplots and reports is discarded
            names_to_keep = (
                BASELINE_WILCOXON_RESULTS_FILE,
                SCATTERPLOTS_SUBDIR_NAME,
                REPORT_HTML,
                REPORT_IPYNB,
            )
            for entry in other_runs_ensemble_dir.glob("*"):
                if entry.name in names_to_keep:
                    continue
                remove_file_or_directory(entry)
            PARENT_RUN_CONTEXT.upload_folder(
                name=BASELINE_COMPARISONS_FOLDER,
                path=str(other_runs_ensemble_dir))
        remove_file_or_directory(other_runs_dir)
예제 #3
0
    def create_ensemble_model(self) -> None:
        """
        Build an ensemble model out of the results of the sibling runs of the present run, which is expected
        to be cross validation child run 0. Checkpoints of the Hyperdrive child runs are downloaded, inference
        and model registration are run in ensemble-creation mode, plots and the report are generated, and the
        temporary folders are cleaned up at the end.
        """
        assert PARENT_RUN_CONTEXT, "This function should only be called in a Hyperdrive run"
        with logging_section("Downloading checkpoints from sibling runs"):
            checkpoint_handler = CheckpointHandler(
                model_config=self.model_config,
                azure_config=self.azure_config,
                project_root=self.project_root,
                run_context=PARENT_RUN_CONTEXT)
            checkpoint_handler.download_checkpoints_from_hyperdrive_child_runs(
                PARENT_RUN_CONTEXT)

        self.run_inference_and_register_model(
            checkpoint_handler=checkpoint_handler,
            model_proc=ModelProcessing.ENSEMBLE_CREATION)

        crossval_dir = self.plot_cross_validation_and_upload_results()
        self.generate_report(ModelProcessing.ENSEMBLE_CREATION)
        # CrossValResults should already be on the parent run, so the local copy is not needed.
        remove_file_or_directory(crossval_dir)

        # OTHER_RUNS under the root only holds copies of files available elsewhere and can be deleted as
        # well, but the relevant parts of OTHER_RUNS/ENSEMBLE have to be uploaded beforehand.
        other_runs_dir = self.model_config.outputs_folder / OTHER_RUNS_SUBDIR_NAME
        other_runs_ensemble_dir = other_runs_dir / ENSEMBLE_SPLIT_NAME
        have_parent_run = PARENT_RUN_CONTEXT is not None
        if have_parent_run and not other_runs_ensemble_dir.exists():
            logging.warning(
                f"Directory not found for upload: {other_runs_ensemble_dir}"
            )
        elif have_parent_run:
            # Everything except the baseline Wilcoxon results, scatterplots and reports is discarded
            names_to_keep = (
                BASELINE_WILCOXON_RESULTS_FILE,
                SCATTERPLOTS_SUBDIR_NAME,
                REPORT_HTML,
                REPORT_IPYNB,
            )
            for entry in other_runs_ensemble_dir.glob("*"):
                if entry.name in names_to_keep:
                    continue
                remove_file_or_directory(entry)
            PARENT_RUN_CONTEXT.upload_folder(
                name=BASELINE_COMPARISONS_FOLDER,
                path=str(other_runs_ensemble_dir))
        remove_file_or_directory(other_runs_dir)