def download_and_compare_scores(
        outputs_folder: Path,
        azure_config: AzureConfig,
        comparison_blob_storage_paths: List[Tuple[str, str]],
        model_dataset_df: pd.DataFrame,
        model_metrics_df: pd.DataFrame) -> DiceScoreComparisonResult:
    """
    Fetch the comparison baselines, compare the current model's scores against them, and
    remove the per-baseline directories under outputs_folder afterwards.

    :param outputs_folder: folder passed to get_comparison_baselines; the subdirectory named
        after each baseline's run_recovery_id is deleted from it once the comparison is over.
    :param azure_config: Azure configuration to use for downloading data
    :param comparison_blob_storage_paths: list of paths to directories containing metrics.csv and dataset.csv files,
        each of the form run_recovery_id/rest_of_path
    :param model_dataset_df: dataframe containing contents of dataset.csv for the current model
    :param model_metrics_df: dataframe containing contents of metrics.csv for the current model
    :return: a dataframe for all the data (current model and all baselines); whether any comparisons were
        done, i.e. whether a valid baseline was found; and the text lines to be written to the Wilcoxon results file.
    """
    comparison_baselines = get_comparison_baselines(
        outputs_folder, azure_config, comparison_blob_storage_paths)
    try:
        return perform_score_comparisons(model_dataset_df, model_metrics_df, comparison_baselines)
    finally:
        # Clean up in a `finally` so that a failing comparison does not leave the
        # baseline directories behind in the outputs folder.
        for baseline in comparison_baselines:
            run_rec_path = outputs_folder / baseline.run_recovery_id
            if run_rec_path.exists():
                logging.info(f"Removing directory {run_rec_path}")
                remove_file_or_directory(run_rec_path)
def create_ensemble_model(self) -> None:
    """
    Build an ensemble model out of the cross-validation models by invoking MLRunner once more.

    The configuration is switched to a non-Hyperdrive, no-cross-validation, inference-only setup,
    the sibling runs' checkpoints are downloaded, and inference plus model registration is run in
    ENSEMBLE_CREATION mode. Afterwards the cross validation results are plotted and uploaded, a
    report is generated, and local copies that are no longer needed are deleted.
    """
    # Import only here in case of dependency issues in reduced environment
    from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler

    # Adjust parameters
    self.azure_config.hyperdrive = False
    self.model_config.number_of_cross_validation_splits = 0
    self.model_config.is_train = False

    with logging_section("Downloading checkpoints from sibling runs"):
        checkpoint_handler = CheckpointHandler(
            model_config=self.model_config,
            azure_config=self.azure_config,
            project_root=self.project_root,
            run_context=PARENT_RUN_CONTEXT,
        )
        checkpoint_handler.discover_and_download_checkpoint_from_sibling_runs(
            output_subdir_name=OTHER_RUNS_SUBDIR_NAME)

    ml_runner = self.create_ml_runner()
    best_epoch = ml_runner.run_inference_and_register_model(
        checkpoint_handler=checkpoint_handler,
        model_proc=ModelProcessing.ENSEMBLE_CREATION,
    )

    crossval_dir = self.plot_cross_validation_and_upload_results()
    Runner.generate_report(self.model_config, best_epoch, ModelProcessing.ENSEMBLE_CREATION)
    # CrossValResults should have been uploaded to the parent run, so the local copy can go.
    remove_file_or_directory(crossval_dir)

    # OTHER_RUNS under the root only holds copies of files available elsewhere, so it is removed
    # as well. Before that, the relevant parts of OTHER_RUNS/ENSEMBLE are uploaded to the parent run.
    sibling_runs_root = self.model_config.outputs_folder / OTHER_RUNS_SUBDIR_NAME
    ensemble_dir = sibling_runs_root / ENSEMBLE_SPLIT_NAME
    if PARENT_RUN_CONTEXT is not None:
        if not ensemble_dir.exists():
            logging.warning(f"Directory not found for upload: {ensemble_dir}")
        else:
            # Only keep baseline Wilcoxon results and scatterplots and reports
            names_to_keep = (
                BASELINE_WILCOXON_RESULTS_FILE,
                SCATTERPLOTS_SUBDIR_NAME,
                REPORT_HTML,
                REPORT_IPYNB,
            )
            for entry in ensemble_dir.glob("*"):
                if entry.name in names_to_keep:
                    continue
                remove_file_or_directory(entry)
            PARENT_RUN_CONTEXT.upload_folder(
                name=BASELINE_COMPARISONS_FOLDER,
                path=str(ensemble_dir))
    remove_file_or_directory(sibling_runs_root)
def create_ensemble_model(self) -> None:
    """
    Create an ensemble model from the results of the sibling runs of the present run.
    The present run here will be cross validation child run 0.

    Steps: download the checkpoints of the Hyperdrive child runs, run inference and register the
    model in ENSEMBLE_CREATION mode, plot and upload the cross validation results, generate the
    report, and finally delete local folders that are no longer needed.
    """
    assert PARENT_RUN_CONTEXT, "This function should only be called in a Hyperdrive run"

    with logging_section("Downloading checkpoints from sibling runs"):
        checkpoint_handler = CheckpointHandler(
            model_config=self.model_config,
            azure_config=self.azure_config,
            project_root=self.project_root,
            run_context=PARENT_RUN_CONTEXT,
        )
        checkpoint_handler.download_checkpoints_from_hyperdrive_child_runs(PARENT_RUN_CONTEXT)

    self.run_inference_and_register_model(
        checkpoint_handler=checkpoint_handler,
        model_proc=ModelProcessing.ENSEMBLE_CREATION,
    )

    crossval_dir = self.plot_cross_validation_and_upload_results()
    self.generate_report(ModelProcessing.ENSEMBLE_CREATION)
    # CrossValResults should have been uploaded to the parent run, so the local copy can go.
    remove_file_or_directory(crossval_dir)

    # OTHER_RUNS under the root only holds copies of files available elsewhere, so it is removed
    # as well. Before that, the relevant parts of OTHER_RUNS/ENSEMBLE are uploaded to the parent run.
    sibling_runs_root = self.model_config.outputs_folder / OTHER_RUNS_SUBDIR_NAME
    ensemble_dir = sibling_runs_root / ENSEMBLE_SPLIT_NAME
    if PARENT_RUN_CONTEXT is not None:
        if not ensemble_dir.exists():
            logging.warning(f"Directory not found for upload: {ensemble_dir}")
        else:
            # Only keep baseline Wilcoxon results and scatterplots and reports
            names_to_keep = (
                BASELINE_WILCOXON_RESULTS_FILE,
                SCATTERPLOTS_SUBDIR_NAME,
                REPORT_HTML,
                REPORT_IPYNB,
            )
            for entry in ensemble_dir.glob("*"):
                if entry.name in names_to_keep:
                    continue
                remove_file_or_directory(entry)
            PARENT_RUN_CONTEXT.upload_folder(
                name=BASELINE_COMPARISONS_FOLDER,
                path=str(ensemble_dir))
    remove_file_or_directory(sibling_runs_root)