def generate_report(config: DeepLearningConfig, best_epoch: int, model_proc: ModelProcessing) -> None:
    logging.info("Saving report in HTML")
    if config.model_category not in [ModelCategory.Segmentation, ModelCategory.Classification]:
        return
    try:
        def get_epoch_path(mode: ModelExecutionMode) -> Path:
            p = get_epoch_results_path(best_epoch, mode=mode, model_proc=model_proc)
            return config.outputs_folder / p / METRICS_FILE_NAME

        path_to_best_epoch_train = get_epoch_path(ModelExecutionMode.TRAIN)
        path_to_best_epoch_val = get_epoch_path(ModelExecutionMode.VAL)
        path_to_best_epoch_test = get_epoch_path(ModelExecutionMode.TEST)
        output_dir = config.outputs_folder / OTHER_RUNS_SUBDIR_NAME / ENSEMBLE_SPLIT_NAME \
            if model_proc == ModelProcessing.ENSEMBLE_CREATION else config.outputs_folder
        if config.model_category == ModelCategory.Segmentation:
            generate_segmentation_notebook(result_notebook=output_dir / REPORT_IPYNB,
                                           train_metrics=path_to_best_epoch_train,
                                           val_metrics=path_to_best_epoch_val,
                                           test_metrics=path_to_best_epoch_test)
        elif isinstance(config, ScalarModelBase):
            generate_classification_notebook(result_notebook=output_dir / REPORT_IPYNB,
                                             train_metrics=path_to_best_epoch_train,
                                             val_metrics=path_to_best_epoch_val,
                                             test_metrics=path_to_best_epoch_test,
                                             dataset_csv_path=config.local_dataset / DATASET_CSV_FILE_NAME
                                             if config.local_dataset else None,
                                             dataset_subject_column=config.subject_column,
                                             dataset_file_column=config.image_file_column)
        else:
            logging.info(f"Cannot create report for config of type {type(config)}.")
    except Exception as ex:
        print_exception(ex, "Failed to generate reporting notebook.")
def plot_cross_validation_and_upload_results(self) -> Path:
    from InnerEye.ML.visualizers.plot_cross_validation import crossval_config_from_model_config, \
        plot_cross_validation, unroll_aggregate_metrics
    # perform aggregation as cross val splits are now ready
    plot_crossval_config = crossval_config_from_model_config(self.model_config)
    plot_crossval_config.run_recovery_id = PARENT_RUN_CONTEXT.tags[RUN_RECOVERY_ID_KEY_NAME]
    plot_crossval_config.outputs_directory = self.model_config.outputs_folder
    plot_crossval_config.settings_yaml_file = self.yaml_config_file
    cross_val_results_root = plot_cross_validation(plot_crossval_config)
    if self.post_cross_validation_hook:
        self.post_cross_validation_hook(self.model_config, cross_val_results_root)
    # upload results to the parent run's outputs. Normally, we use blobxfer for that, but here we want
    # to ensure that the files are visible inside the AzureML UI.
    PARENT_RUN_CONTEXT.upload_folder(name=CROSSVAL_RESULTS_FOLDER, path=str(cross_val_results_root))
    if self.model_config.is_scalar_model:
        try:
            aggregates = pd.read_csv(cross_val_results_root / METRICS_AGGREGATES_FILE)
            unrolled_aggregate_metrics = unroll_aggregate_metrics(aggregates)
            for m in unrolled_aggregate_metrics:
                PARENT_RUN_CONTEXT.log(m.metric_name, m.metric_value)
        except Exception as ex:
            print_exception(ex, "Unable to log metrics to Hyperdrive parent run.", logger_fn=logging.warning)
    return cross_val_results_root
def create_and_set_torch_datasets(self, for_training: bool = True, for_inference: bool = True) -> None:
    """
    Creates and sets torch datasets for all model execution modes, and stores them in the
    self._datasets_for_training and self._datasets_for_inference fields. It also calls the hook to
    compute statistics for the train/val/test datasets.
    """
    # For models other than segmentation models, it is easier to create both training and inference datasets
    # in one go, ignoring the arguments.
    if self._datasets_for_training is None and self._datasets_for_inference is None:
        datasets = self.create_torch_datasets(self.get_dataset_splits())
        self._datasets_for_training = {mode: datasets[mode]
                                       for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]}
        self._datasets_for_inference = datasets
        for split, dataset in datasets.items():
            logging.info(f"{split.value}: {len(dataset)} subjects. Detailed status: {dataset.status}")
        if self.dataset_stats_hook:
            try:
                self.dataset_stats_hook(datasets)
            except Exception as ex:
                print_exception(ex, message="Error while calling the hook for computing dataset statistics.")
def generate_report(config: DeepLearningConfig, best_epoch: int, model_proc: ModelProcessing) -> None:
    logging.info("Saving report in HTML")
    if not config.is_segmentation_model:
        return
    try:
        def get_epoch_path(mode: ModelExecutionMode) -> Path:
            p = get_epoch_results_path(best_epoch, mode=mode, model_proc=model_proc)
            return config.outputs_folder / p / METRICS_FILE_NAME

        path_to_best_epoch_train = get_epoch_path(ModelExecutionMode.TRAIN)
        path_to_best_epoch_val = get_epoch_path(ModelExecutionMode.VAL)
        path_to_best_epoch_test = get_epoch_path(ModelExecutionMode.TEST)
        output_dir = config.outputs_folder / OTHER_RUNS_SUBDIR_NAME / ENSEMBLE_SPLIT_NAME \
            if model_proc == ModelProcessing.ENSEMBLE_CREATION else config.outputs_folder
        generate_segmentation_notebook(result_notebook=output_dir / REPORT_IPYNB,
                                       train_metrics=path_to_best_epoch_train,
                                       val_metrics=path_to_best_epoch_val,
                                       test_metrics=path_to_best_epoch_test)
    except Exception as ex:
        print_exception(ex, "Failed to generate reporting notebook.")
def test_print_exception() -> None:
    """
    A test that just throws an exception, so that one can check whether the diagnostics are printed
    at the right level. You need to inspect the test output manually.
    """
    try:
        raise ValueError("foo")
    except Exception as ex:
        print_exception(ex, "Message")
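

# A hedged companion sketch (not part of the original suite): pytest's built-in `caplog` fixture can
# capture what print_exception emits through the logging module, so the check on the diagnostics does
# not have to rely on manual inspection. The exact message format of print_exception is an assumption
# here, so the test only asserts that the free-text message appears somewhere in the captured log.
def test_print_exception_with_caplog(caplog) -> None:
    with caplog.at_level(logging.DEBUG):
        try:
            raise ValueError("foo")
        except Exception as ex:
            print_exception(ex, "Message")
    assert "Message" in caplog.text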
def try_compare_scores_against_baselines(self, model_proc: ModelProcessing) -> None:
    """
    Attempt comparison of scores against baseline scores and scatterplot creation if possible.
    """
    if not isinstance(self.model_config, SegmentationModelBase):  # keep type checker happy
        return
    try:
        from InnerEye.ML.baselines_util import compare_scores_against_baselines
        with logging_section("Comparing scores against baselines"):
            compare_scores_against_baselines(self.model_config, self.azure_config, model_proc)
    except Exception as ex:
        print_exception(ex, "Model baseline comparison failed.")
def run_in_situ(self) -> None:
    """
    Actually run the AzureML job; this method will typically run on an Azure VM.
    """
    # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
    # build itself, but not the tons of debug information that AzureML submissions create.
    logging_to_stdout(self.azure_config.log_level)
    suppress_logging_noise()
    pytest_failed = False
    training_failed = False
    pytest_passed = True
    # Ensure that both model training and pytest get executed in all cases, so that we see a full set of
    # test results in each PR.
    outputs_folder = self.model_config.outputs_folder
    try:
        logging_to_file(self.model_config.logs_folder / LOG_FILE_NAME)
        try:
            self.create_ml_runner().run()
        except Exception as ex:
            print_exception(ex, "Model training/testing failed.")
            training_failed = True
        if self.azure_config.pytest_mark:
            try:
                pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
                if not pytest_passed:
                    logging.error(f"Not all PyTest tests passed. See {results_file_path}")
            except Exception as ex:
                print_exception(ex, "Unable to run PyTest.")
                pytest_failed = True
    finally:
        # wait for aggregation if required, and only if the training actually succeeded.
        if not training_failed and self.model_config.should_wait_for_other_cross_val_child_runs():
            self.wait_for_cross_val_runs_to_finish_and_aggregate()
        disable_logging_to_file()
    message = []
    if training_failed:
        message.append("Training failed")
    if pytest_failed:
        message.append("Unable to run Pytest")
    if not pytest_passed:
        message.append("At least 1 test in Pytest failed")
    # Terminate if pytest or model training has failed. This makes the smoke test in
    # PR builds fail if pytest fails.
    if message:
        raise ValueError(f"One component of the training pipeline failed: {'. '.join(message)}")
def generate_report(self, model_proc: ModelProcessing) -> None:
    config = self.model_config
    if config.model_category not in [ModelCategory.Segmentation, ModelCategory.Classification]:
        logging.info(f"No reporting available for a model with category {config.model_category}")
        return
    logging.info("Saving report in HTML")
    try:
        def get_epoch_path(mode: ModelExecutionMode) -> Path:
            p = get_epoch_results_path(mode=mode, model_proc=model_proc)
            return config.outputs_folder / p / SUBJECT_METRICS_FILE_NAME

        path_to_best_epoch_train = get_epoch_path(ModelExecutionMode.TRAIN)
        path_to_best_epoch_val = get_epoch_path(ModelExecutionMode.VAL)
        path_to_best_epoch_test = get_epoch_path(ModelExecutionMode.TEST)
        output_dir = config.outputs_folder / OTHER_RUNS_SUBDIR_NAME / ENSEMBLE_SPLIT_NAME \
            if model_proc == ModelProcessing.ENSEMBLE_CREATION else config.outputs_folder
        reports_dir = output_dir / reports_folder
        if not reports_dir.exists():
            reports_dir.mkdir(exist_ok=False)
        if config.model_category == ModelCategory.Segmentation:
            generate_segmentation_notebook(
                result_notebook=reports_dir / get_ipynb_report_name(config.model_category.value),
                train_metrics=path_to_best_epoch_train,
                val_metrics=path_to_best_epoch_val,
                test_metrics=path_to_best_epoch_test)
        elif isinstance(config, ScalarModelBase) and not isinstance(config, SequenceModelBase):
            generate_classification_notebook(
                result_notebook=reports_dir / get_ipynb_report_name(config.model_category.value),
                config=config,
                train_metrics=path_to_best_epoch_train,
                val_metrics=path_to_best_epoch_val,
                test_metrics=path_to_best_epoch_test)
            if len(config.class_names) > 1:
                generate_classification_multilabel_notebook(
                    result_notebook=reports_dir / get_ipynb_report_name(f"{config.model_category.value}_multilabel"),
                    config=config,
                    train_metrics=path_to_best_epoch_train,
                    val_metrics=path_to_best_epoch_val,
                    test_metrics=path_to_best_epoch_test)
        else:
            logging.info(f"Cannot create report for config of type {type(config)}.")
    except Exception as ex:
        print_exception(ex, "Failed to generate reporting notebook.")
        raise
def download_dataset(azure_dataset_id: str, target_folder: Path, azure_config: AzureConfig) -> Path:
    """
    Downloads or checks for an existing dataset on the executing machine. The dataset specified by the
    azure_dataset_id is downloaded from the AzureML dataset attached to the given AzureML workspace.
    The dataset is downloaded into the `target_folder`, in a subfolder that has the same name as the dataset.
    If there already appears to be such a folder, and the folder contains a dataset.csv file, no download
    is started.
    :param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace.
    :param target_folder: The folder in which to download the dataset from Azure.
    :param azure_config: All Azure-related configuration options.
    :return: A path on the local machine that contains the dataset.
    """
    workspace = azure_config.get_workspace()
    try:
        downloaded_via_blobxfer = download_dataset_via_blobxfer(dataset_id=azure_dataset_id,
                                                                azure_config=azure_config,
                                                                target_folder=target_folder)
        if downloaded_via_blobxfer:
            return downloaded_via_blobxfer
    except Exception as ex:
        print_exception(ex, message="Unable to download dataset via blobxfer.")
    logging.info("Trying to download dataset via AzureML datastore now.")
    azure_dataset = get_or_create_dataset(workspace, azure_dataset_id)
    if not isinstance(azure_dataset, FileDataset):
        raise ValueError(f"Expected to get a FileDataset, but got {type(azure_dataset)}")
    # The downloaded dataset may already exist from a previous run.
    expected_dataset_path = target_folder / azure_dataset_id
    expected_dataset_file = expected_dataset_path / DATASET_CSV_FILE_NAME
    logging.info(f"Model training will use dataset '{azure_dataset_id}' in Azure.")
    if expected_dataset_path.is_dir() and expected_dataset_file.is_file():
        logging.info(f"The dataset appears to be downloaded already in {expected_dataset_path}. Skipping.")
        return expected_dataset_path
    logging.info("Starting to download the dataset - WARNING, this could take very long!")
    with logging_section("Downloading dataset"):
        azure_dataset.download(target_path=str(expected_dataset_path), overwrite=False)
    logging.info(f"Azure dataset '{azure_dataset_id}' is now available in {expected_dataset_path}")
    return expected_dataset_path
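

# A minimal usage sketch for download_dataset (hedged): the dataset name "my_dataset" and the target
# folder below are hypothetical, and building the AzureConfig from the repository's settings file via
# AzureConfig.from_yaml is an assumption about this version of the codebase.
azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
                                     project_root=fixed_paths.repository_root_directory())
dataset_root = download_dataset(azure_dataset_id="my_dataset",
                                target_folder=Path("datasets"),
                                azure_config=azure_config)
logging.info(f"Dataset is available locally at {dataset_root}")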
def run_in_situ(self) -> None:
    """
    Actually run the AzureML job; this method will typically run on an Azure VM.
    """
    # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
    # build itself, but not the tons of debug information that AzureML submissions create.
    logging_to_stdout(self.azure_config.log_level)
    suppress_logging_noise()
    error_messages = []
    # For the PR build in AzureML, we can either run pytest, or run the training of the simple PR model.
    # Running both only works when using DDP_spawn, but that has as a side-effect that it messes up memory
    # consumption of the large models.
    if self.azure_config.pytest_mark:
        try:
            outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
            pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
            if not pytest_passed:
                pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
                logging.error(pytest_failures)
                error_messages.append(pytest_failures)
        except Exception as ex:
            print_exception(ex, "Unable to run PyTest.")
            error_messages.append(f"Unable to run PyTest: {ex}")
    else:
        # Set environment variables for multi-node training if needed.
        # In particular, the multi-node environment variables should NOT be set in single node
        # training, otherwise this might lead to errors with the c10 distributed backend
        # (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
        if self.azure_config.num_nodes > 1:
            set_environment_variables_for_multi_node()
        try:
            logging_to_file(self.model_config.logs_folder / LOG_FILE_NAME)
            try:
                self.create_ml_runner().run()
            except Exception as ex:
                print_exception(ex, "Model training/testing failed.")
                error_messages.append(f"Training failed: {ex}")
        finally:
            disable_logging_to_file()
    # Terminate if pytest or model training has failed. This makes the smoke test in
    # PR builds fail if pytest fails.
    if error_messages:
        raise ValueError(
            f"At least one component of the runner failed: {os.linesep} {os.linesep.join(error_messages)}")