def register_model_for_epoch(self,
                             checkpoint_paths: List[Path],
                             model_description: str,
                             model_proc: ModelProcessing) -> None:
    """
    Registers the model in AzureML, using the given set of checkpoints. The AzureML run's tags are updated
    with information about ensemble creation and the parent run ID.
    :param checkpoint_paths: The set of Pytorch checkpoints that should be included.
    :param model_description: A string description of the model, usually containing accuracy numbers.
    :param model_proc: The type of model that is registered (single or ensemble)
    """
    if not checkpoint_paths:
        # Without at least one checkpoint there is nothing to register.
        logging.warning("Abandoning model registration - no valid checkpoint paths found")
        return
    config = self.model_config
    if not config.is_offline_run:
        # Tag the AzureML run so downstream tooling can distinguish ensemble builds and child runs.
        split_index = RUN_CONTEXT.get_tags().get(CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, None)
        if split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
            is_ensemble = model_proc == ModelProcessing.ENSEMBLE_CREATION
            update_run_tags(RUN_CONTEXT, {IS_ENSEMBLE_KEY_NAME: is_ensemble})
        elif PARENT_RUN_CONTEXT is not None:
            update_run_tags(RUN_CONTEXT, {PARENT_RUN_ID_KEY_NAME: PARENT_RUN_CONTEXT.id})
    # Only segmentation models have a registration/deployment path; everything else is a no-op.
    if isinstance(config, SegmentationModelBase):
        with logging_section(f"Registering {model_proc.value} model"):
            self.register_segmentation_model(checkpoint_paths=checkpoint_paths,
                                             model_description=model_description,
                                             model_proc=model_proc)
    else:
        logging.info(f"No deployment done for this type of model: {type(config)}")
def register_model_for_epoch(self,
                             run_context: Run,
                             checkpoint_handler: CheckpointHandler,
                             best_epoch: int,
                             best_epoch_dice: float,
                             model_proc: ModelProcessing) -> None:
    """
    Registers the model for the given epoch in AzureML, after looking up the matching checkpoints
    via the checkpoint handler. The run's tags are updated with ensemble/parent-run information.
    :param run_context: The AzureML run on which tags are updated.
    :param checkpoint_handler: Provides the checkpoint paths for the given epoch.
    :param best_epoch: The training epoch that resulted in the highest validation score.
    :param best_epoch_dice: Dice metric for the best epoch.
    :param model_proc: The type of model that is registered (single or ensemble).
    """
    recovered = checkpoint_handler.get_checkpoint_from_epoch(epoch=best_epoch)
    # Bail out early if the handler found nothing usable for this epoch.
    if not recovered or not recovered.checkpoint_paths:
        logging.warning("Abandoning model registration - no valid checkpoint paths found")
        return
    if not self.model_config.is_offline_run:
        # Tag the AzureML run so downstream tooling can distinguish ensemble builds and child runs.
        split_index = run_context.get_tags().get(CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, None)
        if split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
            is_ensemble = model_proc == ModelProcessing.ENSEMBLE_CREATION
            update_run_tags(run_context, {IS_ENSEMBLE_KEY_NAME: is_ensemble})
        elif PARENT_RUN_CONTEXT is not None:
            update_run_tags(run_context, {PARENT_RUN_ID_KEY_NAME: PARENT_RUN_CONTEXT.id})
    with logging_section(f"Registering {model_proc.value} model"):
        self.register_segmentation_model(run=run_context,
                                         best_epoch=best_epoch,
                                         best_epoch_dice=best_epoch_dice,
                                         checkpoint_paths=recovered.checkpoint_paths,
                                         model_proc=model_proc)
def register_model_for_epoch(self,
                             run_context: Run,
                             run_recovery: Optional[RunRecovery],
                             best_epoch: int,
                             best_epoch_dice: float,
                             model_proc: ModelProcessing) -> None:
    """
    Registers the model for the given epoch in AzureML. Checkpoints come either from the present run's
    own output folder or, when a run recovery object is given, from the recovered runs. The run's tags
    are updated with ensemble/parent-run information.
    :param run_context: The AzureML run on which tags are updated.
    :param run_recovery: If present, the source of checkpoints from recovered (child) runs.
    :param best_epoch: The training epoch that resulted in the highest validation score.
    :param best_epoch_dice: Dice metric for the best epoch.
    :param model_proc: The type of model that is registered (single or ensemble).
    """
    if run_recovery:
        checkpoint_paths = run_recovery.get_checkpoint_paths(best_epoch)
    else:
        checkpoint_paths = [self.model_config.get_path_to_checkpoint(best_epoch)]
    if not self.model_config.is_offline_run:
        # Tag the AzureML run so downstream tooling can distinguish ensemble builds and child runs.
        split_index = run_context.get_tags().get(CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, None)
        if split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
            is_ensemble = model_proc == ModelProcessing.ENSEMBLE_CREATION
            update_run_tags(run_context, {IS_ENSEMBLE_KEY_NAME: is_ensemble})
        elif PARENT_RUN_CONTEXT is not None:
            update_run_tags(run_context, {PARENT_RUN_ID_KEY_NAME: PARENT_RUN_CONTEXT.id})
    # Discard any checkpoint paths that do not exist - they will make registration fail. This can happen
    # when some child runs fail; it may still be worth registering the model.
    valid_checkpoint_paths = []
    for path in checkpoint_paths:
        if not path.exists():
            logging.warning(f"Discarding non-existent checkpoint path {path}")
            continue
        valid_checkpoint_paths.append(path)
    if not valid_checkpoint_paths:
        # No usable checkpoint survived the filter - nothing to register.
        logging.warning("Abandoning model registration - no valid checkpoint paths found")
        return
    with logging_section(f"Registering {model_proc.value} model"):
        self.register_segmentation_model(run=run_context,
                                         best_epoch=best_epoch,
                                         best_epoch_dice=best_epoch_dice,
                                         checkpoint_paths=valid_checkpoint_paths,
                                         model_proc=model_proc)
def register_segmentation_model(self,
                                checkpoint_paths: List[Path],
                                model_description: str,
                                model_proc: ModelProcessing) -> Tuple[Optional[Model], Optional[Any]]:
    """
    Registers a new model in the workspace's model registry to be deployed further,
    and creates a model zip for portal deployment (if required). This model is the
    model checkpoint with the highest test accuracy.
    :param model_description: A string description that is added to the deployed model. It would usually contain
    the test set performance and information at which epoch the result was achieved.
    :param checkpoint_paths: Checkpoint paths to use to upload model checkpoints to AML.
    :param model_proc: whether it's a single or ensemble model.
    :returns Tuple element 1: AML model object, or None if no model could be registered.
    Tuple element 2: The result of running the model_deployment_hook, or None if no hook was supplied.
    """
    is_offline_run = is_offline_run_context(RUN_CONTEXT)
    workspace = None
    # Terminate early if this is running outside AzureML, and we can't access the AzureML workspace. This
    # saves time copying around files.
    if is_offline_run:
        try:
            workspace = self.azure_config.get_workspace()
        except Exception:
            # Best-effort: an offline run without Azure credentials simply skips registration.
            logging.warning("Unable to retrieve AzureML workspace. Was the Azure setup completed?")
            logging.info("No model was registered in AzureML.")
            return None, None
    # The files for the final model can't live in the outputs folder. If they do: when registering the model,
    # the files may not yet uploaded by hosttools, and that may (or not) cause errors. Hence, place the folder
    # for the final models outside of "outputs", and upload manually.
    model_subfolder = FINAL_MODEL_FOLDER if model_proc == ModelProcessing.DEFAULT else FINAL_ENSEMBLE_MODEL_FOLDER
    final_model_folder = self.model_config.file_system_config.run_folder / model_subfolder
    # Copy all code from project and InnerEye into the model folder, and copy over checkpoints.
    # This increases the size of the data stored for the run. The other option would be to store all checkpoints
    # right in the final model folder - however, then that would also contain any other checkpoints that the model
    # produced or downloaded for recovery, bloating the final model file.
    self.copy_child_paths_to_folder(final_model_folder, checkpoint_paths)
    logging.info("Registering the model on the workspace.")
    if is_offline_run:
        # Offline runs register directly against the workspace, annotating who built the model.
        model_description = model_description + f"\nModel built by {self.azure_config.build_user} outside AzureML"
        model = Model.register(workspace=workspace,
                               model_name=self.model_config.model_name,
                               model_path=str(final_model_folder),
                               description=model_description)
    else:
        # This is the path under which AzureML will know the files: Either "final_model" or "final_ensemble_model"
        artifacts_path = model_subfolder
        # If the present run is a child run of a Hyperdrive parent run, and we are building an ensemble model,
        # register the model on the parent run.
        if PARENT_RUN_CONTEXT and model_proc == ModelProcessing.ENSEMBLE_CREATION:
            run_to_register_on = PARENT_RUN_CONTEXT
            logging.info(f"Registering the model on the parent run {run_to_register_on.id}")
        else:
            run_to_register_on = RUN_CONTEXT
            logging.info(f"Registering the model on the current run {run_to_register_on.id}")
        logging.info(f"Uploading files in {final_model_folder} with prefix '{artifacts_path}'")
        # upload_folder wants a path relative to the working directory.
        # NOTE(review): assumes final_model_folder is below Path.cwd() - relative_to raises otherwise; confirm.
        final_model_folder_relative = final_model_folder.relative_to(Path.cwd())
        run_to_register_on.upload_folder(name=artifacts_path, path=str(final_model_folder_relative))
        # When registering the model on the run, we need to provide a relative path inside of the run's output
        # folder in `model_path`
        model = run_to_register_on.register_model(model_name=self.model_config.model_name,
                                                  model_path=artifacts_path,
                                                  tags=RUN_CONTEXT.get_tags(),
                                                  description=model_description)
    deployment_result = None
    logging.info(f"Registered {model_proc.value} model: {model.name}, with Id: {model.id}")
    # update the run's tags with the registered model information
    if not is_offline_run:
        update_run_tags(RUN_CONTEXT, {MODEL_ID_KEY_NAME: model.id})
    # create a version of the model for deployment if the hook is provided
    if self.model_deployment_hook is not None:
        assert isinstance(self.model_config, SegmentationModelBase)
        deployment_result = self.model_deployment_hook(self.model_config, self.azure_config, model, model_proc)
    return model, deployment_result
def register_segmentation_model(self,
                                best_epoch: int,
                                best_epoch_dice: float,
                                checkpoint_paths: List[Path],
                                model_proc: ModelProcessing,
                                run: Optional[Run] = None,
                                workspace: Optional[Workspace] = None,
                                tags: Optional[Dict[str, str]] = None) -> \
        Tuple[Optional[Model], Optional[Path], Any]:
    """
    Registers a new model in the workspace's model registry to be deployed further,
    and creates a model zip for portal deployment (if required). This model, is the
    model checkpoint with the highest test accuracy.
    :param best_epoch: The training epoch that resulted in the highest validation score.
    :param best_epoch_dice: Dice metric for the best epoch
    :param checkpoint_paths: Checkpoint paths to use to upload model checkpoints to AML.
    :param model_proc: whether it's a single or ensemble model.
    :param run: If provided then the run's workspace and tags will be used to register the model.
    :param workspace: If provided, then this workspace will be used to register the model
    instead of the workspace associated with the provided run.
    :param tags: If provided, then these will be used instead of the tags found in the provided run.
    :returns AML model object, the path to the specially-deployed model if any, and a further object
    relating to model deployment; if model_deployment_hook is None, the last two are also None.
    However if a model cannot be registered because the run is an _OfflineRun, or the model_config is not
    for a segmentation model, None is returned instead of a model.
    """
    if not isinstance(self.model_config, SegmentationModelBase):
        logging.warning("Non-segmentation models cannot be registered")
        return None, None, None
    # Exactly one of `run` / `workspace` must be given: `==` is true when both or neither are None.
    if (run is None) == (workspace is None):
        raise ValueError("Either a run or a workspace must be provided but not both")
    elif run:
        # _OfflineRun objects have no experiment attached; there is nowhere to register the model.
        if not hasattr(run, 'experiment'):
            logging.warning("Not registering a model, because the run has no associated experiment")
            return None, None, None
        workspace = run.experiment.workspace
        tags = run.get_tags()
    # Store checkpoint paths relative to the project root so the registered model is relocatable.
    relative_checkpoint_paths = [x.relative_to(self.project_root) if x.is_absolute() else x
                                 for x in checkpoint_paths]
    # Write the inference configuration (structure names, colours, checkpoints) next to the model files,
    # so that a deployed model knows how to run inference.
    model_inference_config = ModelInferenceConfig(model_name=self.model_config.model_name,
                                                  structure_names=self.model_config.ground_truth_ids_display_names,
                                                  colours=self.model_config.colours,
                                                  fill_holes=self.model_config.fill_holes,
                                                  model_configs_namespace=self.model_config.__class__.__module__,
                                                  checkpoint_paths=list(map(str, relative_checkpoint_paths)))
    full_path_to_config = self.project_root / fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME
    full_path_to_config.write_text(model_inference_config.to_json(), encoding='utf-8')  # type: ignore
    relative_child_paths = self.get_child_paths(checkpoint_paths)
    # Add experiment and run ID to tags
    if run is not None:
        tags = self.tags_with_run_information(run, tags)
    model = Model.register(workspace=workspace,
                           model_path=str(self.project_root),
                           child_paths=relative_child_paths,
                           model_name=self.model_config.model_name,
                           tags=tags,
                           description="Best epoch: {}, Accuracy : {}".format(best_epoch, best_epoch_dice))
    logging.info(f"Registered {model_proc.value} model: {model.name}, with Id: {model.id}")
    # update the run's tags with the registered model information
    # NOTE(review): when only `workspace` was supplied, `run` is None here while is_offline_run may be
    # False - update_run_tags would then receive None; confirm this path cannot occur in practice.
    if not self.model_config.is_offline_run:
        update_run_tags(run, {MODEL_ID_KEY_NAME: model.id})
    # create a version of the model for deployment if the hook is provided
    if self.model_deployment_hook is not None:
        assert isinstance(self.model_config, SegmentationModelBase)
        deployment_model_path, deployment_model_spec = self.model_deployment_hook(
            self.model_config, self.azure_config, model, model_proc)
        return model, deployment_model_path, deployment_model_spec
    return model, None, None