def set_run_tags_from_parent(self) -> None:
    """
    Set metadata for the run
    """
    assert PARENT_RUN_CONTEXT, "This function should only be called in a Hyperdrive run."
    run_tags_parent = PARENT_RUN_CONTEXT.get_tags()
    tags_to_copy = [
        "tag",
        "model_name",
        "execution_mode",
        "recovered_from",
        "friendly_name",
        "build_number",
        "build_user",
        "source_repository",
        "source_branch",
        "source_id",
        "source_message",
        "source_author",
        "source_dirty",
        RUN_RECOVERY_FROM_ID_KEY_NAME
    ]
    new_tags = {tag: run_tags_parent.get(tag, "") for tag in tags_to_copy}
    new_tags[RUN_RECOVERY_ID_KEY_NAME] = create_run_recovery_id(run=RUN_CONTEXT)
    new_tags[CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY] = str(self.model_config.cross_validation_split_index)
    new_tags[EFFECTIVE_RANDOM_SEED_KEY_NAME] = str(self.model_config.get_effective_random_seed())
    if isinstance(self.model_config, ScalarModelBase):
        new_tags[NUMBER_OF_CROSS_VALIDATION_SPLITS_PER_FOLD_KEY_NAME] = \
            str(self.model_config.number_of_cross_validation_splits_per_fold)
        new_tags[CROSS_VALIDATION_SUB_FOLD_SPLIT_INDEX_TAG_KEY] = \
            str(self.model_config.cross_validation_sub_fold_split_index)
    RUN_CONTEXT.set_tags(new_tags)
def generate_and_print_model_summary(config: ModelConfigBase, model: DeviceAwareModule) -> None:
    """
    Writes a human readable summary of the present model to logging.info, and logs the number of trainable
    parameters to AzureML.

    :param config: The configuration for the model.
    :param model: The instantiated Pytorch model.
    """
    random_state = RandomStateSnapshot.snapshot_random_state()
    # There appears to be a bug in apex, where previous use (in training for example) causes problems
    # when another model is later built on the CPU (for example, before loading from a checkpoint)
    # https://github.com/NVIDIA/apex/issues/694
    # Hence, move the model to the GPU before doing model summary.
    if config.use_gpu:
        model = model.cuda()
    if isinstance(config, ScalarModelBase):
        # To generate the model summary, read the first item of the dataset. Then use the model's own
        # get_model_input function to convert the dataset item to input tensors, and feed them through the model.
        train_dataset = config.get_torch_dataset_for_inference(ModelExecutionMode.TRAIN)
        train_item_0 = next(iter(train_dataset.as_data_loader(shuffle=False, batch_size=1, num_dataload_workers=0)))
        model_inputs = get_scalar_model_inputs_and_labels(config, model, train_item_0).model_inputs
        # The model inputs may already be converted to float16, assuming that we would do mixed precision.
        # However, the model is not yet converted to float16 when this function is called, hence convert back
        # to float32.
        summary = ModelSummary(model)
        summary.generate_summary(input_tensors=model_inputs, log_summaries_to_files=config.log_summaries_to_files)
    elif config.is_segmentation_model:
        summary_for_segmentation_models(config, model)
        assert model.summarizer
        summary = model.summarizer  # type: ignore
    else:
        raise ValueError("Don't know how to generate a summary for this type of model.")
    RUN_CONTEXT.log(LoggingColumns.NumTrainableParameters, summary.n_trainable_params)
    random_state.restore_random_state()
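# A minimal sketch of the snapshot/restore pattern that generate_and_print_model_summary relies on.
# RandomStateSnapshot is project-specific; the stand-in below only uses the standard `random`, NumPy and
# PyTorch RNG APIs, and the class and method names here are hypothetical, not the project's implementation.
import random

import numpy as np
import torch


class SimpleRandomStateSnapshot:
    """Captures the RNG state of random, NumPy and PyTorch, so it can be restored after a side effect."""

    def __init__(self) -> None:
        self.python_state = random.getstate()
        self.numpy_state = np.random.get_state()
        self.torch_state = torch.get_rng_state()

    @classmethod
    def snapshot_random_state(cls) -> "SimpleRandomStateSnapshot":
        return cls()

    def restore_random_state(self) -> None:
        random.setstate(self.python_state)
        np.random.set_state(self.numpy_state)
        torch.set_rng_state(self.torch_state)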
def register_model(self,
                   checkpoint_paths: List[Path],
                   model_description: str,
                   model_proc: ModelProcessing) -> None:
    """
    Registers the model in AzureML, with the given set of checkpoints. The AzureML run's tags are updated
    with information about ensemble creation and the parent run ID.

    :param checkpoint_paths: The set of Pytorch checkpoints that should be included.
    :param model_description: A string description of the model, usually containing accuracy numbers.
    :param model_proc: The type of model that is registered (single or ensemble)
    """
    if not checkpoint_paths:
        # No point continuing, since no checkpoints were found
        logging.warning("Abandoning model registration - no valid checkpoint paths found")
        return
    if not self.model_config.is_offline_run:
        split_index = RUN_CONTEXT.get_tags().get(CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, None)
        if split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
            RUN_CONTEXT.tag(IS_ENSEMBLE_KEY_NAME, str(model_proc == ModelProcessing.ENSEMBLE_CREATION))
        elif PARENT_RUN_CONTEXT is not None:
            RUN_CONTEXT.tag(PARENT_RUN_ID_KEY_NAME, str(PARENT_RUN_CONTEXT.id))
    if isinstance(self.model_config, SegmentationModelBase):
        with logging_section(f"Registering {model_proc.value} model"):
            self.register_segmentation_model(checkpoint_paths=checkpoint_paths,
                                             model_description=model_description,
                                             model_proc=model_proc)
    else:
        logging.info(f"No deployment done for this type of model: {type(self.model_config)}")
def upload_output_file_as_temp(file_path: Path, outputs_folder: Path) -> None:
    """
    Uploads a file to the AzureML run. It will get a name that is composed of a "temp/" prefix, plus the path
    of the file relative to the outputs folder that is used for training.

    :param file_path: The path of the file to upload.
    :param outputs_folder: The root folder that contains all training outputs.
    """
    upload_name = TEMP_PREFIX + str(file_path.relative_to(outputs_folder))
    RUN_CONTEXT.upload_file(upload_name, path_or_stream=str(file_path))
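# Illustrative use of upload_output_file_as_temp, assuming TEMP_PREFIX == "temp/" (suggested by the naming
# in this module, but an assumption here). All paths below are hypothetical.
from pathlib import Path

outputs_folder = Path("/tmp/run/outputs")
metrics_file = outputs_folder / "Train" / "metrics.csv"
# The upload name is the path relative to the outputs folder, prefixed with "temp/":
upload_name = "temp/" + str(metrics_file.relative_to(outputs_folder))  # -> "temp/Train/metrics.csv" on POSIX
# upload_output_file_as_temp(metrics_file, outputs_folder) would upload the file under exactly that name.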
def log_image(self, name: str, path: str) -> None:
    """
    Logs a PNG image stored in `path` to Azure and Tensorboard.
    """
    if not is_offline_run_context(RUN_CONTEXT):
        RUN_CONTEXT.log_image(name=name, path=path)
    writer = self.tensorboard_logger
    img = Image.open(path).convert("RGB")
    img = np.transpose(np.asarray(img), (2, 0, 1))
    writer.add_image(name, img, self.epoch)
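# The transpose in log_image converts the image from the HWC (height, width, channel) layout that PIL/NumPy
# produce to the CHW layout that SummaryWriter.add_image expects by default. Minimal shape-only illustration:
import numpy as np

hwc = np.zeros((224, 224, 3), dtype=np.uint8)  # PIL-style layout: height, width, channels
chw = np.transpose(hwc, (2, 0, 1))             # shape (3, 224, 224), as add_image expects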
def log_to_azure(self, label: str, metric: float) -> None:
    """
    Logs a metric as a key/value pair to AzureML.
    """
    if not is_offline_run_context(RUN_CONTEXT):
        metric_name = self.logging_prefix + label
        RUN_CONTEXT.log(metric_name, metric)
        # When running in a cross validation setting, log all metrics to the hyperdrive parent run too,
        # so that we can easily overlay graphs across runs.
        if self.log_to_parent_run and PARENT_RUN_CONTEXT:
            if self.cross_validation_split_index > DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
                PARENT_RUN_CONTEXT.log(f"{metric_name}_Split{self.cross_validation_split_index}", metric)
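# Sketch of the metric names that log_to_azure produces in a cross-validation run. The helper and the example
# values are hypothetical; only the f-string pattern is taken from the code above.
def parent_metric_name(logging_prefix: str, label: str, split_index: int) -> str:
    # Mirrors the name logged to the Hyperdrive parent run, so that per-split curves can be overlaid there.
    return f"{logging_prefix}{label}_Split{split_index}"


# Child run 1 with prefix "val/" logs "val/Loss" to itself and "val/Loss_Split1" to the parent run:
assert parent_metric_name("val/", "Loss", 1) == "val/Loss_Split1"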
def print_git_tags(self) -> None:
    """
    When running in AzureML, print all the tags that contain information about the git repository status,
    for answering the question "which code version was used" from a log file only.
    """
    git_tags = get_git_tags(self.azure_config)
    if is_offline_run_context(RUN_CONTEXT):
        # When running on a VM outside AzureML, we can read git information from the current repository
        tags_to_print = git_tags
    else:
        # When running in AzureML, the git repo information is not necessarily passed in, but we copy the git
        # information into run tags after submitting the job, and can read it out here.
        # Only print out those tags that were created from git-related information
        tags_to_print = {key: value for key, value in RUN_CONTEXT.get_tags().items() if key in git_tags}
    logging.info("Git repository information:")
    for key, value in tags_to_print.items():
        logging.info(f"  {key:20}: {value}")
def register_segmentation_model(self,
                                checkpoint_paths: List[Path],
                                model_description: str,
                                model_proc: ModelProcessing) -> Tuple[Optional[Model], Optional[Any]]:
    """
    Registers a new model in the workspace's model registry to be deployed further, and creates a model zip
    for portal deployment (if required). This model is the model checkpoint with the highest test accuracy.

    :param model_description: A string description that is added to the deployed model. It would usually contain
        the test set performance and information at which epoch the result was achieved.
    :param checkpoint_paths: Checkpoint paths to use to upload model checkpoints to AML.
    :param model_proc: whether it's a single or ensemble model.
    :returns: Tuple element 1: AML model object, or None if no model could be registered.
        Tuple element 2: The result of running the model_deployment_hook, or None if no hook was supplied.
    """
    is_offline_run = is_offline_run_context(RUN_CONTEXT)
    workspace = None
    # Terminate early if this is running outside AzureML, and we can't access the AzureML workspace. This
    # saves time copying around files.
    if is_offline_run:
        try:
            workspace = self.azure_config.get_workspace()
        except Exception:
            logging.warning("Unable to retrieve AzureML workspace. Was the Azure setup completed?")
            logging.info("No model was registered in AzureML.")
            return None, None
    # The files for the final model can't live in the outputs folder. If they do: when registering the model,
    # the files may not yet be uploaded by hosttools, and that may (or may not) cause errors. Hence, place the
    # folder for the final models outside of "outputs", and upload manually.
    model_subfolder = FINAL_MODEL_FOLDER if model_proc == ModelProcessing.DEFAULT else FINAL_ENSEMBLE_MODEL_FOLDER
    final_model_folder = self.model_config.file_system_config.run_folder / model_subfolder
    # Copy all code from project and InnerEye into the model folder, and copy over checkpoints.
    # This increases the size of the data stored for the run. The other option would be to store all checkpoints
    # right in the final model folder - however, then that would also contain any other checkpoints that the model
    # produced or downloaded for recovery, bloating the final model file.
    self.copy_child_paths_to_folder(final_model_folder, checkpoint_paths)
    logging.info("Registering the model on the workspace.")
    if is_offline_run:
        model_description = model_description + f"\nModel built by {self.azure_config.build_user} outside AzureML"
        model = Model.register(workspace=workspace,
                               model_name=self.model_config.model_name,
                               model_path=str(final_model_folder),
                               description=model_description)
    else:
        # This is the path under which AzureML will know the files: Either "final_model" or "final_ensemble_model"
        artifacts_path = model_subfolder
        # If the present run is a child run of a Hyperdrive parent run, and we are building an ensemble model,
        # register the model on the parent run.
        if PARENT_RUN_CONTEXT and model_proc == ModelProcessing.ENSEMBLE_CREATION:
            run_to_register_on = PARENT_RUN_CONTEXT
            logging.info(f"Registering the model on the parent run {run_to_register_on.id}")
        else:
            run_to_register_on = RUN_CONTEXT
            logging.info(f"Registering the model on the current run {run_to_register_on.id}")
        logging.info(f"Uploading files in {final_model_folder} with prefix '{artifacts_path}'")
        final_model_folder_relative = final_model_folder.relative_to(Path.cwd())
        run_to_register_on.upload_folder(name=artifacts_path, path=str(final_model_folder_relative))
        # When registering the model on the run, we need to provide a relative path inside of the run's output
        # folder in `model_path`
        model = run_to_register_on.register_model(model_name=self.model_config.model_name,
                                                  model_path=artifacts_path,
                                                  tags=RUN_CONTEXT.get_tags(),
                                                  description=model_description)
    deployment_result = None
    logging.info(f"Registered {model_proc.value} model: {model.name}, with Id: {model.id}")
    # update the run's tags with the registered model information
    if not is_offline_run:
        update_run_tags(RUN_CONTEXT, {MODEL_ID_KEY_NAME: model.id})
    # create a version of the model for deployment if the hook is provided
    if self.model_deployment_hook is not None:
        assert isinstance(self.model_config, SegmentationModelBase)
        deployment_result = self.model_deployment_hook(self.model_config, self.azure_config, model, model_proc)
    return model, deployment_result
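# A minimal sketch of a model_deployment_hook compatible with the call above: it receives the model config,
# the Azure config, the registered AzureML Model and the ModelProcessing value, and may return any deployment
# result. The hook name, its body, and the AzureConfig type annotation are illustrative assumptions.
def example_deployment_hook(model_config: SegmentationModelBase,
                            azure_config: AzureConfig,
                            model: Model,
                            model_proc: ModelProcessing) -> Any:
    logging.info(f"Deploying {model_proc.value} model {model.name} (id {model.id})")
    return {"deployed_model_id": model.id}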
def run(self) -> None:
    """
    Driver function to run a ML experiment. If an offline cross validation run is requested, then
    this function is recursively called for each cross validation split.
    """
    if self.is_offline_cross_val_parent_run():
        if self.model_config.is_segmentation_model:
            raise NotImplementedError("Offline cross validation is only supported for classification models.")
        self.spawn_offline_cross_val_classification_child_runs()
        return
    # Get the AzureML context in which the script is running
    if not self.model_config.is_offline_run and PARENT_RUN_CONTEXT is not None:
        logging.info("Setting tags from parent run.")
        self.set_run_tags_from_parent()

    self.save_build_info_for_dotnet_consumers()

    # Set data loader start method
    self.set_multiprocessing_start_method()

    # configure recovery container if provided
    checkpoint_handler = CheckpointHandler(model_config=self.model_config,
                                           azure_config=self.azure_config,
                                           project_root=self.project_root,
                                           run_context=RUN_CONTEXT)
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    # do training and inference, unless the "only register" switch is set (which requires a run_recovery
    # to be valid).
    if not self.azure_config.register_model_only_for_epoch:
        # Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
        # and config.local_dataset was not already set.
        self.model_config.local_dataset = self.mount_or_download_dataset()
        self.model_config.write_args_file()
        logging.info(str(self.model_config))
        # Ensure that training runs are fully reproducible - setting random seeds alone is not enough!
        make_pytorch_reproducible()

        # Check for existing dataset.csv file in the correct locations. Skip that if a dataset has already been
        # loaded (typically only during tests)
        if self.model_config.dataset_data_frame is None:
            assert self.model_config.local_dataset is not None
            ml_util.validate_dataset_paths(self.model_config.local_dataset)

        # train a new model if required
        if self.azure_config.train:
            with logging_section("Model training"):
                model_train(self.model_config, checkpoint_handler)
        else:
            self.model_config.write_dataset_files()
            self.create_activation_maps()

        # log the number of epochs used for model training
        RUN_CONTEXT.log(name="Train epochs", value=self.model_config.num_epochs)

    # We specify the ModelProcessing as DEFAULT here even if the run_recovery points to an ensemble run, because
    # the current run is a single one. See the documentation of ModelProcessing for more details.
    best_epoch = self.run_inference_and_register_model(checkpoint_handler, ModelProcessing.DEFAULT)

    # Generate report
    if best_epoch:
        Runner.generate_report(self.model_config, best_epoch, ModelProcessing.DEFAULT)
    elif self.model_config.is_scalar_model and len(self.model_config.get_test_epochs()) == 1:
        # We don't register scalar models but still want to create a report if we have run inference.
        Runner.generate_report(self.model_config, self.model_config.get_test_epochs()[0], ModelProcessing.DEFAULT)
def model_train(config: ModelConfigBase, checkpoint_handler: CheckpointHandler) -> ModelTrainingResults:
    """
    The main training loop. It creates the model, dataset, optimizer_type, and criterion, then proceeds
    to train the model. If a checkpoint was specified, then it loads the checkpoint before resuming training.

    :param config: The arguments which specify all required information.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :raises TypeError: If the arguments are of the wrong type.
    :raises ValueError: When there are issues loading a previous checkpoint.
    """
    # Save the dataset files for later use in cross validation analysis
    config.write_dataset_files()

    # set the random seed for all libraries
    ml_util.set_random_seed(config.get_effective_random_seed(), "Patch visualization")
    # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
    # want training to depend on how many patients we visualized, and hence set the random seed again right after.
    with logging_section("Visualizing the effect of sampling random crops for training"):
        visualize_random_crops_for_dataset(config)
    ml_util.set_random_seed(config.get_effective_random_seed(), "Model training")

    logging.debug("Creating the PyTorch model.")

    # Create the train loader and validation loader to load images from the dataset
    data_loaders = config.create_data_loaders()

    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()
    models_and_optimizer = ModelAndInfo(config=config,
                                        model_execution_mode=ModelExecutionMode.TRAIN,
                                        checkpoint_path=checkpoint_path)

    # Create the main model
    # If continuing from a previous run at a specific epoch, then load the previous model.
    model_loaded = models_and_optimizer.try_create_model_and_load_from_checkpoint()
    if not model_loaded:
        raise ValueError("There was no checkpoint file available for the model for given start_epoch {}"
                         .format(config.start_epoch))

    # Print out a detailed breakdown of layers, memory consumption and time.
    generate_and_print_model_summary(config, models_and_optimizer.model)

    # Move model to GPU and adjust for multiple GPUs
    models_and_optimizer.adjust_model_for_gpus()

    # Create the mean teacher model and move to GPU
    if config.compute_mean_teacher_model:
        mean_teacher_model_loaded = models_and_optimizer.try_create_mean_teacher_model_load_from_checkpoint_and_adjust()
        if not mean_teacher_model_loaded:
            raise ValueError("There was no checkpoint file available for the mean teacher model "
                             f"for given start_epoch {config.start_epoch}")

    # Create optimizer
    models_and_optimizer.create_optimizer()
    if checkpoint_handler.should_load_optimizer_checkpoint():
        optimizer_loaded = models_and_optimizer.try_load_checkpoint_for_optimizer()
        if not optimizer_loaded:
            raise ValueError(f"There was no checkpoint file available for the optimizer for given start_epoch "
                             f"{config.start_epoch}")

    # Create checkpoint directory for this run if it doesn't already exist
    logging.info(f"Models are saved at {config.checkpoint_folder}")
    if not config.checkpoint_folder.is_dir():
        config.checkpoint_folder.mkdir()

    # Create the SummaryWriters for Tensorboard
    writers = create_summary_writers(config)
    config.create_dataframe_loggers()

    # Create LR scheduler
    l_rate_scheduler = SchedulerWithWarmUp(config, models_and_optimizer.optimizer)

    # Training loop
    logging.info("Starting training")
    train_results_per_epoch, val_results_per_epoch, learning_rates_per_epoch = [], [], []

    resource_monitor = None
    if config.monitoring_interval_seconds > 0:
        # initialize and start GPU monitoring
        diagnostics_events = config.logs_folder / "diagnostics"
        logging.info(f"Starting resource monitor, outputting to {diagnostics_events}")
        resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
                                           tensorboard_folder=diagnostics_events)
        resource_monitor.start()

    gradient_scaler = GradScaler() if config.use_gpu and config.use_mixed_precision else None
    optimal_temperature_scale_values = []
    for epoch in config.get_train_epochs():
        logging.info("Starting epoch {}".format(epoch))
        save_epoch = config.should_save_epoch(epoch) and models_and_optimizer.optimizer is not None

        # store the learning rates used for each epoch
        epoch_lrs = l_rate_scheduler.get_last_lr()
        learning_rates_per_epoch.append(epoch_lrs)

        train_val_params: TrainValidateParameters = \
            TrainValidateParameters(data_loader=data_loaders[ModelExecutionMode.TRAIN],
                                    model=models_and_optimizer.model,
                                    mean_teacher_model=models_and_optimizer.mean_teacher_model,
                                    epoch=epoch,
                                    optimizer=models_and_optimizer.optimizer,
                                    gradient_scaler=gradient_scaler,
                                    epoch_learning_rate=epoch_lrs,
                                    summary_writers=writers,
                                    dataframe_loggers=config.metrics_data_frame_loggers,
                                    in_training_mode=True)
        training_steps = create_model_training_steps(config, train_val_params)
        train_epoch_results = train_or_validate_epoch(training_steps)
        train_results_per_epoch.append(train_epoch_results.metrics)

        metrics.validate_and_store_model_parameters(writers.train, epoch, models_and_optimizer.model)
        # Run without adjusting weights on the validation set
        train_val_params.in_training_mode = False
        train_val_params.data_loader = data_loaders[ModelExecutionMode.VAL]
        # if temperature scaling is enabled then do not save validation metrics for the checkpoint epochs
        # as these will be re-computed after performing temperature scaling on the validation set.
        if isinstance(config, SequenceModelBase):
            train_val_params.save_metrics = not (save_epoch and config.temperature_scaling_config)

        training_steps = create_model_training_steps(config, train_val_params)
        val_epoch_results = train_or_validate_epoch(training_steps)
        val_results_per_epoch.append(val_epoch_results.metrics)

        if config.is_segmentation_model:
            metrics.store_epoch_stats_for_segmentation(config.outputs_folder,
                                                       epoch,
                                                       epoch_lrs,
                                                       train_epoch_results.metrics,
                                                       val_epoch_results.metrics)

        if save_epoch:
            # perform temperature scaling if required
            if isinstance(config, SequenceModelBase) and config.temperature_scaling_config:
                optimal_temperature, scaled_val_results = \
                    temperature_scaling_steps(config, train_val_params, val_epoch_results)
                optimal_temperature_scale_values.append(optimal_temperature)
                # overwrite the metrics for the epoch with the metrics from the temperature scaled model
                val_results_per_epoch[-1] = scaled_val_results.metrics

            models_and_optimizer.save_checkpoint(epoch)

        # Updating the learning rate should happen at the end of the training loop, so that the
        # initial learning rate will be used for the very first epoch.
        l_rate_scheduler.step()

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=train_results_per_epoch,
        val_results_per_epoch=val_results_per_epoch,
        learning_rates_per_epoch=learning_rates_per_epoch,
        optimal_temperature_scale_values_per_checkpoint_epoch=optimal_temperature_scale_values
    )

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(config.visualization_folder))

    writers.close_all()
    config.metrics_data_frame_loggers.close_all()
    if resource_monitor:
        # stop the resource monitoring process
        logging.info("Shutting down the resource monitor process. Aggregate resource utilization:")
        for name, value in resource_monitor.read_aggregate_metrics():
            logging.info(f"{name}: {value}")
            if not is_offline_run_context(RUN_CONTEXT):
                RUN_CONTEXT.log(name, value)
        resource_monitor.kill()

    return model_training_results
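# Tiny illustration of why l_rate_scheduler.step() is called at the *end* of each epoch: the scheduler's
# initial learning rate is then used for the very first epoch. A plain PyTorch StepLR stands in for
# SchedulerWithWarmUp; the learning rate values are illustrative.
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
for epoch in range(3):
    lr_used = scheduler.get_last_lr()[0]  # epoch 0: 0.1, epoch 1: 0.05, epoch 2: 0.025
    # ... train one epoch at lr_used ...
    scheduler.step()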
def register_segmentation_model(self,
                                checkpoint_paths: List[Path],
                                model_description: str,
                                model_proc: ModelProcessing) -> Tuple[Model, Any]:
    """
    Registers a new model in the workspace's model registry to be deployed further, and creates a model zip
    for portal deployment (if required).

    :param model_description: A string description that is added to the deployed model. It would usually contain
        the test set performance and information at which epoch the result was achieved.
    :param checkpoint_paths: Checkpoint paths to use to upload model checkpoints to AML.
    :param model_proc: whether it's a single or ensemble model.
    :returns: Tuple element 1: AML model object, or None if no model could be registered.
        Tuple element 2: The result of running the model_deployment_hook, or None if no hook was supplied.
    """
    # The files for the final model can't live in the outputs folder. If they do: when registering the model,
    # the files may not yet be uploaded by hosttools, and that may (or may not) cause errors. Hence, place the
    # folder for the final models outside of "outputs", and upload manually.
    model_subfolder = FINAL_MODEL_FOLDER if model_proc == ModelProcessing.DEFAULT else FINAL_ENSEMBLE_MODEL_FOLDER
    # This is the path under which AzureML will know the files: Either "final_model" or "final_ensemble_model"
    artifacts_path = model_subfolder
    final_model_folder = self.model_config.file_system_config.run_folder / model_subfolder
    # Copy all code from project and InnerEye into the model folder, and copy over checkpoints.
    # This increases the size of the data stored for the run. The other option would be to store all checkpoints
    # right in the final model folder - however, then that would also contain any other checkpoints that the model
    # produced or downloaded for recovery, bloating the final model file.
    self.copy_child_paths_to_folder(final_model_folder, checkpoint_paths)
    # If the present run is a child run of a Hyperdrive parent run, and we are building an ensemble model,
    # register the model on the parent run.
    if PARENT_RUN_CONTEXT and model_proc == ModelProcessing.ENSEMBLE_CREATION:
        run_to_register_on = PARENT_RUN_CONTEXT
        logging.info(f"Registering the model on the parent run {run_to_register_on.id}")
    else:
        run_to_register_on = RUN_CONTEXT
        logging.info(f"Registering the model on the current run {run_to_register_on.id}")
    logging.info(f"Uploading files in {final_model_folder} with prefix '{artifacts_path}'")
    final_model_folder_relative = final_model_folder.relative_to(Path.cwd())
    run_to_register_on.upload_folder(name=artifacts_path, path=str(final_model_folder_relative))
    # When registering the model on the run, we need to provide a relative path inside of the run's output
    # folder in `model_path`
    model = run_to_register_on.register_model(
        model_name=self.model_config.model_name,
        model_path=artifacts_path,
        tags=RUN_CONTEXT.get_tags(),
        description=model_description
    )
    # Add the name of the Python environment as a model tag, because we need it when running inference
    # on the model. We could add that as an immutable property, but with tags we have the option to modify
    # to a custom environment later.
    python_environment = RUN_CONTEXT.get_environment()
    assert python_environment.version == ENVIRONMENT_VERSION, \
        f"Expected all Python environments to have version '{ENVIRONMENT_VERSION}', but got: " \
        f"'{python_environment.version}'"
    model.add_tags({PYTHON_ENVIRONMENT_NAME: python_environment.name})
    # update the run's tags with the registered model information
    run_to_register_on.tag(MODEL_ID_KEY_NAME, model.id)
    deployment_result = None
    logging.info(f"Registered {model_proc.value} model: {model.name}, with Id: {model.id}")
    # create a version of the model for deployment if the hook is provided
    if self.model_deployment_hook is not None:
        assert isinstance(self.model_config, SegmentationModelBase)
        deployment_result = self.model_deployment_hook(self.model_config, self.azure_config, model, model_proc)
    return model, deployment_result
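# Sketch of how the PYTHON_ENVIRONMENT_NAME tag added above could be read back at inference time, using the
# standard AzureML SDK. The way the workspace is obtained and the model name are placeholders, not project code.
from azureml.core import Environment, Model

workspace = azure_config.get_workspace()  # hypothetical: any way of obtaining the AzureML Workspace works
registered = Model(workspace, name="my_segmentation_model")
env_name = registered.tags.get(PYTHON_ENVIRONMENT_NAME)
if env_name:
    environment = Environment.get(workspace, name=env_name, version=ENVIRONMENT_VERSION)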
def run(self) -> None:
    """
    Driver function to run a ML experiment. If an offline cross validation run is requested, then
    this function is recursively called for each cross validation split.
    """
    if self.is_offline_cross_val_parent_run():
        if self.model_config.is_segmentation_model:
            raise NotImplementedError("Offline cross validation is only supported for classification models.")
        self.spawn_offline_cross_val_classification_child_runs()
        return
    # Get the AzureML context in which the script is running
    if not self.model_config.is_offline_run and PARENT_RUN_CONTEXT is not None:
        logging.info("Setting tags from parent run.")
        self.set_run_tags_from_parent()

    self.save_build_info_for_dotnet_consumers()

    # Set data loader start method
    self.set_multiprocessing_start_method()

    # configure recovery container if provided
    checkpoint_handler = CheckpointHandler(model_config=self.model_config,
                                           azure_config=self.azure_config,
                                           project_root=self.project_root,
                                           run_context=RUN_CONTEXT)
    checkpoint_handler.download_recovery_checkpoints_or_weights()
    # do training and inference, unless the "only register" switch is set (which requires a run_recovery
    # to be valid).
    if not self.azure_config.only_register_model:
        # Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
        # and config.local_dataset was not already set.
        self.model_config.local_dataset = self.mount_or_download_dataset()
        # Check for existing dataset.csv file in the correct locations. Skip that if a dataset has already been
        # loaded (typically only during tests)
        if self.model_config.dataset_data_frame is None:
            assert self.model_config.local_dataset is not None
            ml_util.validate_dataset_paths(self.model_config.local_dataset, self.model_config.dataset_csv)

        # train a new model if required
        if self.azure_config.train:
            with logging_section("Model training"):
                model_train(self.model_config, checkpoint_handler, num_nodes=self.azure_config.num_nodes)
        else:
            self.model_config.write_dataset_files()
            self.create_activation_maps()

        # log the number of epochs used for model training
        RUN_CONTEXT.log(name="Train epochs", value=self.model_config.num_epochs)

    # We specify the ModelProcessing as DEFAULT here even if the run_recovery points to an ensemble run, because
    # the current run is a single one. See the documentation of ModelProcessing for more details.
    self.run_inference_and_register_model(checkpoint_handler, ModelProcessing.DEFAULT)

    if self.model_config.generate_report:
        self.generate_report(ModelProcessing.DEFAULT)

    # If this is a cross validation run, and the present run is child run 0, then wait for the sibling runs,
    # build the ensemble model, and write a report for that.
    if self.model_config.number_of_cross_validation_splits > 0:
        if self.model_config.should_wait_for_other_cross_val_child_runs():
            self.wait_for_runs_to_finish()
            self.create_ensemble_model()
def model_train(config: ModelConfigBase, run_recovery: Optional[RunRecovery] = None) -> ModelTrainingResults:
    """
    The main training loop. It creates the model, dataset, optimizer_type, and criterion, then proceeds
    to train the model. If a checkpoint was specified, then it loads the checkpoint before resuming training.

    :param config: The arguments which specify all required information.
    :param run_recovery: Recovery information to restart training from an existing run.
    :raises TypeError: If the arguments are of the wrong type.
    :raises ValueError: When there are issues loading a previous checkpoint.
    """
    # Save the dataset files for later use in cross validation analysis
    config.write_dataset_files()

    # set the random seed for all libraries
    ml_util.set_random_seed(config.get_effective_random_seed(), "Model Training")

    logging.debug("Creating the PyTorch model.")

    # Create the train loader and validation loader to load images from the dataset
    data_loaders = config.create_data_loaders()

    # Create the models (the main model and, if requested, the mean teacher model), together with their
    # recovery checkpoint paths.
    checkpoint_path = get_recovery_path_train(run_recovery=run_recovery,
                                              is_mean_teacher=False,
                                              epoch=config.start_epoch)
    models_and_optimizers = [ModelAndInfo(config=config,
                                          model_execution_mode=ModelExecutionMode.TRAIN,
                                          is_mean_teacher=False,
                                          checkpoint_path=checkpoint_path
                                          if config.should_load_checkpoint_for_training() else None)]

    if config.compute_mean_teacher_model:
        checkpoint_path = get_recovery_path_train(run_recovery=run_recovery,
                                                  is_mean_teacher=True,
                                                  epoch=config.start_epoch)
        models_and_optimizers.append(ModelAndInfo(config=config,
                                                  model_execution_mode=ModelExecutionMode.TRAIN,
                                                  is_mean_teacher=True,
                                                  checkpoint_path=checkpoint_path
                                                  if config.should_load_checkpoint_for_training() else None))

    # Create the models.
    # If continuing from a previous run at a specific epoch, then load the previous model.
    for model_and_info in models_and_optimizers:
        model_loaded = model_and_info.try_create_model_and_load_from_checkpoint()
        if not model_loaded:
            raise ValueError("There was no checkpoint file available for the model for given start_epoch {}"
                             .format(config.start_epoch))

    # Print out a detailed breakdown of layers, memory consumption and time.
    generate_and_print_model_summary(config, models_and_optimizers[0].model)

    # Move model to GPU and adjust for multiple GPUs
    models_and_optimizers[0].adjust_model_for_gpus()
    if len(models_and_optimizers) > 1:
        models_and_optimizers[1].create_summary_and_adjust_model_for_gpus()

    # Create optimizer
    optimizer_loaded = models_and_optimizers[0].try_create_optimizer_and_load_from_checkpoint()
    if not optimizer_loaded:
        raise ValueError("There was no checkpoint file available for the optimizer for given start_epoch {}"
                         .format(config.start_epoch))

    # Create checkpoint directory for this run if it doesn't already exist
    logging.info("Models are saved at {}".format(config.checkpoint_folder))
    if not os.path.isdir(config.checkpoint_folder):
        os.makedirs(config.checkpoint_folder)

    # Create the SummaryWriters for Tensorboard
    writers = create_summary_writers(config)
    config.create_dataframe_loggers()

    model = models_and_optimizers[0].model
    optimizer = models_and_optimizers[0].optimizer
    mean_teacher_model = models_and_optimizers[1].model if len(models_and_optimizers) > 1 else None

    # Create LR scheduler
    l_rate_scheduler = SchedulerWithWarmUp(config, optimizer)

    # Training loop
    logging.info("Starting training")
    train_results_per_epoch, val_results_per_epoch, learning_rates_per_epoch = [], [], []

    resource_monitor = None
    if config.monitoring_interval_seconds > 0:
        # initialize and start GPU monitoring
        resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
                                           tb_log_file_path=str(config.logs_folder / "diagnostics"))
        resource_monitor.start()

    gradient_scaler = GradScaler() if config.use_gpu and config.use_mixed_precision else None
    optimal_temperature_scale_values = []
    for epoch in config.get_train_epochs():
        logging.info("Starting epoch {}".format(epoch))
        save_epoch = config.should_save_epoch(epoch) and optimizer is not None

        # store the learning rates used for each epoch
        epoch_lrs = l_rate_scheduler.get_last_lr()
        learning_rates_per_epoch.append(epoch_lrs)

        train_val_params: TrainValidateParameters = \
            TrainValidateParameters(data_loader=data_loaders[ModelExecutionMode.TRAIN],
                                    model=model,
                                    mean_teacher_model=mean_teacher_model,
                                    epoch=epoch,
                                    optimizer=optimizer,
                                    gradient_scaler=gradient_scaler,
                                    epoch_learning_rate=epoch_lrs,
                                    summary_writers=writers,
                                    dataframe_loggers=config.metrics_data_frame_loggers,
                                    in_training_mode=True)
        training_steps = create_model_training_steps(config, train_val_params)
        train_epoch_results = train_or_validate_epoch(training_steps)
        train_results_per_epoch.append(train_epoch_results.metrics)

        metrics.validate_and_store_model_parameters(writers.train, epoch, model)
        # Run without adjusting weights on the validation set
        train_val_params.in_training_mode = False
        train_val_params.data_loader = data_loaders[ModelExecutionMode.VAL]
        # if temperature scaling is enabled then do not save validation metrics for the checkpoint epochs
        # as these will be re-computed after performing temperature scaling on the validation set.
        if isinstance(config, SequenceModelBase):
            train_val_params.save_metrics = not (save_epoch and config.temperature_scaling_config)

        training_steps = create_model_training_steps(config, train_val_params)
        val_epoch_results = train_or_validate_epoch(training_steps)
        val_results_per_epoch.append(val_epoch_results.metrics)

        if config.is_segmentation_model:
            metrics.store_epoch_stats_for_segmentation(config.outputs_folder,
                                                       epoch,
                                                       epoch_lrs,
                                                       train_epoch_results.metrics,
                                                       val_epoch_results.metrics)

        if save_epoch:
            # perform temperature scaling if required
            if isinstance(config, SequenceModelBase) and config.temperature_scaling_config:
                optimal_temperature, scaled_val_results = \
                    temperature_scaling_steps(config, train_val_params, val_epoch_results)
                optimal_temperature_scale_values.append(optimal_temperature)
                # overwrite the metrics for the epoch with the metrics from the temperature scaled model
                val_results_per_epoch[-1] = scaled_val_results.metrics

            assert optimizer is not None
            save_checkpoint(model, optimizer, epoch, config)
            if config.compute_mean_teacher_model:
                assert mean_teacher_model is not None
                save_checkpoint(mean_teacher_model, optimizer, epoch, config, mean_teacher_model=True)

        # Updating the learning rate should happen at the end of the training loop, so that the
        # initial learning rate will be used for the very first epoch.
        l_rate_scheduler.step()

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=train_results_per_epoch,
        val_results_per_epoch=val_results_per_epoch,
        learning_rates_per_epoch=learning_rates_per_epoch,
        optimal_temperature_scale_values_per_checkpoint_epoch=optimal_temperature_scale_values
    )

    logging.info("Finished training")

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(config.visualization_folder))

    writers.close_all()
    config.metrics_data_frame_loggers.close_all()
    if resource_monitor:
        # stop the resource monitoring process
        resource_monitor.kill()

    return model_training_results
def model_train(checkpoint_handler: CheckpointHandler,
                container: LightningContainer,
                num_nodes: int = 1) -> Tuple[Trainer, Optional[StoringLogger]]:
    """
    The main training loop. It creates the Pytorch model based on the configuration options passed in,
    creates a Pytorch Lightning trainer, and trains the model.
    If a checkpoint was specified, then it loads the checkpoint before resuming training.

    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :param num_nodes: The number of nodes to use in distributed training.
    :param container: A container object that holds the training data in PyTorch Lightning format
        and the model to train.
    :return: A tuple of [Trainer, StoringLogger]. Trainer is the Lightning Trainer object that was used for fitting
        the model. The StoringLogger object is returned when training an InnerEye built-in model; it is None when
        fitting other models.
    """
    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()
    lightning_model = container.model

    resource_monitor: Optional[ResourceMonitor] = None
    # Execute some bookkeeping tasks only once if running distributed:
    if is_global_rank_zero():
        logging.info(f"Model checkpoints are saved at {container.checkpoint_folder}")
        write_args_file(container.config if isinstance(container, InnerEyeContainer) else container,
                        outputs_folder=container.outputs_folder)
        if container.monitoring_interval_seconds > 0:
            resource_monitor = start_resource_monitor(container)

    # Run all of the container-related operations consistently with changed outputs folder, even ones that
    # should not rely on the current working directory, like get_data_module.
    with change_working_directory(container.outputs_folder):
        data_module = container.get_data_module()
        if is_global_rank_zero():
            container.before_training_on_global_rank_zero()
        if is_local_rank_zero():
            container.before_training_on_local_rank_zero()
        container.before_training_on_all_ranks()

    # Create the trainer object. Backup the environment variables before doing that, in case we need to run a second
    # training in the unit tests.
    old_environ = dict(os.environ)
    # Set random seeds just before training. For segmentation models, we have
    # something that changes the random seed in the before_training_on_rank_zero hook.
    seed_everything(container.get_effective_random_seed())
    trainer, storing_logger = create_lightning_trainer(container,
                                                       checkpoint_path,
                                                       num_nodes=num_nodes,
                                                       **container.get_trainer_arguments())
    rank_info = ", ".join(f"{env}: {os.getenv(env)}"
                          for env in [ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK])
    logging.info(f"Environment variables: {rank_info}. trainer.global_rank: {trainer.global_rank}")

    # InnerEye models use this logger for diagnostics
    if isinstance(lightning_model, InnerEyeLightning):
        if storing_logger is None:
            raise ValueError("InnerEye models require the storing_logger for diagnostics")
        lightning_model.storing_logger = storing_logger

    logging.info("Starting training")
    # When training models that are not built-in InnerEye models, we have no guarantee that they write
    # files to the right folder. Best guess is to change the current working directory to where files should go.
    with change_working_directory(container.outputs_folder):
        trainer.fit(lightning_model, datamodule=data_module)
        trainer.logger.close()  # type: ignore
    world_size = getattr(trainer, "world_size", 0)
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
    # Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
    if is_azureml_run and world_size > 1 and isinstance(lightning_model, ScalarLightning):
        upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, container.outputs_folder)
        upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, container.outputs_folder)

    # DDP will start multiple instances of the runner, one for each GPU. Those should terminate here after training.
    # We can now use the global_rank of the Lightning model, rather than environment variables, because DDP has set
    # all necessary properties.
    if lightning_model.global_rank != 0:
        logging.info(f"Terminating training thread with rank {lightning_model.global_rank}.")
        sys.exit()

    logging.info("Choosing the best checkpoint and removing redundant files.")
    create_best_checkpoint(container.checkpoint_folder)
    # Lightning modifies a ton of environment variables. If we first run training and then the test suite,
    # those environment variables will mislead the training runs in the test suite, and make them crash.
    # Hence, restore the original environment after training.
    os.environ.clear()
    os.environ.update(old_environ)

    if world_size and isinstance(lightning_model, ScalarLightning):
        if is_azureml_run and world_size > 1:
            # In a DDP run on the local box, all ranks will write to local disk, hence no download needed.
            # In a multi-node DDP, each rank would upload to AzureML, and rank 0 will now download all results and
            # concatenate
            for rank in range(world_size):
                for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
                    file = mode.value + "/" + get_subject_output_file_per_rank(rank)
                    RUN_CONTEXT.download_file(name=TEMP_PREFIX + file,
                                              output_file_path=container.outputs_folder / file)
        # Concatenate all temporary files per execution mode
        aggregate_and_create_subject_metrics_file(container.outputs_folder)

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it in the Azure UI.
    if isinstance(container, InnerEyeContainer):
        if container.config.max_batch_grad_cam > 0 and container.visualization_folder.exists():
            RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(container.visualization_folder))

    if resource_monitor:
        logging.info("Shutting down the resource monitor process.")
        if is_azureml_run:
            for gpu_name, metrics_per_gpu in resource_monitor.read_aggregate_metrics().items():
                # Log as a table, with GPU being the first column
                RUN_CONTEXT.log_row("GPU utilization", GPU=gpu_name, **metrics_per_gpu)
        resource_monitor.kill()

    return trainer, storing_logger
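# aggregate_and_create_subject_metrics_file is called above but not shown here. Based on the inline
# concatenation in the older variant of model_train further down, a sketch of the expected behaviour could look
# like this (assuming the same SUBJECT_OUTPUT_PER_RANK_PREFIX / SUBJECT_METRICS_FILE_NAME constants; this is
# not necessarily the project's actual implementation):
def aggregate_subject_metrics_sketch(outputs_folder: Path) -> None:
    for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
        per_rank_files = sorted((outputs_folder / mode.value).rglob(SUBJECT_OUTPUT_PER_RANK_PREFIX + "*"))
        result_file = outputs_folder / mode.value / SUBJECT_METRICS_FILE_NAME
        lines: List[str] = []
        for i, file in enumerate(per_rank_files):
            file_lines = file.read_text().splitlines()
            # Keep the CSV header only from the first per-rank file.
            lines.extend(file_lines if i == 0 else file_lines[1:])
        result_file.write_text(os.linesep.join(lines))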
def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
    if self.is_azureml_run:
        for key, value in metrics.items():
            RUN_CONTEXT.log(key, value)
def model_train(config: ModelConfigBase,
                checkpoint_handler: CheckpointHandler,
                num_nodes: int = 1) -> ModelTrainingResults:
    """
    The main training loop. It creates the Pytorch model based on the configuration options passed in,
    creates a Pytorch Lightning trainer, and trains the model.
    If a checkpoint was specified, then it loads the checkpoint before resuming training.

    :param config: The arguments which specify all required information.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :param num_nodes: The number of nodes to use in distributed training.
    """
    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()

    # This reads the dataset file, and possibly sets required pre-processing objects, like one-hot encoder
    # for categorical features, that need to be available before creating the model.
    config.read_dataset_if_needed()

    # Create the trainer object. Backup the environment variables before doing that, in case we need to run a second
    # training in the unit tests.
    old_environ = dict(os.environ)
    seed_everything(config.get_effective_random_seed())
    trainer, storing_logger = create_lightning_trainer(config, checkpoint_path, num_nodes=num_nodes)

    logging.info(f"GLOBAL_RANK: {os.getenv('GLOBAL_RANK')}, LOCAL_RANK {os.getenv('LOCAL_RANK')}. "
                 f"trainer.global_rank: {trainer.global_rank}")
    logging.debug("Creating the PyTorch model.")
    lightning_model = create_lightning_model(config)
    lightning_model.storing_logger = storing_logger

    resource_monitor = None
    # Execute some bookkeeping tasks only once if running distributed:
    if is_rank_zero():
        config.write_args_file()
        logging.info(str(config))
        # Save the dataset files for later use in cross validation analysis
        config.write_dataset_files()
        logging.info(f"Model checkpoints are saved at {config.checkpoint_folder}")

        # set the random seed for all libraries
        ml_util.set_random_seed(config.get_effective_random_seed(), "Patch visualization")
        # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
        # want training to depend on how many patients we visualized, and hence set the random seed again right after.
        with logging_section("Visualizing the effect of sampling random crops for training"):
            visualize_random_crops_for_dataset(config)

        # Print out a detailed breakdown of layers, memory consumption and time.
        generate_and_print_model_summary(config, lightning_model.model)

        if config.monitoring_interval_seconds > 0:
            # initialize and start GPU monitoring
            diagnostics_events = config.logs_folder / "diagnostics"
            logging.info(f"Starting resource monitor, outputting to {diagnostics_events}")
            resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
                                               tensorboard_folder=diagnostics_events)
            resource_monitor.start()

    # Training loop
    logging.info("Starting training")

    lightning_data = TrainingAndValidationDataLightning(config)  # type: ignore
    # When trying to store the config object in the constructor, it does not appear to get stored at all, and later
    # references to the object simply fail. Hence, have to set it explicitly here.
    lightning_data.config = config
    trainer.fit(lightning_model, datamodule=lightning_data)
    trainer.logger.close()  # type: ignore
    lightning_model.close_all_loggers()
    world_size = getattr(trainer, "world_size", 0)
    is_azureml_run = not config.is_offline_run
    # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
    # Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
    if is_azureml_run and world_size > 1 and isinstance(lightning_model, ScalarLightning):
        upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, config.outputs_folder)
        upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, config.outputs_folder)

    # DDP will start multiple instances of the runner, one for each GPU. Those should terminate here after training.
    # We can now use the global_rank of the Lightning model, rather than environment variables, because DDP has set
    # all necessary properties.
    if lightning_model.global_rank != 0:
        logging.info(f"Terminating training thread with rank {lightning_model.global_rank}.")
        sys.exit()

    logging.info("Choosing the best checkpoint and removing redundant files.")
    cleanup_checkpoint_folder(config.checkpoint_folder)
    # Lightning modifies a ton of environment variables. If we first run training and then the test suite,
    # those environment variables will mislead the training runs in the test suite, and make them crash.
    # Hence, restore the original environment after training.
    os.environ.clear()
    os.environ.update(old_environ)

    if world_size and isinstance(lightning_model, ScalarLightning):
        if is_azureml_run and world_size > 1:
            # In a DDP run on the local box, all ranks will write to local disk, hence no download needed.
            # In a multi-node DDP, each rank would upload to AzureML, and rank 0 will now download all results and
            # concatenate
            for rank in range(world_size):
                for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
                    file = mode.value + "/" + get_subject_output_file_per_rank(rank)
                    RUN_CONTEXT.download_file(name=TEMP_PREFIX + file, output_file_path=config.outputs_folder / file)
        # Concatenate all temporary files per execution mode
        for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
            temp_files = (config.outputs_folder / mode.value).rglob(SUBJECT_OUTPUT_PER_RANK_PREFIX + "*")
            result_file = config.outputs_folder / mode.value / SUBJECT_METRICS_FILE_NAME
            lines = []
            for i, file in enumerate(temp_files):
                file_lines = file.read_text().splitlines()
                # Keep the header line only from the first per-rank file; append only the data rows of the others.
                # (Accumulate all rows and write once, so that earlier files are not overwritten.)
                lines.extend(file_lines if i == 0 else file_lines[1:])
            result_file.write_text(os.linesep.join(lines))

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=TRAIN_PREFIX).values()),
        val_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=VALIDATION_PREFIX).values()),
        train_diagnostics=lightning_model.train_diagnostics,
        val_diagnostics=lightning_model.val_diagnostics,
        optimal_temperature_scale_values_per_checkpoint_epoch=[]
    )

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(config.visualization_folder))

    if resource_monitor:
        # stop the resource monitoring process
        logging.info("Shutting down the resource monitor process. Aggregate resource utilization:")
        for name, value in resource_monitor.read_aggregate_metrics():
            logging.info(f"{name}: {value}")
            if not config.is_offline_run:
                RUN_CONTEXT.log(name, value)
        resource_monitor.kill()

    return model_training_results