def model_inference_train_and_test(self,
                                   checkpoint_handler: CheckpointHandler,
                                   model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> \
        Tuple[Optional[InferenceMetrics], Optional[InferenceMetrics], Optional[InferenceMetrics]]:
    """
    Runs model inference on the test, validation and training sets, as enabled in the model config,
    and logs the resulting metrics to AzureML for segmentation models.

    :param checkpoint_handler: Passed through to `model_test` to locate the checkpoints to use.
    :param model_proc: Whether inference is run for a single model or as part of ensemble creation.
    :return: A tuple (test_metrics, val_metrics, train_metrics). An entry is None if inference for that
        data split was not enabled in the config.
    """
    train_metrics = None
    val_metrics = None
    test_metrics = None

    config = self.model_config

    def run_model_test(data_split: ModelExecutionMode) -> Optional[InferenceMetrics]:
        return model_test(config, data_split=data_split, checkpoint_handler=checkpoint_handler,
                          model_proc=model_proc)

    if config.perform_validation_and_test_set_inference:
        # perform inference on test set
        test_metrics = run_model_test(ModelExecutionMode.TEST)
        # perform inference on validation set
        val_metrics = run_model_test(ModelExecutionMode.VAL)

    if config.perform_training_set_inference:
        # perform inference on training set if required
        train_metrics = run_model_test(ModelExecutionMode.TRAIN)

    # log the metrics to AzureML experiment if possible. When doing ensemble runs, log to the Hyperdrive parent run,
    # so that we get the metrics of child run 0 and the ensemble separated.
    if config.is_segmentation_model and not is_offline_run_context(RUN_CONTEXT):
        # Compare against the enum member explicitly: `model_proc.ENSEMBLE_CREATION` would merely look up the
        # member through the instance, which is always truthy. Fall back to the current run if there is no
        # parent run to log to.
        is_ensemble = model_proc == ModelProcessing.ENSEMBLE_CREATION
        run_for_logging = PARENT_RUN_CONTEXT if (is_ensemble and PARENT_RUN_CONTEXT) else RUN_CONTEXT
        log_metrics(val_metrics=val_metrics, test_metrics=test_metrics,  # type: ignore
                    train_metrics=train_metrics, run_context=run_for_logging)  # type: ignore

    return test_metrics, val_metrics, train_metrics
def get_workspace(self) -> Workspace:
    """
    Provides the Azure Machine Learning Workspace to work with, caching it after the first lookup.

    Inside of AzureML, the workspace is taken from the experiment of the current run context. Outside of
    AzureML, the workspace is retrieved via `Workspace.get`, authenticating with the service principal.
    Once a workspace has been stored in `self._workspace`, later calls return that stored object.

    :return: Azure Machine Learning Workspace
    :raises ValueError: When running outside AzureML and `subscription_id` or `resource_group` are not set.
    """
    cached = self._workspace
    if cached:
        return cached

    context = Run.get_context()
    if not is_offline_run_context(context):
        # Running inside AzureML: the run context knows its own workspace.
        self._workspace = context.experiment.workspace
        return self._workspace

    # Running outside AzureML: the workspace has to be identified explicitly.
    if not (self.subscription_id and self.resource_group):
        raise ValueError(
            "The values for 'subscription_id' and 'resource_group' were not found. "
            "Was the Azure setup completed?")
    auth = self.get_service_principal_auth()
    self._workspace = Workspace.get(name=self.workspace_name,
                                    auth=auth,
                                    subscription_id=self.subscription_id,
                                    resource_group=self.resource_group)
    return self._workspace
def mount_or_download_dataset(self) -> Path:
    """
    Makes the dataset for the model available on the executing machine and returns its location.

    Outside of AzureML: if the model config has `local_dataset` set, that folder is used as-is (it must
    exist). Otherwise the dataset given in `azure_dataset_id` is downloaded into the datasets folder below
    the project root.
    Inside of AzureML: `azure_dataset_id` must be set, and the input dataset of the run is obtained via
    `try_to_mount_input_dataset`.

    :return: The path of the dataset on the executing machine.
    :raises ValueError: If the required dataset fields are missing, or the dataset could not be mounted.
    :raises FileNotFoundError: If `local_dataset` is set but is not an existing directory.
    """
    azure_dataset_id = self.model_config.azure_dataset_id

    if not is_offline_run_context(RUN_CONTEXT):
        # Inside of AzureML, datasets can be either mounted or downloaded.
        if not azure_dataset_id:
            raise ValueError("The model must contain azure_dataset_id for running on AML")
        dataset_path = try_to_mount_input_dataset(RUN_CONTEXT)
        if dataset_path:
            return dataset_path
        raise ValueError("Unable to mount or download input dataset.")

    # The present run is outside of AzureML: prefer local_dataset, otherwise download from Azure.
    local_dataset = self.model_config.local_dataset
    if not (azure_dataset_id or local_dataset is not None):
        raise ValueError("The model must contain either local_dataset or azure_dataset_id.")

    if not local_dataset:
        datasets_root = self.project_root / fixed_paths.DATASETS_DIR_NAME
        return download_dataset(azure_dataset_id=azure_dataset_id,
                                target_folder=datasets_root,
                                azure_config=self.azure_config)

    expected_dir = Path(local_dataset)
    if expected_dir.is_dir():
        logging.info(f"Model training will use the local dataset provided in {expected_dir}")
        return expected_dir
    raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
def download_or_get_local_file(self,
                               run: Optional[Run],
                               blob_to_download: PathOrString,
                               destination: Path,
                               local_src_subdir: Optional[Path] = None) -> Optional[Path]:
    """
    Downloads a file from the results folder of an AzureML run, or copies it from a local results folder.
    Returns the path to the downloaded file if it exists, or None if the file was not found.
    If blob_to_download contains folders, the same folder structure will be created inside the destination
    folder. For example, downloading "foo.txt" to "/c/temp" will create "/c/temp/foo.txt". Downloading
    "foo/bar.txt" to "/c/temp" will create "/c/temp/foo/bar.txt"

    :param run: The AzureML run to download from. If None, the file is copied from `self.local_run_results`
        (which must then be set).
    :param blob_to_download: path of data to download within the run
    :param destination: directory to write to
    :param local_src_subdir: if not None, then if we copy from a local results folder, that folder is
        self.outputs_directory/local_src_subdir/blob_to_download instead of
        self.outputs_directory/blob_to_download
    :return: The path to the downloaded file, or None if the file was not found.
    """
    blob_path = Path(blob_to_download)
    blob_parent = blob_path.parent
    if blob_parent != Path("."):
        destination = destination / blob_parent
    downloaded_file = destination / blob_path.name
    # If we've already downloaded the data, leave it as it is
    if downloaded_file.exists():
        logging.info(f"Download of '{blob_path}' to '{downloaded_file}': not needed, already exists")
        return downloaded_file
    logging.info(f"Download of '{blob_path}' to '{downloaded_file}': proceeding")
    # exist_ok avoids failing when another process creates the folder between a check and the mkdir call.
    destination.mkdir(parents=True, exist_ok=True)

    # If the provided run is the current run (or there is no run at all), then there is nothing to download.
    # Just copy the provided path in the outputs directory to the destination.
    use_local_copy = run is None \
        or Run.get_context().id == run.id \
        or is_parent_run(run) \
        or is_offline_run_context(run)
    if not use_local_copy:
        try:
            return download_outputs_from_run(
                blobs_path=blob_path,
                destination=destination,
                run=run,
                is_file=True
            )
        except Exception as ex:
            # Deliberately best-effort: any download failure is reported as "file not found".
            logging.warning(f"File {blob_to_download} not found in output of run {run.id}: {ex}")
            return None

    if run is None:
        assert self.local_run_results is not None, "Local run results must be set in unit testing"
        local_src = Path(self.local_run_results)
        if self.local_run_result_split_suffix:
            local_src = local_src / self.local_run_result_split_suffix
    else:
        local_src = Path(self.outputs_directory)
    if local_src_subdir is not None:
        local_src = local_src / local_src_subdir
    local_src = local_src / blob_path
    if local_src.exists():
        logging.info(f"Copying files from {local_src} to {destination}")
        return Path(shutil.copy(local_src, destination))
    return None
def log_image(self, name: str, path: str) -> None:
    """
    Logs a PNG image stored in `path` to Azure and Tensorboard.

    :param name: The name under which the image is logged.
    :param path: The path of the image file to log.
    """
    if not is_offline_run_context(RUN_CONTEXT):
        RUN_CONTEXT.log_image(name=name, path=path)
    writer = self.tensorboard_logger
    # Open the file in a context manager so that the underlying file handle is released right away,
    # rather than whenever the image object gets garbage collected.
    # NOTE(review): assumes `Image` is PIL.Image, which supports the context manager protocol.
    with Image.open(path) as image_file:
        rgb_image = image_file.convert("RGB")
    # Move the last axis (the colour channels of the RGB image) to the front before handing to Tensorboard.
    channels_first = np.transpose(np.asarray(rgb_image), (2, 0, 1))
    writer.add_image(name, channels_first, self.epoch)
def compare_folder_contents(expected_folder: Path,
                            csv_relative_tolerance: float,
                            actual_folder: Optional[Path] = None,
                            run: Optional[Run] = None) -> List[str]:
    """
    Compares a set of files in a folder, against files in either the other folder or files stored in the given
    AzureML run. Each file that is present in the "expected" folder must be also present in the "actual" folder
    (or the AzureML run), in the same folder structure, and must compare equal according to `compare_files`.
    For example, if there is a file "<expected>/foo/bar/contents.txt", then there must also be a file
    "<actual>/foo/bar/contents.txt"

    :param expected_folder: A folder with files that are expected to be present.
    :param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative
        discrepancy. If 0.0, do not allow any discrepancy.
    :param actual_folder: The output folder with the actually produced files.
    :param run: An AzureML run. Only used if `actual_folder` is not given. Files are downloaded from the run
        into a temporary folder, which is removed again before returning.
    :return: A list of human readable error messages, with message and file path. If no errors are found, the
        list is empty.
    :raises ValueError: If a file needs to be compared, but neither `actual_folder` nor `run` was provided.
    """
    messages = []
    if run and is_offline_run_context(run):
        logging.warning("Skipping file comparison because the given run context is an AzureML offline run.")
        return []
    # A set gives constant-time membership tests inside the loop over all expected files.
    files_in_run = set(run.get_file_names()) if run else set()
    temp_folder = Path(tempfile.mkdtemp()) if run else None
    try:
        for file in expected_folder.rglob("*"):
            # rglob also returns folders, skip those
            if file.is_dir():
                continue
            # All files stored in AzureML runs use Linux-style path
            file_relative = file.relative_to(expected_folder).as_posix()
            if actual_folder:
                actual_file = actual_folder / file_relative
            elif temp_folder is not None and run is not None:
                actual_file = temp_folder / file_relative
                if file_relative in files_in_run:
                    run.download_file(name=str(file_relative), output_file_path=str(actual_file))
            else:
                raise ValueError("One of the two arguments run, actual_folder must be provided.")
            message = compare_files(expected=file,
                                    actual=actual_file,
                                    csv_relative_tolerance=csv_relative_tolerance) \
                if actual_file.exists() else MISSING_FILE
            if message:
                messages.append(f"{message}: {file_relative}")
            logging.info(f"File {file_relative}: {message or 'OK'}")
    finally:
        # Clean up the downloaded files even if a download or a comparison raised an exception.
        if temp_folder:
            shutil.rmtree(temp_folder)
    return messages
def score_image(args: ScorePipelineConfig) -> Path:
    """
    Perform model inference on a single image (given as one or more image files). By doing the following:
    1) Locate the input files in args.data_folder. If args.use_dicom is set, the single input zip file of
       DICOM data is first converted to a Nifti file inside of the model folder.
    2) Instantiate an inference pipeline based on the model_inference.json in the model folder.
    3) Store the segmentation file in the model folder (converted to a zipped DICOM-RT file if
       args.use_dicom is set).
    4) Upload the result to the AzureML run, unless running outside of AzureML.

    :param args: The scoring configuration: input files, model folder, output names, GPU and DICOM flags.
    :return: The path of the result file that was written.
    :raises ValueError: If args.use_dicom is set and args.image_files does not contain exactly one file.
    """
    logging.getLogger().setLevel(logging.INFO)
    score_py_folder = Path(__file__).parent
    model_folder = Path(args.model_folder or str(score_py_folder))

    run_context = Run.get_context()
    logging.info(f"Run context={run_context.id}")

    if args.use_dicom:
        # Only a single zip file is supported. Checking for "!= 1" also rejects an empty list, which would
        # otherwise fail with an IndexError when reading the first entry below.
        if len(args.image_files) != 1:
            raise ValueError("Supply exactly one zip file in args.images.")
        input_zip_file = check_input_file(args.data_folder, args.image_files[0])
        reference_series_folder = model_folder / "temp_extraction"
        nifti_filename = model_folder / "temp_nifti.nii.gz"
        convert_zipped_dicom_to_nifti(input_zip_file, reference_series_folder, nifti_filename)
        test_images = [nifti_filename]
    else:
        test_images = [check_input_file(args.data_folder, file) for file in args.image_files]

    images = [load_nifti_image(file) for file in test_images]

    inference_pipeline, config = init_from_model_inference_json(model_folder, args.use_gpu)
    segmentation = run_inference(images, inference_pipeline, config)

    segmentation_file_name = model_folder / args.result_image_name
    # The header of the first input image is used when writing the segmentation.
    result_dst = store_as_ubyte_nifti(segmentation, images[0].header, segmentation_file_name)

    if args.use_dicom:
        result_dst = convert_nifti_to_zipped_dicom_rt(result_dst, reference_series_folder, model_folder,
                                                      config, args.result_zip_dicom_name, args.model_id)

    if not is_offline_run_context(run_context):
        upload_file_name = args.result_zip_dicom_name if args.use_dicom else args.result_image_name
        run_context.upload_file(upload_file_name, str(result_dst))
    logging.info(f"Segmentation completed: {result_dst}")
    return result_dst
def create_filesystem(self, project_root: Path = fixed_paths.repository_root_directory()) -> None:
    """
    Builds the file system settings for the present object via `DeepLearningFileSystemConfig.create`, and
    stores them in `self.file_system_config`. The settings are derived from the model name, the `output_to`
    field, and whether the present run executes outside of AzureML.

    :param project_root: The root folder for the codebase that triggers the training run.
    """
    settings = dict(project_root=project_root,
                    model_name=self.model_name,
                    is_offline_run=is_offline_run_context(RUN_CONTEXT),
                    output_to=self.output_to)
    self.file_system_config = DeepLearningFileSystemConfig.create(**settings)
def log_to_azure(self, label: str, metric: float) -> None:
    """
    Writes a metric to the AzureML run as a key/value pair. The key is the given label, prefixed with
    `self.logging_prefix`. Nothing is logged when running outside of AzureML.

    :param label: The name of the metric, without the logging prefix.
    :param metric: The value to log.
    """
    if is_offline_run_context(RUN_CONTEXT):
        return

    metric_name = self.logging_prefix + label
    RUN_CONTEXT.log(metric_name, metric)

    # When running in a cross validation setting, log all metrics to the hyperdrive parent run too,
    # so that we can easily overlay graphs across runs.
    if not (self.log_to_parent_run and PARENT_RUN_CONTEXT):
        return
    if self.cross_validation_split_index > DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
        PARENT_RUN_CONTEXT.log(f"{metric_name}_Split{self.cross_validation_split_index}", metric)
def run(self) -> None:
    """
    Main loop of the resource monitor: periodically reads GPU and CPU utilization, and logs the values to
    Tensorboard and, when running inside AzureML, to the AzureML run. The loop only ends when the process
    is terminated from outside.

    If the monitoring interval is not larger than 0 seconds, a warning is logged and the method returns
    without monitoring anything.
    """
    if self._interval_seconds <= 0:
        logging.warning(
            "Resource monitoring requires an interval that is larger than 0 seconds, but "
            "got: {}. Exiting.".format(self._interval_seconds))
        # Actually exit, as announced: otherwise time.sleep raises for negative values, and an interval
        # of 0 makes the loop below spin without pause.
        return
    logging.info("Process ({}) started with pid: {}".format(self.name, self.pid))
    # create the TB writers and AML run context for this process
    writer = tensorboardX.SummaryWriter(self._tb_log_file_path)
    run_context = Run.get_context()
    is_offline_run = is_offline_run_context(run_context)
    current_iteration = 0

    def log_to_azure_and_tb(label: str, value: float) -> None:
        # current_iteration is read from the enclosing scope at call time, hence it reflects the
        # present loop iteration.
        writer.add_scalar(label, value, current_iteration)
        if not is_offline_run:
            run_context.log(label, value)

    gpu_available = is_gpu_available()
    while True:
        if gpu_available:
            gpus: List[GPU] = GPUtil.getGPUs()
            # statistics.mean fails on empty data, hence only log when GPUs were reported.
            if len(gpus) > 0:
                for gpu in gpus:
                    log_to_azure_and_tb('Diagnostics/GPU_{}_Load_Percent'.format(gpu.id), gpu.load * 100)
                    log_to_azure_and_tb('Diagnostics/GPU_{}_MemUtil_Percent'.format(gpu.id),
                                        gpu.memoryUtil * 100)
                # log the average GPU usage
                log_to_azure_and_tb('Diagnostics/Average_GPU_Load_Percent',
                                    statistics.mean(map(lambda x: x.load, gpus)) * 100)
                log_to_azure_and_tb('Diagnostics/Average_GPU_MemUtil_Percent',
                                    statistics.mean(map(lambda x: x.memoryUtil, gpus)) * 100)
        # log the CPU util
        log_to_azure_and_tb('Diagnostics/CPU_Util_Percent', psutil.cpu_percent(interval=None))
        # NOTE(review): index 2 of psutil.virtual_memory() is presumably the "percent" field - confirm.
        log_to_azure_and_tb('Diagnostics/CPU_MemUtil_Percent', psutil.virtual_memory()[2])
        current_iteration += 1
        # pause the thread for the requested delay
        time.sleep(self._interval_seconds)
def print_git_tags(self) -> None:
    """
    Writes all tags that describe the status of the git repository to the log, for answering the question
    "which code version was used" from a log file only. Outside of AzureML, the tags come straight from
    `get_git_tags`. Inside of AzureML, they are read from the tags of the current run.
    """
    git_tags = get_git_tags(self.azure_config)
    if is_offline_run_context(RUN_CONTEXT):
        # When running on a VM outside AzureML, we can read git information from the current repository
        tags_to_print = git_tags
    else:
        # When running in AzureML, the git repo information is not necessarily passed in, but we copy the git
        # information into run tags after submitting the job, and can read it out here.
        # Only keep those run tags whose name is also a git-related tag.
        tags_to_print = {}
        for tag_name, tag_value in RUN_CONTEXT.get_tags().items():
            if tag_name in git_tags:
                tags_to_print[tag_name] = tag_value
    logging.info("Git repository information:")
    for key, value in tags_to_print.items():
        logging.info(f" {key:20}: {value}")
def score_image(args: ScorePipelineConfig) -> Path:
    """
    Runs model inference on a single image, given as one or more image files:
    1) Check that all files in args.image_files exist in args.data_folder, and load them.
    2) Instantiate an inference pipeline based on the model_inference.json in the model folder.
    3) Store the segmentation file in the model folder.
    4) Upload the segmentation to the AzureML run, unless running outside of AzureML.

    :param args: The scoring configuration: input files, model folder, output file name, GPU flag.
    :return: The value returned by `store_as_ubyte_nifti` for the stored segmentation.
    :raises ValueError: If one of the input files does not exist in the data folder.
    """
    logging.getLogger().setLevel(logging.INFO)
    default_model_folder = str(Path(__file__).parent)
    model_folder = Path(args.model_folder or default_model_folder)

    run_context = Run.get_context()
    logging.info(f"Run context={run_context.id}")

    data_folder = args.data_folder

    def locate(file: str) -> Path:
        # Resolves an input file inside of the data folder, failing if it is not there.
        candidate = data_folder / file
        if candidate.exists():
            return candidate
        if data_folder.is_absolute():
            message = str(data_folder)
        else:
            message = f"{data_folder}, absolute: {data_folder.absolute()}"
        raise ValueError(f"File {file} does not exist in data folder {message}")

    # All files are checked for existence before the first one is loaded.
    input_paths = [locate(file) for file in args.image_files]
    images = [load_nifti_image(input_path) for input_path in input_paths]

    inference_pipeline, config = init_from_model_inference_json(model_folder, args.use_gpu)
    segmentation = run_inference(images, inference_pipeline, config)

    output_file = str(model_folder / args.result_image_name)
    result_dst = store_as_ubyte_nifti(segmentation, images[0].header, output_file)
    if not is_offline_run_context(run_context):
        run_context.upload_file(args.result_image_name, output_file)
    logging.info(f"Segmentation completed: {result_dst}")
    return result_dst
def score_image(args: ScorePipelineConfig) -> Path:
    """
    Runs model inference on a single image, given as one file per image channel:
    1) Copy the contents of args.data_root to the project root (per the original notes, this contains the
       model checkpoints and the image to infer).
    2) Instantiate an inference pipeline based on the model_inference.json in the project root.
    3) Store the segmentation file in the project root.
    4) Upload the segmentation to the AzureML run, unless running outside of AzureML.

    :param args: The scoring configuration: data root, project root, image channels, output name, GPU flag.
    :return: The path of the stored segmentation file.
    """
    logging.getLogger().setLevel(logging.INFO)
    project_root = Path(args.project_root)

    # copy the model to the current directory
    copy_tree(args.data_root, str(project_root))
    logging.info(f'Copied contents of data_root: {args.data_root} to {project_root}')

    run_context = Run.get_context()
    logging.info(f"Run context={run_context.id}")

    images = []
    for channel_file in args.test_image_channels:
        images.append(load_nifti_image(project_root / DEFAULT_DATA_FOLDER / channel_file))

    inference_pipeline, config = init_from_model_inference_json(project_root, args.use_gpu)
    segmentation = run_inference(images, inference_pipeline, config)

    output_file = str(project_root / args.result_image_name)
    first_header = images[0].header
    result_dst = store_as_ubyte_nifti(segmentation, first_header, output_file)
    if not is_offline_run_context(run_context):
        run_context.upload_file(args.result_image_name, output_file)
    logging.info(f"Segmentation completed: {result_dst}")
    return Path(result_dst)
def __init__(self) -> None:
    """
    Initializes the base class, and records in `self.is_azureml_run` whether the present run executes
    inside of AzureML (that is, RUN_CONTEXT is not an offline run context).
    """
    super().__init__()
    running_offline = is_offline_run_context(RUN_CONTEXT)
    self.is_azureml_run = not running_offline
def register_segmentation_model(
        self,
        checkpoint_paths: List[Path],
        model_description: str,
        model_proc: ModelProcessing
) -> Tuple[Optional[Model], Optional[Any]]:
    """
    Registers a new model in the AzureML model registry, and runs the model deployment hook if one is set.
    Outside of AzureML, the model is registered directly on the workspace. Inside of AzureML, the model
    files are uploaded to a run and the model is registered on that run: the Hyperdrive parent run when
    building an ensemble and a parent run exists, otherwise the current run.

    :param checkpoint_paths: Checkpoint paths to use to upload model checkpoints to AML.
    :param model_description: A string description that is added to the deployed model. It would usually
        contain the test set performance and information at which epoch the result was achieved.
    :param model_proc: whether it's a single or ensemble model.
    :returns Tuple element 1: AML model object, or None if no model could be registered.
        Tuple element 2: The result of running the model_deployment_hook, or None if no hook was supplied.
    """
    running_offline = is_offline_run_context(RUN_CONTEXT)
    workspace = None
    # Outside of AzureML, check first that the workspace can be reached at all. If not, stop here: this
    # saves time copying around files.
    if running_offline:
        try:
            workspace = self.azure_config.get_workspace()
        except Exception:
            logging.warning("Unable to retrieve AzureML workspace. Was the Azure setup completed?")
            logging.info("No model was registered in AzureML.")
            return None, None

    # The files for the final model can't live in the outputs folder. If they do: when registering the
    # model, the files may not yet be uploaded by hosttools, and that may (or not) cause errors. Hence,
    # place the folder for the final models outside of "outputs", and upload manually.
    if model_proc == ModelProcessing.DEFAULT:
        model_subfolder = FINAL_MODEL_FOLDER
    else:
        model_subfolder = FINAL_ENSEMBLE_MODEL_FOLDER
    final_model_folder = self.model_config.file_system_config.run_folder / model_subfolder

    # Copy all code from project and InnerEye into the model folder, and copy over checkpoints.
    # This increases the size of the data stored for the run. The other option would be to store all
    # checkpoints right in the final model folder - however, then that would also contain any other
    # checkpoints that the model produced or downloaded for recovery, bloating the final model file.
    self.copy_child_paths_to_folder(final_model_folder, checkpoint_paths)

    logging.info("Registering the model on the workspace.")
    if not running_offline:
        # This is the path under which AzureML will know the files: Either "final_model" or
        # "final_ensemble_model"
        artifacts_path = model_subfolder
        # If the present run is a child run of a Hyperdrive parent run, and we are building an ensemble
        # model, register the model on the parent run.
        if PARENT_RUN_CONTEXT and model_proc == ModelProcessing.ENSEMBLE_CREATION:
            run_to_register_on = PARENT_RUN_CONTEXT
            logging.info(f"Registering the model on the parent run {run_to_register_on.id}")
        else:
            run_to_register_on = RUN_CONTEXT
            logging.info(f"Registering the model on the current run {run_to_register_on.id}")
        logging.info(f"Uploading files in {final_model_folder} with prefix '{artifacts_path}'")
        relative_folder = final_model_folder.relative_to(Path.cwd())
        run_to_register_on.upload_folder(name=artifacts_path, path=str(relative_folder))
        # When registering the model on the run, we need to provide a relative path inside of the run's
        # output folder in `model_path`
        model = run_to_register_on.register_model(model_name=self.model_config.model_name,
                                                  model_path=artifacts_path,
                                                  tags=RUN_CONTEXT.get_tags(),
                                                  description=model_description)
    else:
        model_description += f"\nModel built by {self.azure_config.build_user} outside AzureML"
        model = Model.register(workspace=workspace,
                               model_name=self.model_config.model_name,
                               model_path=str(final_model_folder),
                               description=model_description)

    deployment_result = None
    logging.info(f"Registered {model_proc.value} model: {model.name}, with Id: {model.id}")

    # update the run's tags with the registered model information
    if not running_offline:
        update_run_tags(RUN_CONTEXT, {MODEL_ID_KEY_NAME: model.id})

    # create a version of the model for deployment if the hook is provided
    hook_is_set = self.model_deployment_hook is not None
    if hook_is_set:
        assert isinstance(self.model_config, SegmentationModelBase)
        deployment_result = self.model_deployment_hook(self.model_config, self.azure_config, model,
                                                       model_proc)
    return model, deployment_result
def model_train(config: ModelConfigBase, checkpoint_handler: CheckpointHandler) -> ModelTrainingResults:
    """
    The main training loop. It creates the data loaders, the model (plus the mean teacher model, if
    enabled), the optimizer and the learning rate scheduler, then trains the model for all epochs that the
    config asks for. In each epoch, one pass over the training data is followed by one pass over the
    validation data. Checkpoints are saved for the epochs selected by `config.should_save_epoch`.

    :param config: The arguments which specify all required information.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :return: The per-epoch training metrics, validation metrics and learning rates, plus the optimal
        temperature scale values found at the checkpoint epochs (only populated for sequence models with
        temperature scaling enabled).
    :raises ValueError: When the model, the mean teacher model or the optimizer could not be created or
        loaded from a checkpoint.
    """
    # Save the dataset files for later use in cross validation analysis
    config.write_dataset_files()

    # set the random seed for all libraries
    ml_util.set_random_seed(config.get_effective_random_seed(), "Patch visualization")
    # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
    # want training to depend on how many patients we visualized, and hence set the random seed again right after.
    with logging_section(
            "Visualizing the effect of sampling random crops for training"):
        visualize_random_crops_for_dataset(config)
    ml_util.set_random_seed(config.get_effective_random_seed(), "Model training")

    logging.debug("Creating the PyTorch model.")

    # Create the train loader and validation loader to load images from the dataset.
    # The result is indexed by ModelExecutionMode further down.
    data_loaders = config.create_data_loaders()

    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()

    models_and_optimizer = ModelAndInfo(
        config=config,
        model_execution_mode=ModelExecutionMode.TRAIN,
        checkpoint_path=checkpoint_path)

    # Create the main model
    # If continuing from a previous run at a specific epoch, then load the previous model.
    model_loaded = models_and_optimizer.try_create_model_and_load_from_checkpoint(
    )
    if not model_loaded:
        raise ValueError(
            "There was no checkpoint file available for the model for given start_epoch {}"
            .format(config.start_epoch))

    # Print out a detailed breakdown of layers, memory consumption and time.
    generate_and_print_model_summary(config, models_and_optimizer.model)

    # Move model to GPU and adjust for multiple GPUs
    models_and_optimizer.adjust_model_for_gpus()

    # Create the mean teacher model and move to GPU
    if config.compute_mean_teacher_model:
        mean_teacher_model_loaded = models_and_optimizer.try_create_mean_teacher_model_load_from_checkpoint_and_adjust(
        )
        if not mean_teacher_model_loaded:
            raise ValueError(
                "There was no checkpoint file available for the mean teacher model "
                f"for given start_epoch {config.start_epoch}")

    # Create optimizer, and restore its state if the checkpoint handler asks for that.
    models_and_optimizer.create_optimizer()
    if checkpoint_handler.should_load_optimizer_checkpoint():
        optimizer_loaded = models_and_optimizer.try_load_checkpoint_for_optimizer(
        )
        if not optimizer_loaded:
            raise ValueError(
                f"There was no checkpoint file available for the optimizer for given start_epoch "
                f"{config.start_epoch}")

    # Create checkpoint directory for this run if it doesn't already exist.
    # NOTE(review): mkdir() is called without parents=True, so the parent folder must already exist.
    logging.info(f"Models are saved at {config.checkpoint_folder}")
    if not config.checkpoint_folder.is_dir():
        config.checkpoint_folder.mkdir()

    # Create the SummaryWriters for Tensorboard
    writers = create_summary_writers(config)
    config.create_dataframe_loggers()

    # Create LR scheduler
    l_rate_scheduler = SchedulerWithWarmUp(config,
                                           models_and_optimizer.optimizer)

    # Training loop
    logging.info("Starting training")
    train_results_per_epoch, val_results_per_epoch, learning_rates_per_epoch = [], [], []

    # Resource monitoring is optional: it is only started for a positive monitoring interval.
    resource_monitor = None
    if config.monitoring_interval_seconds > 0:
        # initialize and start GPU monitoring
        diagnostics_events = config.logs_folder / "diagnostics"
        logging.info(
            f"Starting resource monitor, outputting to {diagnostics_events}")
        resource_monitor = ResourceMonitor(
            interval_seconds=config.monitoring_interval_seconds,
            tensorboard_folder=diagnostics_events)
        resource_monitor.start()

    # A gradient scaler is only created when training on GPU with mixed precision.
    gradient_scaler = GradScaler(
    ) if config.use_gpu and config.use_mixed_precision else None
    optimal_temperature_scale_values = []
    for epoch in config.get_train_epochs():
        logging.info("Starting epoch {}".format(epoch))
        # Checkpoints are only written if there is an optimizer to save alongside the model.
        save_epoch = config.should_save_epoch(
            epoch) and models_and_optimizer.optimizer is not None

        # store the learning rates used for each epoch
        epoch_lrs = l_rate_scheduler.get_last_lr()
        learning_rates_per_epoch.append(epoch_lrs)

        train_val_params: TrainValidateParameters = \
            TrainValidateParameters(data_loader=data_loaders[ModelExecutionMode.TRAIN],
                                    model=models_and_optimizer.model,
                                    mean_teacher_model=models_and_optimizer.mean_teacher_model,
                                    epoch=epoch,
                                    optimizer=models_and_optimizer.optimizer,
                                    gradient_scaler=gradient_scaler,
                                    epoch_learning_rate=epoch_lrs,
                                    summary_writers=writers,
                                    dataframe_loggers=config.metrics_data_frame_loggers,
                                    in_training_mode=True)
        training_steps = create_model_training_steps(config, train_val_params)
        train_epoch_results = train_or_validate_epoch(training_steps)
        train_results_per_epoch.append(train_epoch_results.metrics)

        metrics.validate_and_store_model_parameters(writers.train, epoch,
                                                    models_and_optimizer.model)
        # Run without adjusting weights on the validation set. The same parameter object is re-used,
        # only switching the mode and the data loader.
        train_val_params.in_training_mode = False
        train_val_params.data_loader = data_loaders[ModelExecutionMode.VAL]
        # if temperature scaling is enabled then do not save validation metrics for the checkpoint epochs
        # as these will be re-computed after performing temperature scaling on the validation set.
        if isinstance(config, SequenceModelBase):
            train_val_params.save_metrics = not (
                save_epoch and config.temperature_scaling_config)

        training_steps = create_model_training_steps(config, train_val_params)
        val_epoch_results = train_or_validate_epoch(training_steps)
        val_results_per_epoch.append(val_epoch_results.metrics)

        if config.is_segmentation_model:
            metrics.store_epoch_stats_for_segmentation(
                config.outputs_folder, epoch, epoch_lrs,
                train_epoch_results.metrics, val_epoch_results.metrics)

        if save_epoch:
            # perform temperature scaling if required
            if isinstance(
                    config,
                    SequenceModelBase) and config.temperature_scaling_config:
                optimal_temperature, scaled_val_results = \
                    temperature_scaling_steps(config, train_val_params, val_epoch_results)
                optimal_temperature_scale_values.append(optimal_temperature)
                # overwrite the metrics for the epoch with the metrics from the temperature scaled model
                val_results_per_epoch[-1] = scaled_val_results.metrics

            models_and_optimizer.save_checkpoint(epoch)

        # Updating the learning rate should happen at the end of the training loop, so that the
        # initial learning rate will be used for the very first epoch.
        l_rate_scheduler.step()

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=train_results_per_epoch,
        val_results_per_epoch=val_results_per_epoch,
        learning_rates_per_epoch=learning_rates_per_epoch,
        optimal_temperature_scale_values_per_checkpoint_epoch=
        optimal_temperature_scale_values)

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    # NOTE(review): this is called without checking for an offline run context - presumably the offline
    # run object tolerates upload_folder; confirm.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER,
                                  path=str(config.visualization_folder))

    writers.close_all()
    config.metrics_data_frame_loggers.close_all()
    if resource_monitor:
        # stop the resource monitoring process
        # NOTE(review): this shutdown is only reached when training completes without an exception.
        logging.info(
            "Shutting down the resource monitor process. Aggregate resource utilization:"
        )
        for name, value in resource_monitor.read_aggregate_metrics():
            logging.info(f"{name}: {value}")
            if not is_offline_run_context(RUN_CONTEXT):
                RUN_CONTEXT.log(name, value)
        resource_monitor.kill()

    return model_training_results
def create_and_submit_experiment(azure_config: AzureConfig,
                                 script_run_config: ScriptRunConfig,
                                 commandline_args: str) -> Run:
    """
    Submits the given script run configuration as a new run of an AzureML experiment. After submission,
    the run is tagged, its run recovery ID is written to a file, and information about how to monitor and
    how to recover the run is printed to the console.

    :param azure_config: azure related configurations to setup a valid workspace.
    :param script_run_config: The configuration for the script that should be run inside of AzureML.
    :param commandline_args: A string with all commandline arguments that were provided to the runner.
        These are only used to set a tag on the submitted AzureML run.
    :returns: Run object for the submitted AzureML run
    """
    banner = "=============================================================================="

    ws = azure_config.get_workspace()
    raw_experiment_name = create_experiment_name(azure_config)
    exp = Experiment(workspace=ws, name=azure_util.to_azure_friendly_string(raw_experiment_name))

    # submit a training/testing run associated with the experiment
    run: Run = exp.submit(script_run_config)

    if is_offline_run_context(run):
        # This codepath will only be executed in unit tests, when exp.submit is mocked.
        return run

    # Set metadata for the run.
    set_run_tags(run, azure_config, commandline_args=commandline_args)

    print("\n==============================================================================")
    print(f"Successfully queued new run {run.id} in experiment: {exp.name}")

    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")

    # Persist the recovery ID, replacing the file left behind by any previous submission.
    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)

    experiment_url = exp.get_portal_url()
    print("Experiment URL: {}".format(experiment_url))
    run_url = run.get_portal_url()
    print("Run URL: {}".format(run_url))
    print("If this run fails, re-start runner.py and supply these additional arguments: "
          f"--run_recovery_id={recovery_id}")
    print(f"The run recovery ID has been written to this file: {recovery_file}")
    print(banner)

    if not (azure_config.tensorboard and azure_config.azureml):
        print(f"To monitor this run locally using TensorBoard, run the script: "
              f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
    else:
        print("Starting TensorBoard now because you specified --tensorboard")
        monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config)
    print(banner)
    return run
def create_lightning_trainer(container: LightningContainer,
                             resume_from_checkpoint: Optional[Path] = None,
                             num_nodes: int = 1,
                             multiple_trainloader_mode: str = "max_size_cycle") -> \
        Tuple[Trainer, StoringLogger]:
    """
    Builds the PyTorch Lightning Trainer for the given container, including its checkpoint callbacks, progress
    bar and loggers. One of the loggers is a StoringLogger used for diagnostics in unit tests; it is handed back
    as the second element of the result.

    :param container: The container with model and data.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
    :param num_nodes: The number of nodes to use in distributed training.
    :param multiple_trainloader_mode: Forwarded unchanged to the Trainer argument of the same name.
    :return: A tuple [Trainer object, diagnostic logger]
    :raises NotImplementedError: If container.monitor_loading is set (that monitoring is currently disabled).
    """
    logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
    num_gpus = container.num_gpus_per_node()
    effective_num_gpus = num_gpus * num_nodes
    running_on_gpu = effective_num_gpus != 0

    accelerator = "gpu" if running_on_gpu else "cpu"
    devices = num_gpus if running_on_gpu else 1
    strategy = None
    if not running_on_gpu:
        message = "CPU"
    elif effective_num_gpus > 1:
        # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of
        # GPU memory).
        # The default for pl_find_unused_parameters is False. If True, the plugin prints out lengthy warnings
        # about the performance impact of find_unused_parameters.
        strategy = DDPPlugin(find_unused_parameters=container.pl_find_unused_parameters)
        message = f"{devices} GPUs per node with DDP"
    else:
        message = f"{devices} GPU"
    logging.info(f"Using {message}")

    loggers = [
        TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version=""),
        AzureMLLogger(False),
    ]
    storing_logger = StoringLogger()
    loggers.append(storing_logger)

    # Mixed (16bit) precision is only an option on GPU; on CPU always use 32bit.
    if num_gpus != 0 and container.use_mixed_precision:
        precision = 16
    else:
        precision = 32

    # These two flags drive torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark, see
    # https://pytorch.org/docs/stable/notes/randomness.html
    # Note that switching to deterministic models can have large performance downside.
    deterministic = bool(container.pl_deterministic)
    benchmark = not deterministic

    # The last checkpoint is considered the "best" checkpoint. For large segmentation models, this still appears
    # to be the best way of choosing them because validation loss on the relatively small training patches is not
    # stable enough. Going by the validation loss somehow works for the Prostate model, but not for the
    # HeadAndNeck model.
    # "last" really means "latest": such a checkpoint is written in every epoch. It is not used for recovery
    # because the job could be preempted while that file is being written, leaving an invalid file behind.
    # A separate autosave checkpoint is written for recovery instead.
    checkpoint_dir = str(container.checkpoint_folder)
    callbacks: List[Callback] = [
        ModelCheckpoint(dirpath=checkpoint_dir,
                        save_last=True,
                        save_top_k=0),
        ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                        filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
                        every_n_val_epochs=container.autosave_every_n_val_epochs,
                        save_last=False),
    ]
    if container.monitor_loading:
        # TODO antonsc: Remove after fixing the callback.
        raise NotImplementedError("Monitoring batch loading times has been temporarily disabled.")
        # callbacks.append(BatchTimeCallback())
    if num_gpus > 0 and container.monitor_gpu:
        logging.info("Adding monitoring for GPU utilization")
        callbacks.append(GPUStatsMonitor(intra_step_time=True, inter_step_time=True))

    # Containers can contribute callbacks in two ways: via a "callbacks" entry in get_trainer_arguments (the
    # legacy behaviour), or via the get_callbacks method. The legacy entry is removed from the arguments so that
    # it is not passed to the Trainer a second time.
    additional_args = container.get_trainer_arguments()
    if "callbacks" in additional_args:
        legacy_callbacks = additional_args.pop("callbacks")
        if not isinstance(legacy_callbacks, list):
            legacy_callbacks = [legacy_callbacks]
        callbacks.extend(legacy_callbacks)  # type: ignore
    callbacks.extend(container.get_callbacks())

    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
    if progress_bar_refresh_rate is None:
        progress_bar_refresh_rate = 50
        logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
                     f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
    progress_bar = (
        AzureMLProgressBar(refresh_rate=progress_bar_refresh_rate,
                           write_to_logging_info=True,
                           print_timestamp=False)
        if is_azureml_run
        else TQDMProgressBar(refresh_rate=progress_bar_refresh_rate)
    )
    callbacks.append(progress_bar)

    checkpoint_to_resume = str(resume_from_checkpoint) if resume_from_checkpoint else None
    # Read out additional model-specific args here.
    # We probably want to keep essential ones like numgpu and logging.
    trainer = Trainer(default_root_dir=str(container.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      strategy=strategy,
                      max_epochs=container.num_epochs,
                      # Both limits can be integers (number of batches) or floats (fraction of batches).
                      # We default to 1.0 (processing all batches).
                      limit_train_batches=container.pl_limit_train_batches or 1.0,
                      limit_val_batches=container.pl_limit_val_batches or 1.0,
                      num_sanity_val_steps=container.pl_num_sanity_val_steps,
                      check_val_every_n_epoch=container.pl_check_val_every_n_epoch,
                      callbacks=callbacks,
                      logger=loggers,
                      num_nodes=num_nodes,
                      devices=devices,
                      precision=precision,
                      sync_batchnorm=True,
                      detect_anomaly=container.detect_anomaly,
                      profiler=container.pl_profiler,
                      resume_from_checkpoint=checkpoint_to_resume,
                      multiple_trainloader_mode=multiple_trainloader_mode,
                      **additional_args)
    return trainer, storing_logger
def create_lightning_trainer(container: LightningContainer,
                             resume_from_checkpoint: Optional[Path] = None,
                             num_nodes: int = 1,
                             **kwargs: Dict[str, Any]) -> \
        Tuple[Trainer, Optional[StoringLogger]]:
    """
    Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
    and loggers. That includes a diagnostic logger for use in unit tests, that is also returned as the second
    return value.

    :param container: The container with model and data.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
    :param num_nodes: The number of nodes to use in distributed training.
    :param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer. A "callbacks"
        entry may hold either a single callback or a list of callbacks; those are added to the callbacks that
        are created here, rather than replacing them.
    :return: A tuple [Trainer object, diagnostic logger]. The diagnostic logger is only created when the
        container is an InnerEyeContainer, otherwise the second element is None.
    """
    # For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large
    # segmentation models, this still appears to be the best way of choosing them because validation loss on the
    # relatively small training patches is not stable enough. Going by the validation loss somehow works for the
    # Prostate model, but not for the HeadAndNeck model.
    # Hence no filename/monitor/save_top_k is set here: only save_last is used.
    best_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                               save_last=True)
    # Checkpoints for recovering interrupted training are handled by a dedicated callback that is configured
    # from the container.
    recovery_checkpoint_callback = InnerEyeRecoveryCheckpointCallback(container)

    num_gpus = container.num_gpus_per_node
    effective_num_gpus = num_gpus * num_nodes
    # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU
    # memory). For unit tests, only "ddp_spawn" works.
    if effective_num_gpus > 1:
        accelerator = "ddp"
        # find_unused_parameters is taken from the container. When it is True, the plugin prints out lengthy
        # warnings about the performance impact of find_unused_parameters.
        plugins = [InnerEyeDDPPlugin(num_nodes=num_nodes,
                                     sync_batchnorm=True,
                                     find_unused_parameters=container.pl_find_unused_parameters)]
    else:
        accelerator = None
        plugins = []
    logging.info(f"Using {num_gpus} GPUs per node with accelerator '{accelerator}'")

    tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
    loggers = [tensorboard_logger, AzureMLLogger()]
    storing_logger: Optional[StoringLogger]
    if isinstance(container, InnerEyeContainer):
        storing_logger = StoringLogger()
        loggers.append(storing_logger)
    else:
        storing_logger = None

    # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32

    # The next two flags control the settings in torch.backends.cudnn.deterministic and
    # torch.backends.cudnn.benchmark, see https://pytorch.org/docs/stable/notes/randomness.html
    # For the classification models, we observed only a small performance deterioration (increase in 10sec on
    # total training time of 22min) when switching to deterministic.
    if container.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True

    # Users can provide additional callbacks via the "callbacks" keyword argument (for custom containers, this
    # comes from get_trainer_arguments). That can be a single callback or a list of callbacks. A list must be
    # merged element by element: appending the list itself would put a list object among the callbacks.
    callbacks = [best_checkpoint_callback, recovery_checkpoint_callback]
    if "callbacks" in kwargs:
        more_callbacks = kwargs.pop("callbacks")
        if isinstance(more_callbacks, list):
            callbacks.extend(more_callbacks)  # type: ignore
        else:
            callbacks.append(more_callbacks)  # type: ignore

    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
    if progress_bar_refresh_rate is None and is_azureml_run:
        # When running in AzureML, the default progress bar clutters the output files with thousands of lines.
        progress_bar_refresh_rate = 50
        logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
                     f"To change, modify the pl_progress_bar_refresh_rate field of the container.")

    # Read out additional model-specific args here.
    # We probably want to keep essential ones like numgpu and logging.
    trainer = Trainer(default_root_dir=str(container.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      max_epochs=container.num_epochs,
                      num_sanity_val_steps=container.pl_num_sanity_val_steps,
                      callbacks=callbacks,
                      logger=loggers,
                      progress_bar_refresh_rate=progress_bar_refresh_rate,
                      num_nodes=num_nodes,
                      gpus=num_gpus,
                      precision=precision,
                      sync_batchnorm=True,
                      terminate_on_nan=container.detect_anomaly,
                      resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
                      plugins=plugins,
                      **kwargs)
    return trainer, storing_logger
def is_offline_run(self) -> bool:
    """
    Tells whether the current run is an offline run, based on the global RUN_CONTEXT.

    :return: True if the run is executing outside AzureML, or False if inside AzureML.
    """
    offline = is_offline_run_context(RUN_CONTEXT)
    return offline
def model_train(checkpoint_handler: CheckpointHandler,
                container: LightningContainer,
                num_nodes: int = 1) -> Tuple[Trainer, Optional[StoringLogger]]:
    """
    Runs the full training loop for the model held by the container: creates a Pytorch Lightning trainer,
    fits the model (resuming from a recovery checkpoint if the checkpoint handler provides one), and then
    performs the post-training bookkeeping (best checkpoint selection, aggregation of per-rank outputs,
    uploads to the AzureML run, shutdown of the resource monitor).

    Processes with a non-zero global rank terminate via sys.exit() once fitting has finished.

    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :param container: A container object that holds the training data in PyTorch Lightning format
        and the model to train.
    :param num_nodes: The number of nodes to use in distributed training.
    :return: A tuple of [Trainer, StoringLogger]. Trainer is the Lightning Trainer object that was used for
        fitting the model. The StoringLogger object is returned when training an InnerEye built-in model,
        this is None when fitting other models.
    :raises ValueError: If the model is an InnerEyeLightning model but no storing logger was created.
    """
    # Path of the checkpoint to recover from, if any.
    checkpoint_path = checkpoint_handler.get_recovery_path_train()
    lightning_model = container.model

    resource_monitor: Optional[ResourceMonitor] = None
    # Bookkeeping that must happen only once in a distributed run.
    if is_global_rank_zero():
        logging.info(f"Model checkpoints are saved at {container.checkpoint_folder}")
        if isinstance(container, InnerEyeContainer):
            args_to_write = container.config
        else:
            args_to_write = container
        write_args_file(args_to_write, outputs_folder=container.outputs_folder)
        if container.monitoring_interval_seconds > 0:
            resource_monitor = start_resource_monitor(container)

    # All container-related operations run with the outputs folder as the working directory, even those that
    # should not rely on the current working directory, like get_data_module.
    with change_working_directory(container.outputs_folder):
        data_module = container.get_data_module()
        if is_global_rank_zero():
            container.before_training_on_global_rank_zero()
        if is_local_rank_zero():
            container.before_training_on_local_rank_zero()
        container.before_training_on_all_ranks()

    # Take a snapshot of the environment variables before the trainer is created, in case a second training
    # needs to run in the unit tests.
    old_environ = dict(os.environ)
    # Random seeds are set just before training, because for segmentation models the
    # before_training_on_rank_zero hook changes the random seed.
    seed_everything(container.get_effective_random_seed())
    trainer, storing_logger = create_lightning_trainer(container,
                                                       checkpoint_path,
                                                       num_nodes=num_nodes,
                                                       **container.get_trainer_arguments())
    rank_variables = [ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK]
    rank_info = ", ".join([f"{env}: {os.getenv(env)}" for env in rank_variables])
    logging.info(f"Environment variables: {rank_info}. trainer.global_rank: {trainer.global_rank}")

    # InnerEye models use the storing logger for diagnostics.
    if isinstance(lightning_model, InnerEyeLightning):
        if storing_logger is None:
            raise ValueError("InnerEye models require the storing_logger for diagnostics")
        lightning_model.storing_logger = storing_logger

    logging.info("Starting training")
    # For models that are not built-in InnerEye models, there is no guarantee that they write files to the
    # right folder. Best guess is to make the outputs folder the current working directory.
    with change_working_directory(container.outputs_folder):
        trainer.fit(lightning_model, datamodule=data_module)
        trainer.logger.close()  # type: ignore

    world_size = getattr(trainer, "world_size", 0)
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    is_scalar_model = isinstance(lightning_model, ScalarLightning)
    is_multi_rank_azureml_run = is_azureml_run and world_size > 1
    # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
    # Each rank comes through here and uploads its files to the run outputs. Rank 0 will later download them.
    if is_multi_rank_azureml_run and is_scalar_model:
        for subject_logger in (lightning_model.train_subject_outputs_logger,
                               lightning_model.val_subject_outputs_logger):
            upload_output_file_as_temp(subject_logger.csv_path, container.outputs_folder)

    # DDP starts multiple instances of the runner, one for each GPU. Those should terminate here after training.
    # The global_rank of the Lightning model can be used now, rather than environment variables, because DDP
    # has set all necessary properties.
    if lightning_model.global_rank != 0:
        logging.info(f"Terminating training thread with rank {lightning_model.global_rank}.")
        sys.exit()

    logging.info("Choosing the best checkpoint and removing redundant files.")
    create_best_checkpoint(container.checkpoint_folder)
    # Lightning modifies a ton of environment variables. If we first run training and then the test suite,
    # those environment variables will mislead the training runs in the test suite, and make them crash.
    # Hence, restore the original environment after training.
    os.environ.clear()
    os.environ.update(old_environ)

    if world_size and is_scalar_model:
        if is_multi_rank_azureml_run:
            # In a DDP run on the local box, all ranks will write to local disk, hence no download needed.
            # In a multi-node DDP, each rank would upload to AzureML, and rank 0 will now download all results
            # and concatenate.
            for rank in range(world_size):
                for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
                    relative_path = "/".join([mode.value, get_subject_output_file_per_rank(rank)])
                    RUN_CONTEXT.download_file(name=TEMP_PREFIX + relative_path,
                                              output_file_path=container.outputs_folder / relative_path)
        # Concatenate all temporary files per execution mode.
        aggregate_and_create_subject_metrics_file(container.outputs_folder)

    logging.info("Finished training")

    # The model has been trained further: tell the checkpoint_handler so it can handle checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it in the Azure UI.
    if isinstance(container, InnerEyeContainer) \
            and container.config.max_batch_grad_cam > 0 \
            and container.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(container.visualization_folder))

    if resource_monitor:
        logging.info("Shutting down the resource monitor process.")
        if is_azureml_run:
            aggregate_metrics = resource_monitor.read_aggregate_metrics()
            for gpu_name, metrics_per_gpu in aggregate_metrics.items():
                # Log as a table, with GPU being the first column
                RUN_CONTEXT.log_row("GPU utilization", GPU=gpu_name, **metrics_per_gpu)
        resource_monitor.kill()

    return trainer, storing_logger