def test_resource_monitor_store_to_file(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if storing metrics to a file works correctly.
    """
    tensorboard_folder = test_output_dirs.root_dir
    r = ResourceMonitor(interval_seconds=5,
                        tensorboard_folder=tensorboard_folder,
                        csv_results_folder=tensorboard_folder)
    r.gpu_aggregates = {
        1: GpuUtilization(id=1, mem_util=1, load=2, mem_reserved_gb=30.0, mem_allocated_gb=40.0, count=10),
    }
    r.gpu_max = {
        1: GpuUtilization(id=1, mem_util=0.4, load=0.5, mem_reserved_gb=6.0, mem_allocated_gb=7.0, count=10),
    }
    r.store_to_file()
    # Write a second time - we expect that to overwrite and only produce one set of metrics
    r.store_to_file()
    parsed_metrics = r.read_aggregate_metrics()
    assert parsed_metrics == {
        "GPU1": {
            "MemUtil_Percent": 10.0,
            "Load_Percent": 20.0,
            "MemReserved_GB": 3.0,
            "MemAllocated_GB": 4.0,
            "MaxMemUtil_Percent": 40.0,
            "MaxLoad_Percent": 50.0,
            "MaxMemReserved_GB": 6.0,
            "MaxMemAllocated_GB": 7.0,
        }}
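

# The expected values above follow from dividing each aggregate by `count` and converting utilization
# fractions to percentages (the Max* entries are only converted, not averaged). A minimal sketch of that
# arithmetic, assuming this is what ResourceMonitor.store_to_file computes internally (illustrative only):
def _expected_gpu_averages_sketch(mem_util: float, load: float, mem_reserved_gb: float,
                                  mem_allocated_gb: float, count: int) -> dict:
    return {
        "MemUtil_Percent": 100.0 * mem_util / count,  # 1 / 10 * 100 = 10.0
        "Load_Percent": 100.0 * load / count,         # 2 / 10 * 100 = 20.0
        "MemReserved_GB": mem_reserved_gb / count,    # 30.0 / 10 = 3.0
        "MemAllocated_GB": mem_allocated_gb / count,  # 40.0 / 10 = 4.0
    }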


def test_resource_monitor(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if metrics are correctly updated in the ResourceMonitor class.
    """
    results_folder = test_output_dirs.root_dir
    r = ResourceMonitor(interval_seconds=5,
                        tensorboard_folder=results_folder,
                        csv_results_folder=results_folder)

    def create_gpu(id: int, load: float, mem_total: float, mem_used: float) -> GPU:
        return GPU(ID=id, uuid=None, load=load, memoryTotal=mem_total, memoryUsed=mem_used, memoryFree=None,
                   driver=None, gpu_name=None, serial=None, display_mode=None, display_active=None,
                   temp_gpu=None)

    # Fake objects coming from GPUtil: two entries for GPU1, only one entry for GPU2
    gpu1 = create_gpu(1, 0.1, 10, 2)  # mem_util = 2 / 10 = 0.2
    gpu2 = create_gpu(2, 0.2, 10, 3)  # mem_util = 3 / 10 = 0.3
    gpu3 = create_gpu(1, 0.3, 10, 5)  # mem_util = 5 / 10 = 0.5
    # Mock torch calls so that we can run on CPUs. Memory allocated: 2GB, reserved: 1GB
    with mock.patch("torch.cuda.memory_allocated", return_value=2 ** 31):
        with mock.patch("torch.cuda.memory_reserved", return_value=2 ** 30):
            # Update with results for both GPUs
            r.update_metrics([gpu1, gpu2])
            # Next update with data for GPU2 missing
            r.update_metrics([gpu3])
    # Element-wise maximum of metrics
    assert r.gpu_max == {
        1: GpuUtilization(id=1, load=0.3, mem_util=0.5, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=2),
        2: GpuUtilization(id=2, load=0.2, mem_util=0.3, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=1),
    }
    # Aggregates should contain the sum of metrics that were observed.
    assert r.gpu_aggregates == {
        1: GpuUtilization(id=1, load=0.4, mem_util=0.7, mem_allocated_gb=4.0, mem_reserved_gb=2.0, count=2),
        2: GpuUtilization(id=2, load=0.2, mem_util=0.3, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=1),
    }
    r.writer.flush()
    r.store_to_file()
    tb_file = list(results_folder.rglob("*tfevents*"))[0]
    assert os.path.getsize(str(tb_file)) > 100
    assert r.aggregate_metrics_file.is_file()
    assert len(r.aggregate_metrics_file.read_text().splitlines()) == 17
    parsed_metrics = r.read_aggregate_metrics()
    # There should be one entry per GPU
    assert len(parsed_metrics) == 2
    # Each GPU has 4 averages and 4 maximum values.
    assert len(parsed_metrics["GPU1"]) == 8
    assert len(parsed_metrics["GPU2"]) == 8
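

# The values asserted above follow directly from the fake inputs. A small self-contained sketch of the
# arithmetic, assuming the monitor converts the mocked byte counts to GB and folds repeated observations per
# GPU via element-wise sum (gpu_aggregates) and element-wise max (gpu_max); illustrative only:
def _observation_folding_sketch() -> None:
    assert 2 ** 31 / 2 ** 30 == 2.0  # mocked torch.cuda.memory_allocated -> mem_allocated_gb
    assert 2 ** 30 / 2 ** 30 == 1.0  # mocked torch.cuda.memory_reserved -> mem_reserved_gb
    gpu1_loads = [0.1, 0.3]           # GPU1 is observed twice
    gpu1_mem_utils = [2 / 10, 5 / 10]
    assert sum(gpu1_loads) == 0.4 and sum(gpu1_mem_utils) == 0.7  # aggregate = sum of observations
    assert max(gpu1_loads) == 0.3 and max(gpu1_mem_utils) == 0.5  # maximum = element-wise max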


def test_resource_monitor_store_to_file(test_output_dirs: TestOutputDirectories) -> None:
    """
    Test if storing metrics to a file works correctly.
    """
    tensorboard_folder = Path(test_output_dirs.root_dir)
    r = ResourceMonitor(interval_seconds=5, tensorboard_folder=tensorboard_folder)
    r.gpu_aggregates = {
        1: GpuUtilization(id=1, mem_util=1, load=2, mem_reserved_gb=30.0, mem_allocated_gb=40.0, count=10),
    }
    r.gpu_max = {
        1: GpuUtilization(id=1, mem_util=0.4, load=0.5, mem_reserved_gb=6.0, mem_allocated_gb=7.0, count=10),
    }
    r.store_to_file()
    # Write a second time - we expect that to overwrite and only produce one set of metrics
    r.store_to_file()
    parsed_metrics = r.read_aggregate_metrics()
    assert parsed_metrics == [
        ("GPU1/MemUtil_Percent", 10.0),
        ("GPU1/Load_Percent", 20.0),
        ("GPU1/MemReserved_GB", 3.0),
        ("GPU1/MemAllocated_GB", 4.0),
        ("GPU1/MaxMemUtil_Percent", 40.0),
        ("GPU1/MaxLoad_Percent", 50.0),
        ("GPU1/MaxMemReserved_GB", 6.0),
        ("GPU1/MaxMemAllocated_GB", 7.0),
    ]


def model_train(config: ModelConfigBase, checkpoint_handler: CheckpointHandler) -> ModelTrainingResults:
    """
    The main training loop. It creates the model, dataset, optimizer_type, and criterion, then proceeds
    to train the model. If a checkpoint was specified, then it loads the checkpoint before resuming training.
    :param config: The arguments which specify all required information.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :raises TypeError: If the arguments are of the wrong type.
    :raises ValueError: When there are issues loading a previous checkpoint.
    """
    # Save the dataset files for later use in cross validation analysis
    config.write_dataset_files()

    # set the random seed for all libraries
    ml_util.set_random_seed(config.get_effective_random_seed(), "Patch visualization")
    # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
    # want training to depend on how many patients we visualized, and hence set the random seed again right after.
    with logging_section("Visualizing the effect of sampling random crops for training"):
        visualize_random_crops_for_dataset(config)
    ml_util.set_random_seed(config.get_effective_random_seed(), "Model training")

    logging.debug("Creating the PyTorch model.")

    # Create the train loader and validation loader to load images from the dataset
    data_loaders = config.create_data_loaders()

    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()
    models_and_optimizer = ModelAndInfo(config=config,
                                        model_execution_mode=ModelExecutionMode.TRAIN,
                                        checkpoint_path=checkpoint_path)

    # Create the main model
    # If continuing from a previous run at a specific epoch, then load the previous model.
    model_loaded = models_and_optimizer.try_create_model_and_load_from_checkpoint()
    if not model_loaded:
        raise ValueError("There was no checkpoint file available for the model for given start_epoch {}"
                         .format(config.start_epoch))

    # Print out a detailed breakdown of layers, memory consumption and time.
    generate_and_print_model_summary(config, models_and_optimizer.model)

    # Move model to GPU and adjust for multiple GPUs
    models_and_optimizer.adjust_model_for_gpus()

    # Create the mean teacher model and move to GPU
    if config.compute_mean_teacher_model:
        mean_teacher_model_loaded = models_and_optimizer.try_create_mean_teacher_model_load_from_checkpoint_and_adjust()
        if not mean_teacher_model_loaded:
            raise ValueError("There was no checkpoint file available for the mean teacher model "
                             f"for given start_epoch {config.start_epoch}")

    # Create optimizer
    models_and_optimizer.create_optimizer()
    if checkpoint_handler.should_load_optimizer_checkpoint():
        optimizer_loaded = models_and_optimizer.try_load_checkpoint_for_optimizer()
        if not optimizer_loaded:
            raise ValueError(f"There was no checkpoint file available for the optimizer for given start_epoch "
                             f"{config.start_epoch}")

    # Create checkpoint directory for this run if it doesn't already exist
    logging.info(f"Models are saved at {config.checkpoint_folder}")
    if not config.checkpoint_folder.is_dir():
        config.checkpoint_folder.mkdir()

    # Create the SummaryWriters for Tensorboard
    writers = create_summary_writers(config)
    config.create_dataframe_loggers()

    # Create LR scheduler
    l_rate_scheduler = SchedulerWithWarmUp(config, models_and_optimizer.optimizer)

    # Training loop
    logging.info("Starting training")
    train_results_per_epoch, val_results_per_epoch, learning_rates_per_epoch = [], [], []

    resource_monitor = None
    if config.monitoring_interval_seconds > 0:
        # initialize and start GPU monitoring
        diagnostics_events = config.logs_folder / "diagnostics"
        logging.info(f"Starting resource monitor, outputting to {diagnostics_events}")
        resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
                                           tensorboard_folder=diagnostics_events)
        resource_monitor.start()

    gradient_scaler = GradScaler() if config.use_gpu and config.use_mixed_precision else None
    optimal_temperature_scale_values = []
    for epoch in config.get_train_epochs():
        logging.info("Starting epoch {}".format(epoch))
        save_epoch = config.should_save_epoch(epoch) and models_and_optimizer.optimizer is not None

        # store the learning rates used for each epoch
        epoch_lrs = l_rate_scheduler.get_last_lr()
        learning_rates_per_epoch.append(epoch_lrs)

        train_val_params: TrainValidateParameters = \
            TrainValidateParameters(data_loader=data_loaders[ModelExecutionMode.TRAIN],
                                    model=models_and_optimizer.model,
                                    mean_teacher_model=models_and_optimizer.mean_teacher_model,
                                    epoch=epoch,
                                    optimizer=models_and_optimizer.optimizer,
                                    gradient_scaler=gradient_scaler,
                                    epoch_learning_rate=epoch_lrs,
                                    summary_writers=writers,
                                    dataframe_loggers=config.metrics_data_frame_loggers,
                                    in_training_mode=True)
        training_steps = create_model_training_steps(config, train_val_params)
        train_epoch_results = train_or_validate_epoch(training_steps)
        train_results_per_epoch.append(train_epoch_results.metrics)

        metrics.validate_and_store_model_parameters(writers.train, epoch, models_and_optimizer.model)

        # Run without adjusting weights on the validation set
        train_val_params.in_training_mode = False
        train_val_params.data_loader = data_loaders[ModelExecutionMode.VAL]
        # if temperature scaling is enabled then do not save validation metrics for the checkpoint epochs
        # as these will be re-computed after performing temperature scaling on the validation set.
        if isinstance(config, SequenceModelBase):
            train_val_params.save_metrics = not (save_epoch and config.temperature_scaling_config)

        training_steps = create_model_training_steps(config, train_val_params)
        val_epoch_results = train_or_validate_epoch(training_steps)
        val_results_per_epoch.append(val_epoch_results.metrics)

        if config.is_segmentation_model:
            metrics.store_epoch_stats_for_segmentation(config.outputs_folder,
                                                       epoch,
                                                       epoch_lrs,
                                                       train_epoch_results.metrics,
                                                       val_epoch_results.metrics)

        if save_epoch:
            # perform temperature scaling if required
            if isinstance(config, SequenceModelBase) and config.temperature_scaling_config:
                optimal_temperature, scaled_val_results = \
                    temperature_scaling_steps(config, train_val_params, val_epoch_results)
                optimal_temperature_scale_values.append(optimal_temperature)
                # overwrite the metrics for the epoch with the metrics from the temperature scaled model
                val_results_per_epoch[-1] = scaled_val_results.metrics

            models_and_optimizer.save_checkpoint(epoch)

        # Updating the learning rate should happen at the end of the training loop, so that the
        # initial learning rate will be used for the very first epoch.
        l_rate_scheduler.step()

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=train_results_per_epoch,
        val_results_per_epoch=val_results_per_epoch,
        learning_rates_per_epoch=learning_rates_per_epoch,
        optimal_temperature_scale_values_per_checkpoint_epoch=optimal_temperature_scale_values
    )

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(config.visualization_folder))

    writers.close_all()
    config.metrics_data_frame_loggers.close_all()
    if resource_monitor:
        # stop the resource monitoring process
        logging.info("Shutting down the resource monitor process. Aggregate resource utilization:")
        for name, value in resource_monitor.read_aggregate_metrics():
            logging.info(f"{name}: {value}")
            if not is_offline_run_context(RUN_CONTEXT):
                RUN_CONTEXT.log(name, value)
        resource_monitor.kill()

    return model_training_results
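

# The placement of l_rate_scheduler.step() at the end of the epoch loop above matters: the very first epoch
# must train with the configured initial learning rate. A minimal, self-contained sketch of that ordering
# with a plain PyTorch scheduler (illustrative only; this is not the InnerEye SchedulerWithWarmUp):
def _lr_step_ordering_sketch() -> None:
    import torch
    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = torch.optim.SGD(params, lr=0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
    lrs = []
    for _ in range(3):
        lrs.append(scheduler.get_last_lr()[0])  # learning rate used for this epoch
        optimizer.step()                        # stand-in for one epoch of training
        scheduler.step()                        # advance only after the epoch has finished
    assert lrs == [0.1, 0.05, 0.025]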


def model_train(config: ModelConfigBase,
                checkpoint_handler: CheckpointHandler,
                num_nodes: int = 1) -> ModelTrainingResults:
    """
    The main training loop. It creates the PyTorch model based on the configuration options passed in,
    creates a PyTorch Lightning trainer, and trains the model.
    If a checkpoint was specified, then it loads the checkpoint before resuming training.
    :param config: The arguments which specify all required information.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :param num_nodes: The number of nodes to use in distributed training.
    """
    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()
    # This reads the dataset file, and possibly sets required pre-processing objects, like one-hot encoder
    # for categorical features, that need to be available before creating the model.
    config.read_dataset_if_needed()

    # Create the trainer object. Back up the environment variables before doing that, in case we need to run a
    # second training in the unit tests.
    old_environ = dict(os.environ)
    seed_everything(config.get_effective_random_seed())
    trainer, storing_logger = create_lightning_trainer(config, checkpoint_path, num_nodes=num_nodes)

    logging.info(f"GLOBAL_RANK: {os.getenv('GLOBAL_RANK')}, LOCAL_RANK {os.getenv('LOCAL_RANK')}. "
                 f"trainer.global_rank: {trainer.global_rank}")
    logging.debug("Creating the PyTorch model.")
    lightning_model = create_lightning_model(config)
    lightning_model.storing_logger = storing_logger

    resource_monitor = None
    # Execute some bookkeeping tasks only once if running distributed:
    if is_rank_zero():
        config.write_args_file()
        logging.info(str(config))
        # Save the dataset files for later use in cross validation analysis
        config.write_dataset_files()
        logging.info(f"Model checkpoints are saved at {config.checkpoint_folder}")

        # set the random seed for all libraries
        ml_util.set_random_seed(config.get_effective_random_seed(), "Patch visualization")
        # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
        # want training to depend on how many patients we visualized, and hence set the random seed again right after.
        with logging_section("Visualizing the effect of sampling random crops for training"):
            visualize_random_crops_for_dataset(config)

        # Print out a detailed breakdown of layers, memory consumption and time.
        generate_and_print_model_summary(config, lightning_model.model)

        if config.monitoring_interval_seconds > 0:
            # initialize and start GPU monitoring
            diagnostics_events = config.logs_folder / "diagnostics"
            logging.info(f"Starting resource monitor, outputting to {diagnostics_events}")
            resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
                                               tensorboard_folder=diagnostics_events)
            resource_monitor.start()

    # Training loop
    logging.info("Starting training")

    lightning_data = TrainingAndValidationDataLightning(config)  # type: ignore
    # When trying to store the config object in the constructor, it does not appear to get stored at all, and
    # later references to the object simply fail. Hence, it has to be set explicitly here.
    lightning_data.config = config
    trainer.fit(lightning_model, datamodule=lightning_data)
    trainer.logger.close()  # type: ignore
    lightning_model.close_all_loggers()
    world_size = getattr(trainer, "world_size", 0)
    is_azureml_run = not config.is_offline_run
    # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
    # Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
    if is_azureml_run and world_size > 1 and isinstance(lightning_model, ScalarLightning):
        upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, config.outputs_folder)
        upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, config.outputs_folder)
    # DDP will start multiple instances of the runner, one for each GPU. Those should terminate here after training.
    # We can now use the global_rank of the Lightning model, rather than environment variables, because DDP has set
    # all necessary properties.
    if lightning_model.global_rank != 0:
        logging.info(f"Terminating training thread with rank {lightning_model.global_rank}.")
        sys.exit()

    logging.info("Choosing the best checkpoint and removing redundant files.")
    cleanup_checkpoint_folder(config.checkpoint_folder)
    # Lightning modifies a ton of environment variables. If we first run training and then the test suite,
    # those environment variables will mislead the training runs in the test suite, and make them crash.
    # Hence, restore the original environment after training.
    os.environ.clear()
    os.environ.update(old_environ)

    if world_size and isinstance(lightning_model, ScalarLightning):
        if is_azureml_run and world_size > 1:
            # In a DDP run on the local box, all ranks will write to local disk, hence no download needed.
            # In a multi-node DDP, each rank would upload to AzureML, and rank 0 will now download all results
            # and concatenate them.
            for rank in range(world_size):
                for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
                    file = mode.value + "/" + get_subject_output_file_per_rank(rank)
                    RUN_CONTEXT.download_file(name=TEMP_PREFIX + file, output_file_path=config.outputs_folder / file)
        # Concatenate all temporary files per execution mode
        for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
            temp_files = (config.outputs_folder / mode.value).rglob(SUBJECT_OUTPUT_PER_RANK_PREFIX + "*")
            result_file = config.outputs_folder / mode.value / SUBJECT_METRICS_FILE_NAME
            # Accumulate all lines and write the result file once, so that later files do not overwrite the
            # contents already collected from earlier files.
            result_lines = []
            for i, file in enumerate(temp_files):
                temp_file_lines = file.read_text().splitlines()
                if i == 0:
                    # Copy the first file as-is, including the first line with the column headers
                    result_lines.extend(temp_file_lines)
                else:
                    # For all files but the first one, cut off the header line.
                    result_lines.extend(temp_file_lines[1:])
            result_file.write_text(os.linesep.join(result_lines))

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=TRAIN_PREFIX).values()),
        val_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=VALIDATION_PREFIX).values()),
        train_diagnostics=lightning_model.train_diagnostics,
        val_diagnostics=lightning_model.val_diagnostics,
        optimal_temperature_scale_values_per_checkpoint_epoch=[]
    )

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(config.visualization_folder))

    if resource_monitor:
        # stop the resource monitoring process
        logging.info("Shutting down the resource monitor process. Aggregate resource utilization:")
        for name, value in resource_monitor.read_aggregate_metrics():
            logging.info(f"{name}: {value}")
            if not config.is_offline_run:
                RUN_CONTEXT.log(name, value)
        resource_monitor.kill()

    return model_training_results
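

# A tiny self-contained illustration of the header-dropping concatenation performed above, using hypothetical
# file contents (the real per-rank files are the CSVs written by the subject output loggers):
def _concatenation_example() -> None:
    rank0 = "subject,prediction\n1,0.2\n2,0.7"
    rank1 = "subject,prediction\n3,0.1"
    merged = rank0.splitlines() + rank1.splitlines()[1:]  # keep only the first file's header line
    assert merged == ["subject,prediction", "1,0.2", "2,0.7", "3,0.1"]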