コード例 #1
0
def test_resource_monitor_store_to_file(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that GPU metrics held by a ResourceMonitor survive a round trip through
    store_to_file / read_aggregate_metrics, and that storing twice yields a single set of metrics.
    """
    output_folder = test_output_dirs.root_dir
    monitor = ResourceMonitor(interval_seconds=5,
                              tensorboard_folder=output_folder,
                              csv_results_folder=output_folder)
    # Hand-crafted state for a single GPU with ID 1: summed metrics over 10 observations, and maxima.
    summed_metrics = GpuUtilization(id=1, mem_util=1, load=2, mem_reserved_gb=30.0, mem_allocated_gb=40.0,
                                    count=10)
    max_metrics = GpuUtilization(id=1, mem_util=0.4, load=0.5, mem_reserved_gb=6.0, mem_allocated_gb=7.0,
                                 count=10)
    monitor.gpu_aggregates = {1: summed_metrics}
    monitor.gpu_max = {1: max_metrics}
    # Store twice: the second write is expected to overwrite the first, not to add a second set of metrics.
    for _ in range(2):
        monitor.store_to_file()
    # The expected values are consistent with aggregates being divided by the observation count,
    # and with utilization and load being reported as percentages.
    expected_for_gpu1 = {
        "MemUtil_Percent": 10.0,
        "Load_Percent": 20.0,
        "MemReserved_GB": 3.0,
        "MemAllocated_GB": 4.0,
        "MaxMemUtil_Percent": 40.0,
        "MaxLoad_Percent": 50.0,
        "MaxMemReserved_GB": 6.0,
        "MaxMemAllocated_GB": 7.0,
    }
    metrics_from_file = monitor.read_aggregate_metrics()
    assert metrics_from_file == {"GPU1": expected_for_gpu1}
コード例 #2
0
def test_resource_monitor(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if metrics are correctly updated in the ResourceMonitor class: per-GPU maxima and sums must be
    tracked across calls to update_metrics, and both the Tensorboard event file and the aggregate metrics
    file must be written to the results folder.
    """
    results_folder = test_output_dirs.root_dir
    r = ResourceMonitor(interval_seconds=5, tensorboard_folder=results_folder, csv_results_folder=results_folder)

    def create_gpu(gpu_id: int, load: float, mem_total: float, mem_used: float) -> GPU:
        # Build a fake GPUtil object. Only ID, load and memory fields are populated, all others are None.
        return GPU(ID=gpu_id, uuid=None, load=load, memoryTotal=mem_total, memoryUsed=mem_used,
                   memoryFree=None, driver=None, gpu_name=None,
                   serial=None, display_mode=None, display_active=None, temp_gpu=None)

    # Fake objects coming from GPUtil: Two entries for GPU1, 1 entry only for GPU2
    gpu1 = create_gpu(1, 0.1, 10, 2)  # mem_util=0.2
    gpu2 = create_gpu(2, 0.2, 10, 3)  # mem_util=0.3
    gpu3 = create_gpu(1, 0.3, 10, 5)  # mem_util=0.5
    # Mock torch calls so that we can run on CPUs. memory allocated: 2GB, reserved: 1GB
    with mock.patch("torch.cuda.memory_allocated", return_value=2 ** 31):
        with mock.patch("torch.cuda.memory_reserved", return_value=2 ** 30):
            # Update with results for both GPUs
            r.update_metrics([gpu1, gpu2])
            # Next update with data for GPU2 missing
            r.update_metrics([gpu3])
    # Element-wise maximum of metrics
    assert r.gpu_max == {
        1: GpuUtilization(id=1, load=0.3, mem_util=0.5, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=2),
        2: GpuUtilization(id=2, load=0.2, mem_util=0.3, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=1),
    }
    # Aggregates should contain the sum of metrics that were observed.
    assert r.gpu_aggregates == {
        1: GpuUtilization(id=1, load=0.4, mem_util=0.7, mem_allocated_gb=4.0, mem_reserved_gb=2.0, count=2),
        2: GpuUtilization(id=2, load=0.2, mem_util=0.3, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=1),
    }
    r.writer.flush()
    r.store_to_file()
    tb_file = list(results_folder.rglob("*tfevents*"))[0]
    assert os.path.getsize(str(tb_file)) > 100
    # is_file must be called: the bare bound method is always truthy and would make this check vacuous.
    assert r.aggregate_metrics_file.is_file()
    assert len(r.aggregate_metrics_file.read_text().splitlines()) == 17
    parsed_metrics = r.read_aggregate_metrics()
    # There should be one entry per GPU
    assert len(parsed_metrics) == 2
    # Each GPU has 4 averages, 4 max.
    assert len(parsed_metrics["GPU1"]) == 8
    assert len(parsed_metrics["GPU2"]) == 8
コード例 #3
0
def test_resource_monitor_store_to_file(
        test_output_dirs: TestOutputDirectories) -> None:
    """
    Check that GPU metrics held by a ResourceMonitor survive a round trip through
    store_to_file / read_aggregate_metrics, and that storing twice yields a single set of metrics.
    """
    monitor = ResourceMonitor(interval_seconds=5,
                              tensorboard_folder=Path(test_output_dirs.root_dir))
    # Hand-crafted state for a single GPU with ID 1: summed metrics over 10 observations, and maxima.
    summed_metrics = GpuUtilization(id=1, mem_util=1, load=2,
                                    mem_reserved_gb=30.0, mem_allocated_gb=40.0,
                                    count=10)
    max_metrics = GpuUtilization(id=1, mem_util=0.4, load=0.5,
                                 mem_reserved_gb=6.0, mem_allocated_gb=7.0,
                                 count=10)
    monitor.gpu_aggregates = {1: summed_metrics}
    monitor.gpu_max = {1: max_metrics}
    # Store twice: the second write is expected to overwrite the first, not to add a second set of metrics.
    for _ in range(2):
        monitor.store_to_file()
    # The expected values are consistent with aggregates being divided by the observation count,
    # and with utilization and load being reported as percentages.
    expected_metrics = [
        ("GPU1/MemUtil_Percent", 10.0),
        ("GPU1/Load_Percent", 20.0),
        ("GPU1/MemReserved_GB", 3.0),
        ("GPU1/MemAllocated_GB", 4.0),
        ("GPU1/MaxMemUtil_Percent", 40.0),
        ("GPU1/MaxLoad_Percent", 50.0),
        ("GPU1/MaxMemReserved_GB", 6.0),
        ("GPU1/MaxMemAllocated_GB", 7.0),
    ]
    metrics_from_file = monitor.read_aggregate_metrics()
    assert metrics_from_file == expected_metrics
コード例 #4
0
def model_train(config: ModelConfigBase,
                checkpoint_handler: CheckpointHandler) -> ModelTrainingResults:
    """
    The main training loop. It creates the model, dataset, optimizer_type, and criterion, then proceeds
    to train the model. If a checkpoint was specified, then it loads the checkpoint before resuming training.

    For every epoch returned by config.get_train_epochs(), train_or_validate_epoch is run first on the
    training data loader and then on the validation data loader. A checkpoint is saved for those epochs
    where config.should_save_epoch(epoch) holds and an optimizer is available.
    If config.monitoring_interval_seconds is positive, a ResourceMonitor is started before the epoch loop.
    After training, its aggregate metrics are logged and the monitor is killed.

    :param config: The arguments which specify all required information.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :return: A ModelTrainingResults object that holds the per-epoch training metrics, per-epoch validation
        metrics, per-epoch learning rates, and the temperature scaling values collected at checkpoint epochs.
    :raises TypeError: If the arguments are of the wrong type.
        NOTE(review): no TypeError is raised directly in this function, presumably it comes from callees -
        confirm.
    :raises ValueError: When there are issues loading a previous checkpoint, that is, when creating/loading
        the model, the mean teacher model or the optimizer reports failure.
    """
    # Save the dataset files for later use in cross validation analysis
    config.write_dataset_files()

    # set the random seed for all libraries
    ml_util.set_random_seed(config.get_effective_random_seed(),
                            "Patch visualization")
    # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
    # want training to depend on how many patients we visualized, and hence set the random seed again right after.
    with logging_section(
            "Visualizing the effect of sampling random crops for training"):
        visualize_random_crops_for_dataset(config)
    ml_util.set_random_seed(config.get_effective_random_seed(),
                            "Model training")

    logging.debug("Creating the PyTorch model.")

    # Create the train loader and validation loader to load images from the dataset
    data_loaders = config.create_data_loaders()

    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()

    models_and_optimizer = ModelAndInfo(
        config=config,
        model_execution_mode=ModelExecutionMode.TRAIN,
        checkpoint_path=checkpoint_path)

    # Create the main model
    # If continuing from a previous run at a specific epoch, then load the previous model.
    model_loaded = models_and_optimizer.try_create_model_and_load_from_checkpoint(
    )
    if not model_loaded:
        raise ValueError(
            "There was no checkpoint file available for the model for given start_epoch {}"
            .format(config.start_epoch))

    # Print out a detailed breakdown of layers, memory consumption and time.
    generate_and_print_model_summary(config, models_and_optimizer.model)

    # Move model to GPU and adjust for multiple GPUs
    models_and_optimizer.adjust_model_for_gpus()

    # Create the mean teacher model and move to GPU
    if config.compute_mean_teacher_model:
        mean_teacher_model_loaded = models_and_optimizer.try_create_mean_teacher_model_load_from_checkpoint_and_adjust(
        )
        if not mean_teacher_model_loaded:
            raise ValueError(
                "There was no checkpoint file available for the mean teacher model "
                f"for given start_epoch {config.start_epoch}")

    # Create optimizer
    models_and_optimizer.create_optimizer()
    # The optimizer state is only restored when the checkpoint handler asks for it.
    if checkpoint_handler.should_load_optimizer_checkpoint():
        optimizer_loaded = models_and_optimizer.try_load_checkpoint_for_optimizer(
        )
        if not optimizer_loaded:
            raise ValueError(
                f"There was no checkpoint file available for the optimizer for given start_epoch "
                f"{config.start_epoch}")

    # Create checkpoint directory for this run if it doesn't already exist
    logging.info(f"Models are saved at {config.checkpoint_folder}")
    if not config.checkpoint_folder.is_dir():
        config.checkpoint_folder.mkdir()

    # Create the SummaryWriters for Tensorboard
    writers = create_summary_writers(config)
    config.create_dataframe_loggers()

    # Create LR scheduler
    l_rate_scheduler = SchedulerWithWarmUp(config,
                                           models_and_optimizer.optimizer)

    # Training loop
    logging.info("Starting training")
    # Per-epoch results, each list gets one entry appended per epoch.
    train_results_per_epoch, val_results_per_epoch, learning_rates_per_epoch = [], [], []

    # The resource monitor stays None when monitoring is disabled (interval of 0 or less).
    resource_monitor = None
    if config.monitoring_interval_seconds > 0:
        # initialize and start GPU monitoring
        diagnostics_events = config.logs_folder / "diagnostics"
        logging.info(
            f"Starting resource monitor, outputting to {diagnostics_events}")
        resource_monitor = ResourceMonitor(
            interval_seconds=config.monitoring_interval_seconds,
            tensorboard_folder=diagnostics_events)
        resource_monitor.start()

    # A gradient scaler is only created when running on the GPU with mixed precision enabled.
    gradient_scaler = GradScaler(
    ) if config.use_gpu and config.use_mixed_precision else None
    # One value is appended for each epoch at which temperature scaling was carried out.
    optimal_temperature_scale_values = []
    for epoch in config.get_train_epochs():
        logging.info("Starting epoch {}".format(epoch))
        # A checkpoint is only written if the config requests it for this epoch and an optimizer exists.
        save_epoch = config.should_save_epoch(
            epoch) and models_and_optimizer.optimizer is not None

        # store the learning rates used for each epoch
        epoch_lrs = l_rate_scheduler.get_last_lr()
        learning_rates_per_epoch.append(epoch_lrs)

        train_val_params: TrainValidateParameters = \
            TrainValidateParameters(data_loader=data_loaders[ModelExecutionMode.TRAIN],
                                    model=models_and_optimizer.model,
                                    mean_teacher_model=models_and_optimizer.mean_teacher_model,
                                    epoch=epoch,
                                    optimizer=models_and_optimizer.optimizer,
                                    gradient_scaler=gradient_scaler,
                                    epoch_learning_rate=epoch_lrs,
                                    summary_writers=writers,
                                    dataframe_loggers=config.metrics_data_frame_loggers,
                                    in_training_mode=True)
        training_steps = create_model_training_steps(config, train_val_params)
        train_epoch_results = train_or_validate_epoch(training_steps)
        train_results_per_epoch.append(train_epoch_results.metrics)

        metrics.validate_and_store_model_parameters(writers.train, epoch,
                                                    models_and_optimizer.model)
        # Run without adjusting weights on the validation set
        # The parameter object created for training is re-used here, only mode and data loader are switched.
        train_val_params.in_training_mode = False
        train_val_params.data_loader = data_loaders[ModelExecutionMode.VAL]
        # if temperature scaling is enabled then do not save validation metrics for the checkpoint epochs
        # as these will be re-computed after performing temperature scaling on the validation set.
        if isinstance(config, SequenceModelBase):
            train_val_params.save_metrics = not (
                save_epoch and config.temperature_scaling_config)

        training_steps = create_model_training_steps(config, train_val_params)
        val_epoch_results = train_or_validate_epoch(training_steps)
        val_results_per_epoch.append(val_epoch_results.metrics)

        if config.is_segmentation_model:
            metrics.store_epoch_stats_for_segmentation(
                config.outputs_folder, epoch, epoch_lrs,
                train_epoch_results.metrics, val_epoch_results.metrics)

        if save_epoch:
            # perform temperature scaling if required
            if isinstance(
                    config,
                    SequenceModelBase) and config.temperature_scaling_config:
                optimal_temperature, scaled_val_results = \
                    temperature_scaling_steps(config, train_val_params, val_epoch_results)
                optimal_temperature_scale_values.append(optimal_temperature)
                # overwrite the metrics for the epoch with the metrics from the temperature scaled model
                val_results_per_epoch[-1] = scaled_val_results.metrics

            models_and_optimizer.save_checkpoint(epoch)

        # Updating the learning rate should happen at the end of the training loop, so that the
        # initial learning rate will be used for the very first epoch.
        l_rate_scheduler.step()

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=train_results_per_epoch,
        val_results_per_epoch=val_results_per_epoch,
        learning_rates_per_epoch=learning_rates_per_epoch,
        optimal_temperature_scale_values_per_checkpoint_epoch=
        optimal_temperature_scale_values)

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER,
                                  path=str(config.visualization_folder))

    writers.close_all()
    config.metrics_data_frame_loggers.close_all()
    if resource_monitor:
        # stop the resource monitoring process
        logging.info(
            "Shutting down the resource monitor process. Aggregate resource utilization:"
        )
        # Each aggregate metric is written to the log, and also to the run context unless running offline.
        for name, value in resource_monitor.read_aggregate_metrics():
            logging.info(f"{name}: {value}")
            if not is_offline_run_context(RUN_CONTEXT):
                RUN_CONTEXT.log(name, value)
        resource_monitor.kill()

    return model_training_results
コード例 #5
0
def model_train(config: ModelConfigBase,
                checkpoint_handler: CheckpointHandler,
                num_nodes: int = 1) -> ModelTrainingResults:
    """
    The main training loop. It creates the Pytorch model based on the configuration options passed in,
    creates a Pytorch Lightning trainer, and trains the model.
    If a checkpoint was specified, then it loads the checkpoint before resuming training.

    Bookkeeping tasks (writing the arguments and dataset files, patch visualization, model summary, starting
    the resource monitor) are only executed if is_rank_zero() holds. After trainer.fit has returned, all
    processes where the global rank of the Lightning model is not 0 are terminated via sys.exit(). The
    environment variables are restored to the values they had before the trainer was created.

    :param config: The arguments which specify all required information.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
    :param num_nodes: The number of nodes to use in distributed training.
    :return: A ModelTrainingResults object with the per-epoch training and validation metrics that were
        collected by the storing logger, and the training and validation diagnostics of the Lightning model.
    """
    # Get the path to the checkpoint to recover from
    checkpoint_path = checkpoint_handler.get_recovery_path_train()
    # This reads the dataset file, and possibly sets required pre-processing objects, like one-hot encoder
    # for categorical features, that need to be available before creating the model.
    config.read_dataset_if_needed()

    # Create the trainer object. Backup the environment variables before doing that, in case we need to run a second
    # training in the unit tests.
    old_environ = dict(os.environ)
    seed_everything(config.get_effective_random_seed())
    trainer, storing_logger = create_lightning_trainer(config, checkpoint_path, num_nodes=num_nodes)

    logging.info(f"GLOBAL_RANK: {os.getenv('GLOBAL_RANK')}, LOCAL_RANK {os.getenv('LOCAL_RANK')}. "
                 f"trainer.global_rank: {trainer.global_rank}")
    logging.debug("Creating the PyTorch model.")
    lightning_model = create_lightning_model(config)
    lightning_model.storing_logger = storing_logger

    # The resource monitor stays None on all ranks but rank zero, and when monitoring is disabled.
    resource_monitor = None
    # Execute some bookkeeping tasks only once if running distributed:
    if is_rank_zero():
        config.write_args_file()
        logging.info(str(config))
        # Save the dataset files for later use in cross validation analysis
        config.write_dataset_files()
        logging.info(f"Model checkpoints are saved at {config.checkpoint_folder}")

        # set the random seed for all libraries
        ml_util.set_random_seed(config.get_effective_random_seed(), "Patch visualization")
        # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
        # want training to depend on how many patients we visualized, and hence set the random seed again right after.
        with logging_section("Visualizing the effect of sampling random crops for training"):
            visualize_random_crops_for_dataset(config)

        # Print out a detailed breakdown of layers, memory consumption and time.
        generate_and_print_model_summary(config, lightning_model.model)

        if config.monitoring_interval_seconds > 0:
            # initialize and start GPU monitoring
            diagnostics_events = config.logs_folder / "diagnostics"
            logging.info(f"Starting resource monitor, outputting to {diagnostics_events}")
            resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
                                               tensorboard_folder=diagnostics_events)
            resource_monitor.start()

    # Training loop
    logging.info("Starting training")

    lightning_data = TrainingAndValidationDataLightning(config)  # type: ignore
    # When trying to store the config object in the constructor, it does not appear to get stored at all, later
    # reference of the object simply fail. Hence, have to set explicitly here.
    lightning_data.config = config
    trainer.fit(lightning_model, datamodule=lightning_data)
    trainer.logger.close()  # type: ignore
    lightning_model.close_all_loggers()
    # A trainer without a world_size attribute is treated like world size 0, which skips the aggregation below.
    world_size = getattr(trainer, "world_size", 0)
    is_azureml_run = not config.is_offline_run
    # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
    # Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
    if is_azureml_run and world_size > 1 and isinstance(lightning_model, ScalarLightning):
        upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, config.outputs_folder)
        upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, config.outputs_folder)
    # DDP will start multiple instances of the runner, one for each GPU. Those should terminate here after training.
    # We can now use the global_rank of the Lightning model, rather than environment variables, because DDP has set
    # all necessary properties.
    if lightning_model.global_rank != 0:
        logging.info(f"Terminating training thread with rank {lightning_model.global_rank}.")
        sys.exit()

    logging.info("Choosing the best checkpoint and removing redundant files.")
    cleanup_checkpoint_folder(config.checkpoint_folder)
    # Lightning modifies a ton of environment variables. If we first run training and then the test suite,
    # those environment variables will mislead the training runs in the test suite, and make them crash.
    # Hence, restore the original environment after training.
    os.environ.clear()
    os.environ.update(old_environ)

    if world_size and isinstance(lightning_model, ScalarLightning):
        if is_azureml_run and world_size > 1:
            # In a DDP run on the local box, all ranks will write to local disk, hence no download needed.
            # In a multi-node DDP, each rank would upload to AzureML, and rank 0 will now download all results and
            # concatenate
            for rank in range(world_size):
                for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
                    file_name = mode.value + "/" + get_subject_output_file_per_rank(rank)
                    RUN_CONTEXT.download_file(name=TEMP_PREFIX + file_name,
                                              output_file_path=config.outputs_folder / file_name)
        # Concatenate all temporary file per execution mode
        for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
            # Sort the files to get a result that does not depend on the order of directory traversal.
            temp_files = sorted((config.outputs_folder / mode.value).rglob(SUBJECT_OUTPUT_PER_RANK_PREFIX + "*"))
            result_file = config.outputs_folder / mode.value / SUBJECT_METRICS_FILE_NAME
            # Collect the lines of all files first and write the result file only once: Calling write_text
            # once per temporary file would overwrite the result each time, keeping only the last file.
            merged_lines = []
            for i, temp_file in enumerate(temp_files):
                file_lines = temp_file.read_text().splitlines()
                # Copy the first file as-is, including the first line with the column headers.
                # For all files but the first one, cut off the header line.
                merged_lines.extend(file_lines if i == 0 else file_lines[1:])
            if temp_files:
                # Terminate every line with a plain newline. The file is opened in text mode, which
                # already translates newlines to the platform-specific line separator.
                result_file.write_text("".join(line + "\n" for line in merged_lines))

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=TRAIN_PREFIX).values()),
        val_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=VALIDATION_PREFIX).values()),
        train_diagnostics=lightning_model.train_diagnostics,
        val_diagnostics=lightning_model.val_diagnostics,
        optimal_temperature_scale_values_per_checkpoint_epoch=[]
    )

    logging.info("Finished training")

    # Since we have trained the model further, let the checkpoint_handler object know so it can handle
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(config.visualization_folder))

    if resource_monitor:
        # stop the resource monitoring process
        logging.info("Shutting down the resource monitor process. Aggregate resource utilization:")
        # Each aggregate metric is written to the log, and also to the run context unless running offline.
        for name, value in resource_monitor.read_aggregate_metrics():
            logging.info(f"{name}: {value}")
            if not config.is_offline_run:
                RUN_CONTEXT.log(name, value)
        resource_monitor.kill()

    return model_training_results