Example #1
0
def forward_preserve_state(module: DeviceAwareModule,
                           inputs: List[torch.Tensor]) -> torch.Tensor:
    """
    Perform forward pass on input module with given list of torch tensors. The function preserves the random state
    of the backend libraries to avoid reproducibility issues. Additionally, it temporarily sets the model in
    evaluation mode for inference and then restores its previous state.
    :param module: Callable torch module
    :param inputs: List of input torch tensors
    :return: Output torch tensor
    """
    if not isinstance(inputs, list):
        raise RuntimeError("Inputs object has to be a list of torch tensors")

    if module.is_model_on_gpu():
        inputs = [input_tensor.cuda() for input_tensor in inputs]

    # snapshot the current random state so that it can be restored after the forward pass
    module_state = RandomStateSnapshot.snapshot_random_state()

    with set_model_to_eval_mode(module):
        with torch.no_grad():
            output = module.forward(*inputs)

    # restore the seed for torch and numpy
    module_state.restore_random_state()

    return output
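
# Hypothetical usage sketch for the function above: `model` can be any DeviceAwareModule
# subclass, and the input shape shown here is purely illustrative. The only requirements
# taken from the function itself are that the inputs are wrapped in a list.
def example_forward_preserve_state(model: DeviceAwareModule) -> torch.Tensor:
    batch = [torch.rand(2, 1, 32, 32, 32)]
    output = forward_preserve_state(model, batch)
    # After the call, the global torch/numpy random state and the model's train/eval
    # flag are the same as they were before it.
    return output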
def summary_for_segmentation_models(config: ModelConfigBase, model: DeviceAwareModule) -> None:
    """
    Generates a human readable summary of the present segmentation model, writes it to logging.info, and
    stores the ModelSummary object inside the argument `model`.

    :param config: The configuration for the model.
    :param model: The instantiated Pytorch model.
    """
    assert isinstance(model, BaseSegmentationModel)
    crop_size = config.crop_size
    if isinstance(crop_size, int):
        crop_size = (crop_size, crop_size, crop_size)
    try:
        model.generate_model_summary(crop_size, log_summaries_to_files=config.log_summaries_to_files)
    except AttributeError as e:
        logging.warning(f"summary_for_segmentation_models failed with exception {e}")
    def __init__(self, config: DeepLearningConfig, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.outputs_folder = config.outputs_folder
        self.checkpoint_folder = config.checkpoint_folder
        self.model: DeviceAwareModule = DeviceAwareModule()
        # These two will be set later in set_optimizer_and_scheduler.
        # The ddp_spawn accelerator only works if the model configuration object is
        # not stored in here. Hence, operations that require the full config must be
        # done in a way that does not require storing the config.
        self.optimizer: Optional[Optimizer] = None
        self.l_rate_scheduler: Optional[_LRScheduler] = None
        self.cross_validation_split_index = config.cross_validation_split_index
        self.effective_random_seed = config.get_effective_random_seed()
        # This should be re-assigned on the outside, to a logger that is hooked up with the Trainer object.
        self.storing_logger = StoringLogger()
        # This will be initialized correctly in epoch_start
        self.random_state: Optional[RandomStateSnapshot] = None
        # training loggers
        self.train_metrics_folder = self.outputs_folder / ModelExecutionMode.TRAIN.value
        self.val_metrics_folder = self.outputs_folder / ModelExecutionMode.VAL.value
        fixed_logger_columns = {
            LoggingColumns.CrossValidationSplitIndex.value:
            config.cross_validation_split_index
        }
        self.train_epoch_metrics_logger = DataframeLogger(
            self.train_metrics_folder / EPOCH_METRICS_FILE_NAME,
            fixed_columns=fixed_logger_columns)
        self.val_epoch_metrics_logger = DataframeLogger(
            self.val_metrics_folder / EPOCH_METRICS_FILE_NAME,
            fixed_columns=fixed_logger_columns)
        # Stores information about the checkpoint that created this model, if any.
        self.checkpoint_loading_message = ""
def generate_and_print_model_summary(config: ModelConfigBase, model: DeviceAwareModule) -> None:
    """
    Writes a human readable summary of the present model to logging.info, and logs the number of trainable
    parameters to AzureML.

    :param config: The configuration for the model.
    :param model: The instantiated Pytorch model.
    """
    random_state = RandomStateSnapshot.snapshot_random_state()
    # There appears to be a bug in apex, where previous use (in training for example) causes problems
    # when another model is later built on the CPU (for example, before loading from a checkpoint)
    # https://github.com/NVIDIA/apex/issues/694
    # Hence, move the model to the GPU before doing model summary.
    if config.use_gpu:
        model = model.cuda()
    if isinstance(config, ScalarModelBase):
        # To generate the model summary, read the first item of the dataset. Then use the model's own
        # get_model_input function to convert the dataset item to input tensors, and feed them through the model.
        train_dataset = config.get_torch_dataset_for_inference(ModelExecutionMode.TRAIN)
        train_item_0 = next(iter(train_dataset.as_data_loader(shuffle=False, batch_size=1, num_dataload_workers=0)))
        model_inputs = get_scalar_model_inputs_and_labels(config, model, train_item_0).model_inputs
        # The model inputs may already be converted to float16, assuming that we would do mixed precision.
        # However, the model is not yet converted to float16 when this function is called, hence convert back to float32
        summary = ModelSummary(model)
        summary.generate_summary(input_tensors=model_inputs, log_summaries_to_files=config.log_summaries_to_files)
    elif config.is_segmentation_model:
        summary_for_segmentation_models(config, model)
        assert model.summarizer
        summary = model.summarizer  # type: ignore
    else:
        raise ValueError("Don't know how to generate a summary for this type of model.")
    RUN_CONTEXT.log(LoggingColumns.NumTrainableParameters, summary.n_trainable_params)
    random_state.restore_random_state()
    def _load_checkpoint(cls, model: DeviceAwareModule, checkpoint_path: Path,
                         key_in_state_dict: str, use_gpu: bool) -> int:
        """
        Loads a checkpoint of a model, may be the model or the mean teacher model. Assumes the model
        has already been created, and the checkpoint exists. This does not set checkpoint epoch.
        This method should not be called externally. Use instead try_load_checkpoint_for_model
        or try_load_checkpoint_for_mean_teacher_model
        :param model: model to load weights
        :param checkpoint_path: Path to checkpoint
        :param key_in_state_dict: the key for the model weights in the checkpoint state dict
        :param reader: Function which takes the path and returns a dict with model and optimizer states
        :return checkpoint epoch from the state dict
        """
        logging.info(f"Loading checkpoint {checkpoint_path}")
        checkpoint = ModelAndInfo.read_checkpoint(checkpoint_path, use_gpu)

        try:
            state_dict = checkpoint[key_in_state_dict]
        except KeyError:
            logging.error(f"Key {key_in_state_dict} not found in checkpoint")
            return False

        if isinstance(model, torch.nn.DataParallel):
            result = model.module.load_state_dict(state_dict, strict=False)
        else:
            result = model.load_state_dict(state_dict, strict=False)

        if result.missing_keys:
            logging.warning(f"Missing keys in model checkpoint: {result.missing_keys}")
        if result.unexpected_keys:
            logging.warning(f"Unexpected keys in model checkpoint: {result.unexpected_keys}")

        return checkpoint[ModelAndInfo.EPOCH_KEY]
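
# Standalone sketch (an assumption for illustration, not the project's actual checkpoint
# writer) of a checkpoint layout that _load_checkpoint above can read: the weights must be
# stored under the value later passed as `key_in_state_dict` ("state_dict" here is a
# hypothetical choice), and the epoch under ModelAndInfo.EPOCH_KEY.
def save_compatible_checkpoint_example(model: torch.nn.Module, epoch: int, path: Path) -> None:
    checkpoint = {
        "state_dict": model.state_dict(),  # hypothetical value of key_in_state_dict
        ModelAndInfo.EPOCH_KEY: epoch,
    }
    torch.save(checkpoint, str(path))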
Example #6
0
def _get_parameters_of_model(model: DeviceAwareModule) -> Any:
    """
    Returns an iterator over the model parameters, unwrapping DataParallelModel if necessary.
    """
    if isinstance(model, DataParallelModel):
        return model.module.parameters()
    else:
        return model.parameters()
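
# Hypothetical usage sketch: the helper above makes optimizer construction independent of
# whether the model has been wrapped in DataParallelModel. The learning rate is illustrative.
def make_optimizer_example(model: DeviceAwareModule) -> torch.optim.Adam:
    return torch.optim.Adam(_get_parameters_of_model(model), lr=1e-4)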
    def __init__(self, model: DeviceAwareModule, model_config: ScalarModelBase,
                 epoch: int, pipeline_id: int) -> None:
        """
        :param model: Model recovered from the checkpoint.
        :param model_config: Model configuration information.
        :param epoch: Epoch of the checkpoint which was recovered.
        :param pipeline_id: ID for this pipeline (useful for ensembles).
        :return:
        """
        super().__init__(model_config)
        self.model = model
        self.epoch = epoch
        self.pipeline_id = pipeline_id

        # Switch model to evaluation mode (if not, results will be different from what we got during training,
        # because batchnorm operates differently).
        model.eval()
    def _adjust_for_gpus(
            cls, model: DeviceAwareModule, config: ModelConfigBase,
            model_execution_mode: ModelExecutionMode) -> DeviceAwareModule:
        """
        Updates a torch model so that input mini-batches are parallelized across the batch dimension to utilise
        multiple gpus. If model parallel is set to True and execution is in test mode, then model is partitioned to
        perform full volume inference.
        This assumes the model has been created, that the optimizer has not yet been created, and that the model has
        not already been adjusted. This method should not be called externally. Use instead
        adjust_model_for_gpus or adjust_mean_teacher_model_for_gpus.
        :return: The adjusted model.
        """
        if config.use_gpu:
            model = model.cuda()
            logging.info(
                "Adjusting the model to use mixed precision training.")
            # If model parallel is set to True, then partition the network across all available gpus.
            if config.use_model_parallel:
                devices = config.get_cuda_devices()
                assert devices is not None  # for mypy
                model.partition_model(devices=devices)  # type: ignore
        else:
            logging.info(
                "Making no adjustments to the model because no GPU was found.")

        # Update model related config attributes (After Model Parallel Activated)
        config.adjust_after_mixed_precision_and_parallel(model)

        # DataParallel enables running the model with multiple gpus by splitting samples across GPUs
        # If the model is used in training mode, data parallel is activated by default.
        # Similarly, if model parallel is not activated, data parallel is used as a backup option
        use_data_parallel = (model_execution_mode == ModelExecutionMode.TRAIN
                             ) or (not config.use_model_parallel)
        if config.use_gpu and use_data_parallel:
            logging.info("Adjusting the model to use DataParallel")
            # Move all layers to the default GPU before activating data parallel.
            # This needs to happen even though the model was moved to the GPU at the beginning of this method,
            # because model parallelism may have spread it across multiple GPUs since then.
            model = model.cuda()
            model = DataParallelModel(model,
                                      device_ids=config.get_cuda_devices())

        return model
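
# Standalone sketch (not part of the class above) of the data-parallel behaviour described in
# the comments: torch.nn.DataParallel splits the input batch along dimension 0 across the given
# GPUs and gathers the outputs on the default device. DataParallelModel is assumed to behave the
# same way for the forward pass; the layer, batch size and device ids are illustrative.
def data_parallel_forward_example() -> torch.Tensor:
    layer = torch.nn.Linear(16, 2).cuda()
    wrapped = torch.nn.DataParallel(layer, device_ids=[0, 1])  # assumes two visible GPUs
    batch = torch.rand(8, 16).cuda()  # 8 samples are split across the GPUs, e.g. 4 per GPU
    return wrapped(batch)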
Example #9
0
    def __init__(self, model: DeviceAwareModule,
                 temperature_scaling_config: TemperatureScalingConfig):
        super().__init__()
        self.model = model
        self.conv_in_3d = model.conv_in_3d
        self.temperature_scaling_config = temperature_scaling_config

        # Assign this parameter to the first model device; otherwise, use the PyTorch default.
        _model_devices = model.get_devices()
        _device = _model_devices[0] if _model_devices else None
        self.temperature = torch.nn.Parameter(torch.ones(1, device=_device), requires_grad=True)
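
    # Sketch (an assumption about the rest of this class, not necessarily its actual forward
    # implementation): standard temperature scaling divides the logits by the learned
    # temperature before softmax/sigmoid, which changes the calibration of the predicted
    # probabilities but not their ranking. The method name is hypothetical.
    def scale_logits_example(self, logits: torch.Tensor) -> torch.Tensor:
        return logits / self.temperature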
Example #10
0
def validate_and_store_model_parameters(writer: tensorboardX.SummaryWriter, epoch: int,
                                        model: DeviceAwareModule) -> None:
    """
    Validates and writes all model weights to the given TensorBoard writer.
    :param writer: TensorBoard summary writer
    :param epoch: The epoch to which these model parameters correspond.
    :param model: The model from which to extract the parameters.
    :return:
    """
    for name, param in model.named_parameters():
        param_numpy = param.clone().cpu().data.numpy()
        check_array_range(param_numpy, error_prefix="Parameter {}".format(name))
        writer.add_histogram(name, param_numpy, epoch)
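
# Hypothetical usage sketch for the function above: write one histogram per model parameter at
# the end of every training epoch. The log directory name and the function name are illustrative.
def log_parameter_histograms_example(model: DeviceAwareModule, num_epochs: int) -> None:
    writer = tensorboardX.SummaryWriter("outputs/tensorboard")
    for epoch in range(1, num_epochs + 1):
        # ... one training epoch would run here ...
        validate_and_store_model_parameters(writer, epoch, model)
    writer.close()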
    def _load_checkpoint(cls, model: DeviceAwareModule, checkpoint_path: Path,
                         key_in_state_dict: str, use_gpu: bool) -> int:
        """
        Loads a checkpoint of a model, may be the model or the mean teacher model. Assumes the model
        has already been created, and the checkpoint exists. This does not set checkpoint epoch.
        This method should not be called externally. Use instead try_load_checkpoint_for_model
        or try_load_checkpoint_for_mean_teacher_model
        :param model: model to load weights
        :param key_in_state_dict: the key for the model weights in the checkpoint state dict
        :return checkpoint epoch form the state dict
        """
        logging.info(f"Loading checkpoint {checkpoint_path}")
        # For model debugging, allow loading a GPU trained model onto the CPU. This will clearly only work
        # if the model is small.
        map_location = None if use_gpu else 'cpu'
        checkpoint = torch.load(str(checkpoint_path),
                                map_location=map_location)

        if isinstance(model, torch.nn.DataParallel):
            model.module.load_state_dict(checkpoint[key_in_state_dict])
        else:
            model.load_state_dict(checkpoint[key_in_state_dict])
        return checkpoint[ModelAndInfo.EPOCH_KEY]
Example #12
0
    def __init__(self, model: DeviceAwareModule) -> None:
        """
        Class to summarise the details of a neural network, including (I) intermediate tensor shapes,
        (II) number of trainable and non-trainable parameters, and (III) total GPU/CPU memory requirements.
        :param model: BaseModel object containing the computation graph.
        """
        # Need a local import here to avoid circular dependency
        from InnerEye.ML.models.architectures.base_model import DeviceAwareModule
        if not isinstance(model, DeviceAwareModule):
            raise ValueError(
                "Input model should be an instance of the DeviceAwareModule class"
            )
        self.use_gpu = model.is_model_on_gpu()
        self.summary: OrderedDict = OrderedDict()
        self.hooks: List[RemovableHandle] = list()

        # Generate a copy to shield the model from torch-profiler hooks.
        self.n_params = 0
        self.n_trainable_params = 0
        self.model = copy.deepcopy(model)
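
    # Sketch of the forward-hook pattern that a summary like this typically relies on (an
    # assumption about the rest of the class, with a hypothetical method name): each hook
    # records the output shape of a submodule during a single forward pass, and the handles
    # collected in self.hooks allow the hooks to be removed again afterwards.
    def register_shape_hooks_example(self) -> None:
        def hook(module: torch.nn.Module, _inputs: Any, outputs: Any) -> None:
            if isinstance(outputs, torch.Tensor):
                key = f"{type(module).__name__}-{len(self.summary)}"
                self.summary[key] = list(outputs.shape)

        for submodule in self.model.modules():
            self.hooks.append(submodule.register_forward_hook(hook))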