def update_model_for_multiple_gpus(model_and_info: ModelAndInfo,
                                   args: ModelConfigBase,
                                   execution_mode: ModelExecutionMode = ModelExecutionMode.TRAIN) -> \
        ModelAndInfo:
    """
    Updates a given torch model such that input mini-batches are parallelized across the batch dimension to utilise
    multiple GPUs. If model parallel is set to True and execution is in test mode, the model is partitioned to
    perform full volume inference.
    :param model_and_info: The ModelAndInfo object holding the network and the optimizer.
    :param args: The arguments object with attributes used to enable amp training and create the parallel model.
    :param execution_mode: The execution mode, i.e. train or test.
    :return: The updated model and optimizer, contained in the same ModelAndInfo object.
    """
    if model_and_info.is_adjusted:
        logging.debug("model_and_info.is_adjusted is already True")
        return model_and_info
    if args.use_gpu:
        # In the normal training codepath, the model should already be on the GPU, but in some tests not.
        model_and_info.to_cuda()
        logging.info("Adjusting the model to use mixed precision training.")
        # If model parallel is set to True, then partition the network across all available gpus.
        if args.use_model_parallel:
            devices = args.get_cuda_devices()
            assert devices is not None  # for mypy
            model_and_info.model.partition_model(
                devices=devices)  # type: ignore
    else:
        logging.info(
            "Making no adjustments to the model because no GPU was found.")

    # Update model related config attributes (After Model Parallel Activated)
    args.adjust_after_mixed_precision_and_parallel(model_and_info.model)

    # DataParallel enables running the model on multiple GPUs by splitting samples across them.
    # If the model is used in training mode, data parallel is always activated.
    # Likewise, if model parallel is not activated, data parallel is used as the fallback option.
    use_data_parallel = (execution_mode == ModelExecutionMode.TRAIN) or (
        not args.use_model_parallel)
    if args.use_gpu and use_data_parallel:
        logging.info("Adjusting the model to use DataParallel")
        # Move all layers to the default GPU before activating data parallel.
        # This is necessary even though the model was moved to the GPU at the beginning of the method,
        # because model parallelism may have spread it across multiple GPUs since then.
        model_and_info.to_cuda()
        model_and_info.set_data_parallel(device_ids=args.get_cuda_devices())

    model_and_info.is_adjusted = True
    logging.debug("model_and_info.is_adjusted set to True")
    return model_and_info
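
# Usage sketch (illustrative only): assuming a config (ModelConfigBase) and a
# model_and_info (ModelAndInfo) have already been created elsewhere, adjusting the
# model for training could look like the hypothetical helper below. The helper name
# _example_adjust_for_training is not part of this codebase.
def _example_adjust_for_training(model_and_info: ModelAndInfo,
                                 config: ModelConfigBase) -> ModelAndInfo:
    # The first call moves the model to the GPU (if available), optionally partitions it
    # for model parallelism, and wraps it in DataParallel for training.
    adjusted = update_model_for_multiple_gpus(model_and_info, config,
                                              execution_mode=ModelExecutionMode.TRAIN)
    # A second call is a no-op because is_adjusted is already True on the returned object.
    assert update_model_for_multiple_gpus(adjusted, config) is adjusted
    return adjusted
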
    def _adjust_for_gpus(
            cls, model: DeviceAwareModule, config: ModelConfigBase,
            model_execution_mode: ModelExecutionMode) -> DeviceAwareModule:
        """
        Updates a torch model so that input mini-batches are parallelized across the batch dimension to utilise
        multiple GPUs. If model parallel is set to True and execution is in test mode, the model is partitioned to
        perform full volume inference.
        This assumes that the model has been created, that the optimizer has not yet been created, and that the model
        has not already been adjusted. This method should not be called externally; use adjust_model_for_gpus
        or adjust_mean_teacher_model_for_gpus instead.
        :return: The adjusted model.
        """
        if config.use_gpu:
            model = model.cuda()
            logging.info(
                "Adjusting the model to use mixed precision training.")
            # If model parallel is set to True, then partition the network across all available gpus.
            if config.use_model_parallel:
                devices = config.get_cuda_devices()
                assert devices is not None  # for mypy
                model.partition_model(devices=devices)  # type: ignore
        else:
            logging.info(
                "Making no adjustments to the model because no GPU was found.")

        # Update model related config attributes (After Model Parallel Activated)
        config.adjust_after_mixed_precision_and_parallel(model)

        # DataParallel enables running the model on multiple GPUs by splitting samples across them.
        # If the model is used in training mode, data parallel is always activated.
        # Likewise, if model parallel is not activated, data parallel is used as the fallback option.
        use_data_parallel = (model_execution_mode == ModelExecutionMode.TRAIN
                             ) or (not config.use_model_parallel)
        if config.use_gpu and use_data_parallel:
            logging.info("Adjusting the model to use DataParallel")
            # Move all layers to the default GPU before activating data parallel.
            # This is necessary even though the model was moved to the GPU at the beginning of the method,
            # because model parallelism may have spread it across multiple GPUs since then.
            model = model.cuda()
            model = DataParallelModel(model,
                                      device_ids=config.get_cuda_devices())

        return model
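
# Minimal sketch (not part of the class above) of what the DataParallel wrapping implies
# for a forward pass: the input mini-batch is split along dimension 0 and scattered across
# the available GPUs. Plain torch.nn.DataParallel is used here purely for illustration;
# DataParallelModel is this codebase's own wrapper and may differ in how outputs are
# gathered. All tensor shapes below are made up.
import torch


def _example_data_parallel_forward() -> None:
    if not torch.cuda.is_available():
        return
    layer = torch.nn.Linear(16, 4).cuda()
    wrapped = torch.nn.DataParallel(layer, device_ids=list(range(torch.cuda.device_count())))
    batch = torch.rand(8, 16).cuda()  # a batch of 8 samples is split across the GPUs
    output = wrapped(batch)  # with torch.nn.DataParallel, outputs are gathered on the default GPU
    assert output.shape == (8, 4)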