Example #1
    def backward(
        self,
        model: LightningModule,
        closure_loss: torch.Tensor,
        optimizer: torch.optim.Optimizer,
        opt_idx: int,
        should_accumulate: bool,
        *args: Any,
        **kwargs: Any,
    ) -> torch.Tensor:
        """performs the actual backpropagation

        Args:
            model: the model to be optimized
            closure_loss: the loss value obtained from the closure
            optimizer: the optimizer to perform the step later on
            opt_idx: the optimizer's index
            should_accumulate: whether to accumulate gradients or not

        """
        automatic_optimization = model.automatic_optimization

        # do backward pass
        if automatic_optimization:
            model.backward(closure_loss, optimizer, opt_idx)
        else:
            closure_loss.backward(*args, **kwargs)

        # once backward has been applied, release graph
        closure_loss = closure_loss.detach()

        return closure_loss
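
The core pattern in this hook is: run backpropagation on the closure loss, then detach it so the autograd graph can be released. A minimal, self-contained sketch of that pattern in plain PyTorch (placeholder tensors only, not the library's actual call site):

    import torch

    # toy parameters and a scalar "closure loss"
    w = torch.randn(3, requires_grad=True)
    closure_loss = (w ** 2).sum()

    closure_loss.backward()               # gradients accumulate into w.grad
    closure_loss = closure_loss.detach()  # release the graph, keep the value

    print(w.grad)                      # accumulated gradients
    print(closure_loss.requires_grad)  # False: the graph has been released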
Example #2
    def __setup_tpu_training(self, model: LightningModule, trainer):
        # use the default device from the process
        # tpu_device = xm.xla_device()

        # if given an ordinal device, use this as the device
        if trainer.tpu_id is not None:
            tpu_device = xm.xla_device(trainer.tpu_id)
        else:
            tpu_device = xm.xla_device()
        # track the device and move model to it
        trainer._device = tpu_device
        model.to(trainer._device)

        # get the appropriate tpu ranks
        trainer.tpu_local_core_rank = xm.get_local_ordinal()
        trainer.tpu_global_core_rank = xm.get_ordinal()

        # avoid duplicating progress bar
        if trainer.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None:
            trainer.progress_bar_callback.disable()

        trainer.global_rank = trainer.tpu_local_core_rank
        rank_zero_only.rank = trainer.global_rank

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.setup_optimizers(model)

        # init 16 bit for TPU
        if trainer.precision == 16:
            os.environ['XLA_USE_BF16'] = str(1)

        log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},'
                 f' global rank: {trainer.tpu_global_core_rank}'
                 f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}')
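
A stand-alone sketch of the same device and rank bookkeeping done with torch_xla directly (this assumes torch_xla is installed and the process runs on a TPU host; no Trainer is involved):

    import os
    import torch_xla.core.xla_model as xm

    # pick the default XLA device for this process and record the TPU ranks
    tpu_device = xm.xla_device()
    local_rank = xm.get_local_ordinal()   # rank within this host
    global_rank = xm.get_ordinal()        # rank across all TPU cores

    # bfloat16 on TPU is toggled through an environment variable, as above
    os.environ['XLA_USE_BF16'] = str(1)

    print(f'TPU local core: {local_rank}, global rank: {global_rank} on {tpu_device}')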
Example #3
    def tpu_train_in_process(self,
                             tpu_core_idx: int,
                             model: LightningModule,
                             trainer=None,
                             mp_queue=None):
        """
        Here we are inside each individual process
        """
        if not trainer:
            trainer = self.trainer
        if not trainer.testing:
            trainer.setup('fit')
            model.setup('fit')

        # setup TPU training
        self.__setup_tpu_training(model, trainer)

        # Run the pretrain routine
        results = trainer.run_pretrain_routine(model)

        # save weights at the end of training
        self.__save_end_of_training_weights(model, trainer)

        # persist info in spawn
        trainer.transfer_distrib_spawn_state_on_fit_end(
            model, mp_queue, results)
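
This method is the body that runs inside each spawned TPU process. A hedged sketch of how such a per-core entry point is typically launched with torch_xla multiprocessing (the function name and nprocs=8 are illustrative assumptions, not Lightning internals):

    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.xla_multiprocessing as xmp

    def _train_fn(tpu_core_idx):
        # each spawned process lands here, analogous to tpu_train_in_process
        device = xm.xla_device()
        print(f'core {tpu_core_idx} running on {device}')

    if __name__ == '__main__':
        # nprocs=8 assumes an 8-core TPU; adjust to the actual topology
        xmp.spawn(_train_fn, args=(), nprocs=8, start_method='fork')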
Example #4
    def backward(
        self,
        model: LightningModule,
        closure_loss: torch.Tensor,
        optimizer: Optimizer,
        opt_idx: int,
        should_accumulate: bool,
        *args: Any,
        **kwargs: Any,
    ) -> torch.Tensor:
        """performs the actual backpropagation

        Args:
            model: the model to be optimized
            closure_loss: the loss value obtained from the closure
            optimizer: the optimizer to perform the step later on
            opt_idx: the optimizer's index
            should_accumulate: whether to accumulate gradients or not

        """
        closure_loss = amp.scale_loss(
            closure_loss,
            model.trainer.optimizers if optimizer is None else optimizer)

        # enter apex context
        context = closure_loss
        closure_loss = closure_loss.__enter__()

        # do backward pass
        # TODO: not entirely sure, why we need this
        if model is not None and isinstance(model, LightningModule):
            model.backward(closure_loss, optimizer, opt_idx, **kwargs)

            # TODO: avoid dev_debugger and track these calls with mock
            model.trainer.dev_debugger.track_event('AMP', str(AMPType.APEX))

        else:
            closure_loss.backward(*args, **kwargs)

        # exit amp context
        a, b, c = None, None, None
        error = context.__exit__(a, b, c)
        if error:
            rank_zero_warn(a, b, c)
            raise Exception("apex unscale error")

        # once backward has been applied, release graph
        closure_loss = closure_loss.detach()
        return closure_loss
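
The manual __enter__/__exit__ calls above are equivalent to using amp.scale_loss as a context manager. A hedged, self-contained sketch of that pattern (requires NVIDIA apex and a CUDA device; the model and optimizer are placeholders):

    import torch
    from apex import amp

    model = torch.nn.Linear(4, 1).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    closure_loss = model(torch.randn(2, 4, device='cuda')).sum()

    # same effect as the explicit __enter__/__exit__ sequence in the method above
    with amp.scale_loss(closure_loss, optimizer) as scaled_loss:
        scaled_loss.backward()

    closure_loss = closure_loss.detach()  # release the graph after backward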
Example #5
    def _setup_nvidia_apex(self, model: LightningModule):
        model, optimizers = model.configure_apex(amp, model,
                                                 self.trainer.optimizers,
                                                 self.trainer.amp_level)
        self.trainer.optimizers = optimizers
        self.trainer.reinit_scheduler_properties(self.trainer.optimizers,
                                                 self.trainer.lr_schedulers)
        return model
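
_setup_nvidia_apex delegates the actual wrapping to the module's configure_apex hook. A hedged sketch of a user override of that hook, mirroring the default behaviour of wrapping model and optimizers with amp.initialize (MyModel is a hypothetical module):

    from pytorch_lightning import LightningModule

    class MyModel(LightningModule):
        def configure_apex(self, amp, model, optimizers, amp_level):
            # wrap the model and optimizers with apex AMP at the requested level
            model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)
            return model, optimizers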