def backward( self, model: LightningModule, closure_loss: torch.Tensor, optimizer: torch.optim.Optimizer, opt_idx: int, should_accumulate: bool, *args: Any, **kwargs: Any, ) -> torch.Tensor: """performs the actual backpropagation Args: model: the model to be optimized closure_loss: the loss value obtained from the closure optimizer: the optimizer to perform the step lateron opt_idx: the optimizer's index should_accumulate: whether to accumulate gradients or not """ automatic_optimization = model.automatic_optimization # do backward pass if automatic_optimization: model.backward(closure_loss, optimizer, opt_idx) else: closure_loss.backward(*args, **kwargs) # once backward has been applied, release graph closure_loss = closure_loss.detach() return closure_loss
def __setup_tpu_training(self, model: LightningModule, trainer): # use the default device from the process # tpu_device = xm.xla_device() # if given an ordinal device, use this as the device if trainer.tpu_id is not None: tpu_device = xm.xla_device(trainer.tpu_id) else: tpu_device = xm.xla_device() # track the device and move model to it trainer._device = tpu_device model.to(trainer._device) # get the appropriate tpu ranks trainer.tpu_local_core_rank = xm.get_local_ordinal() trainer.tpu_global_core_rank = xm.get_ordinal() # avoid duplicating progress bar if trainer.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None: trainer.progress_bar_callback.disable() trainer.global_rank = trainer.tpu_local_core_rank rank_zero_only.rank = trainer.global_rank # CHOOSE OPTIMIZER # allow for lr schedulers as well self.setup_optimizers(model) # init 16 bit for TPU if trainer.precision == 16: os.environ['XLA_USE_BF16'] = str(1) log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},' f' global rank: {trainer.tpu_global_core_rank}' f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}')
def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, trainer=None, mp_queue=None): """ Here we are inside each individual process """ if not trainer: trainer = self.trainer if not trainer.testing: trainer.setup('fit') model.setup('fit') # setup TPU training self.__setup_tpu_training(model, trainer) # Run the pretrain routine results = trainer.run_pretrain_routine(model) # save weights at the end of training self.__save_end_of_training_weights(model, trainer) # persist info in spawn trainer.transfer_distrib_spawn_state_on_fit_end( model, mp_queue, results)
def backward( self, model: LightningModule, closure_loss: torch.Tensor, optimizer: Optimizer, opt_idx: int, should_accumulate: bool, *args: Any, **kwargs: Any, ) -> torch.Tensor: """performs the actual backpropagation Args: model: the model to be optimized closure_loss: the loss value obtained from the closure optimizer: the optimizer to perform the step lateron opt_idx: the optimizer's index should_accumulate: whether to accumulate gradients or not """ closure_loss = amp.scale_loss( closure_loss, model.trainer.optimizers if optimizer is None else optimizer) # enter apex context context = closure_loss closure_loss = closure_loss.__enter__() # do backward pass # TODO: not entirely sure, why we need this if model is not None and isinstance(model, LightningModule): model.backward(closure_loss, optimizer, opt_idx, **kwargs) # TODO: avoid dev_debugger and track these calls with mock model.trainer.dev_debugger.track_event('AMP', str(AMPType.APEX)) else: closure_loss.backward(*args, **kwargs) # exit amp context a, b, c = None, None, None error = context.__exit__(a, b, c) if error: rank_zero_warn(a, b, c) raise Exception("apex unscale error") # once backward has been applied, release graph closure_loss = closure_loss.detach() return closure_loss
def _setup_nvidia_apex(self, model: LightningModule): model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level) self.trainer.optimizers = optimizers self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers) return model