def backward(
    self,
    closure_loss: torch.Tensor,
    optimizer: torch.optim.Optimizer,
    opt_idx: int,
    should_accumulate: bool,
    *args,
    **kwargs,
) -> torch.Tensor:
    """Forward a backward() call to the precision plugin.

    Args:
        closure_loss: tensor holding the loss value to backpropagate
        optimizer: the optimizer that will perform the step later on
        opt_idx: the index of that optimizer
        should_accumulate: whether gradients are being accumulated

    Returns:
        whatever the precision plugin's ``backward`` returns.
    """
    result = self.precision_plugin.backward(
        self.lightning_module,
        closure_loss,
        optimizer,
        opt_idx,
        should_accumulate,
        *args,
        **kwargs,
    )

    # TODO: this is a hack, find a better solution for this (hook?)
    # Make sure any pending Horovod gradient communication has completed
    # before the optimizer is stepped.
    if isinstance(self.training_type_plugin, HorovodPlugin):
        optimizer.synchronize()

    return result
def _step_optimizer( self, optimizer: torch.optim.Optimizer, # type: ignore clip_grads: Optional[Callable[[Iterator], None]] = None, ) -> None: """ Perform a single optimization step. This function must be called once for each optimizer. However, the order of different optimizers' optimization steps can be specified by calling this function in different orders. Also, gradient accumulation across iterations is performed by the Determined training loop by setting the experiment configuration optimization.aggregation_frequency. Arguments: optimizer(``torch.optim.Optimizer``): Which optimizer should be stepped. clip_grads(a function, optional): This function should have one argument for parameters in order to clip the gradients """ if self._should_communicate_and_update(): # Communication needs to be synchronized so that is completed # before we apply gradient clipping and `step()`. if self.hvd_config.use and not self._use_amp: optimizer.synchronize() parameters = ([ p for group in optimizer.param_groups for p in group.get("params", []) ] if not self._use_amp else apex.amp.master_params(optimizer)) if self.hvd_config.average_aggregated_gradients: self._average_gradients( parameters=parameters, divisor=self.hvd_config.aggregation_frequency) if clip_grads is not None: clip_grads(parameters) if self.hvd_config.use: with optimizer.skip_synchronize(): optimizer.step() else: optimizer.step() optimizer.zero_grad()
def step_optimizer(
    self,
    optimizer: torch.optim.Optimizer,
    clip_grads: Optional[Callable[[Iterator], None]] = None,
    auto_zero_grads: bool = True,
    scaler: Optional[Any] = None,
    # Should be torch.cuda.amp.GradScaler, but:
    # * other implementations might be possible
    # * requiring this type forces upgrades to PyTorch 1.6+
) -> None:
    """
    Perform a single optimization step.

    This function must be called once for each optimizer. However, the order of
    different optimizers' steps can be specified by calling this function in different
    orders. Also, gradient accumulation across iterations is performed by the Determined
    training loop by setting the experiment configuration field
    :ref:`optimizations.aggregation_frequency <config-aggregation-frequency>`.

    Here is a code example:

    .. code-block:: python

        def clip_grads(params):
            torch.nn.utils.clip_grad_norm_(params, 0.0001),

        self.context.step_optimizer(self.opt1, clip_grads)

    Arguments:
        optimizer(``torch.optim.Optimizer``): Which optimizer should be stepped.
        clip_grads(a function, optional): This function should have one argument for
            parameters in order to clip the gradients.
        auto_zero_grads(bool, optional): Automatically zero out gradients automatically after
            stepping the optimizer. If false, you need to call ``optimizer.zero_grad()``
            manually. Note that if :ref:`optimizations.aggregation_frequency
            <config-aggregation-frequency>` is greater than 1, ``auto_zero_grads`` must be true.
        scaler(``torch.cuda.amp.GradScaler``, optional): The scaler to use for stepping the
            optimizer. This should be unset if not using AMP, and is necessary if
            ``wrap_scaler()`` was called directly.
    """
    # Manual zeroing is only allowed when no cross-iteration gradient
    # aggregation happens (aggregation_frequency == 1).
    check.true(
        auto_zero_grads or self._aggregation_frequency == 1,
        "if optimizations.aggregation_frequency is larger than 1, "
        "you can only set auto_zero_grads to be true. ",
    )

    # On pure accumulation iterations there is nothing to communicate or step.
    if not self._should_communicate_and_update():
        return

    # Communication needs to be synchronized so that is completed
    # before we apply gradient clipping and `step()`.
    # In the case of APEX this is called in backward() instead, so that it's inside the context
    # manager and before unscaling.
    # In the case of PyTorch DDP, losses are synchronized during the backwards() pass.
    if (
        self.distributed.size > 1
        and self._distributed_backend.use_horovod()
        and not self._use_apex
    ):
        with self._record_timing("train_batch.sync_optimizers", accumulate=True):
            optimizer.synchronize()  # type: ignore

    # Under APEX, average/clip operate on the master (FP32) gradient copies.
    parameters = (
        [p for group in optimizer.param_groups for p in group.get("params", [])]
        if not self._use_apex
        else apex.amp.master_params(optimizer)
    )

    if self._average_aggregated_gradients:
        self._average_gradients(parameters=parameters, divisor=self._aggregation_frequency)

    if clip_grads is not None:
        # Unscale before clipping so the clip threshold applies to the true
        # (unscaled) gradient magnitudes.
        if self._scaler and self.experimental._auto_amp:
            self._scaler.unscale_(optimizer)
        clip_grads(parameters)

    # For stepping the optimizer we will operate on the scaler passed
    # in, or fall back to the wrapped scaler (if any).
    if scaler is None and self.experimental._auto_amp:
        scaler = self._scaler
    if scaler:

        def step_fn() -> None:
            scaler.step(optimizer)  # type: ignore

    else:
        step_fn = optimizer.step  # type: ignore

    # In the case of PyTorch DDP, losses are synchronized automatically on the backwards() pass
    if self.distributed.size > 1 and self._distributed_backend.use_horovod():
        # Horovod communication already happened above; don't repeat it in step().
        with optimizer.skip_synchronize():  # type: ignore
            step_fn()
    else:
        step_fn()

    if auto_zero_grads:
        optimizer.zero_grad()
def step_optimizer(
    self,
    optimizer: torch.optim.Optimizer,
    clip_grads: Optional[Callable[[Iterator], None]] = None,
    auto_zero_grads: bool = True,
) -> None:
    """
    Perform a single optimization step.

    This function must be called once for each optimizer. However, the order of
    different optimizers' steps can be specified by calling this function in different
    orders. Also, gradient accumulation across iterations is performed by the Determined
    training loop by setting the experiment configuration field
    :ref:`optimizations.aggregation_frequency <config-aggregation-frequency>`.

    Here is a code example:

    .. code-block:: python

        def clip_grads(params):
            torch.nn.utils.clip_grad_norm_(params, 0.0001),

        self.context.step_optimizer(self.opt1, clip_grads)

    Arguments:
        optimizer(``torch.optim.Optimizer``): Which optimizer should be stepped.
        clip_grads(a function, optional): This function should have one argument for
            parameters in order to clip the gradients.
        auto_zero_grads(bool, optional): Automatically zero out gradients automatically after
            stepping the optimizer. If false, you need to call ``optimizer.zero_grad()``
            manually. Note that if :ref:`optimizations.aggregation_frequency
            <config-aggregation-frequency>` is greater than 1, ``auto_zero_grads`` must be true.
    """
    # BUG FIX: this guard previously tested `aggregation_frequency > 1`, which is
    # vacuously satisfied in exactly the case it is meant to reject
    # (auto_zero_grads=False with aggregation_frequency > 1) and fails in the
    # legal case (auto_zero_grads=False with aggregation_frequency == 1).
    # Manual zeroing is only allowed when no cross-iteration gradient
    # aggregation happens, i.e. aggregation_frequency == 1.
    check.true(
        auto_zero_grads or self.hvd_config.aggregation_frequency == 1,
        "if optimizations.aggregation_frequency is larger than 1, "
        "you can only set auto_zero_grads to be true. ",
    )

    # On pure accumulation iterations there is nothing to communicate or step.
    if not self._should_communicate_and_update():
        return

    # Communication needs to be synchronized so that is completed
    # before we apply gradient clipping and `step()`.
    if self.hvd_config.use and not self._use_amp:
        optimizer.synchronize()  # type: ignore

    # Under AMP, average/clip operate on apex's master (FP32) gradient copies.
    parameters = (
        [p for group in optimizer.param_groups for p in group.get("params", [])]
        if not self._use_amp
        else apex.amp.master_params(optimizer)
    )

    if self.hvd_config.average_aggregated_gradients:
        self._average_gradients(
            parameters=parameters, divisor=self.hvd_config.aggregation_frequency
        )

    if clip_grads is not None:
        clip_grads(parameters)

    if self.hvd_config.use:
        # Gradients were already synchronized above; skip doing it again in step().
        with optimizer.skip_synchronize():  # type: ignore
            optimizer.step()
    else:
        optimizer.step()

    if auto_zero_grads:
        optimizer.zero_grad()