def backward(
    self,
    loss: torch.Tensor,
    optimizer: torch.optim.Optimizer,
    scheduler,
    step: bool,
):
    """Handling back-propagation.

    Backpropagates ``loss`` (through apex amp when ``cfg.fp16`` is set),
    optionally clips the gradient norm, and — when ``step`` is True —
    applies the optimizer/scheduler step and clears the model's gradients.
    """
    max_norm = self.cfg.train.max_grad_norm
    if self.cfg.fp16:
        from apex import amp

        # Scale the loss so fp16 gradients do not underflow.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if max_norm > 0:
            # Clip the fp32 master copies, not the fp16 model params.
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
    else:
        loss.backward()
        if max_norm > 0:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm)
    if step:
        optimizer.step()
        scheduler.step()
        self.model.zero_grad()
def optimizer_step_spottune(optimizer_main: Optimizer, optimizer_policy: Optimizer,
                            loss: torch.Tensor, **params) -> torch.Tensor:
    """
    Performs the backward pass with respect to ``loss`` and takes a gradient
    step with both the main and the policy optimizers (SpotTune setup).

    ``params`` must contain ``lr_main`` and ``lr_policy`` — the learning rates
    to set on ``optimizer_main`` and ``optimizer_policy`` respectively before
    the step.

    Returns the (unchanged) ``loss`` tensor.

    Notes
    -----
    The incoming optimizers' parameters are not restored to their original
    values after the call.
    """
    lr_main, lr_policy = params['lr_main'], params['lr_policy']
    # set_params is a project helper that mutates the optimizer's param groups.
    set_params(optimizer_main, lr=lr_main)
    set_params(optimizer_policy, lr=lr_policy)

    optimizer_main.zero_grad()
    optimizer_policy.zero_grad()
    loss.backward()
    optimizer_main.step()
    optimizer_policy.step()
    return loss
def backward(self, loss:torch.Tensor, multi_optim:MultiOptim)->None:
    """Backpropagate ``loss``, routing through amp loss scaling in mixed precision."""
    if not self.is_mixed():
        loss.backward()
        return
    # Mixed precision: scale the loss against the relevant optimizer.
    optimizer = self._get_optim(multi_optim)
    with self._amp.scale_loss(loss, optimizer) as scaled:
        scaled.backward()
def _run_backward(self, tensor: Tensor, model: Optional[Module], *args: Any, **kwargs: Any) -> None: """Lightning-independent backward logic. Currently only used by Lightning Lite. Subject to further refactors. """ tensor.backward(*args, **kwargs)
def policy_svg(policy: TVLinearPolicy, value: Tensor) -> lqr.Linear: """Computes the policy SVG from the estimated return.""" # pylint:disable=invalid-name policy.zero_grad(set_to_none=True) value.backward() K, k = policy.standard_form() return K.grad.clone(), k.grad.clone()
def _backpropagate(self, loss: torch.Tensor): self.optimizer.zero_grad() loss.backward() if self.gradient_clipping_value is not None: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.gradient_clipping_value) self.optimizer.step()
def backward_step(self, model: nn.Module, loss: torch.Tensor,
                  optimizer: optim.Optimizer, scaler: amp.GradScaler):
    """Accumulation-aware backward pass.

    Divides the loss by the cycle length, skips DDP gradient sync on
    intermediate accumulation steps, and only steps the optimizer (via the
    scaler when AMP is active) at the end of a cycle.
    """
    if optimizer is None:
        return
    scaled = loss / self.steps
    if scaler is not None:
        scaled = scaler.scale(scaled)
    if self.is_start_cycle:
        optimizer.zero_grad()
    # Avoid the all-reduce on intermediate steps of the accumulation cycle.
    skip_sync = (
        isinstance(model, nn.parallel.DistributedDataParallel)
        and not self.is_end_cycle
    )
    if skip_sync:
        with model.no_sync():
            scaled.backward()
    else:
        scaled.backward()
    if self.is_end_cycle:
        if scaler is not None:
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()
    self.inc_counter()
def _sp_double_backward_update(pos_out: Tensor, neg_out: Tensor, param: Parameter, gamma: float, l1_reg: float, l2_reg: float, pos: Tensor = None): param.grad = None # first backward neg_out.backward() neg = param.grad.relu_().add_(eps) if pos is None: param.grad = None pos_out.backward() pos = param.grad.relu_().add_(eps) if l1_reg > 0: pos.add_(l1_reg) if l2_reg > 0: pos = pos.add(param.data, alpha=l2_reg) multiplier = neg.div_(pos) if gamma != 1: multiplier.pow_(gamma) param.data.mul_(multiplier)
def backprop(loss: torch.Tensor, model: torch.nn.Module, optimizer):
    """Zero accumulated gradients, backpropagate ``loss``, and step ``optimizer``.

    ``model`` is unused here but kept in the signature for interface
    compatibility with existing callers. (Dead commented-out gradient-clamp
    experiments were removed; reintroduce clipping explicitly if needed.)

    :param loss: scalar loss to backpropagate
    :param model: the model being optimized (unused)
    :param optimizer: optimizer holding the model's parameters
    """
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def step(iteration: int, loss: Tensor, optimizer: Adam, scheduler: ExponentialLR) -> None:
    """Do one training step.

    Clears stale gradients, backpropagates ``loss``, applies the optimizer
    update, then advances the learning-rate schedule for ``iteration``.
    """
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # NOTE: passing the epoch/iteration to step() is deprecated in newer
    # torch versions but preserved here for identical behavior.
    scheduler.step(iteration)
def optimize_agent_network(loss: Tensor) -> None:
    # PPO optimizes multiple times per batch, so stat logging lives in a
    # separate function (log_ppo_stats) rather than here.
    agent_optimizer.zero_grad()
    loss.backward()
    # Clip the global gradient norm to stabilize the policy update.
    nn.utils.clip_grad_norm_(agent_network.parameters(), max_norm=0.5)
    agent_optimizer.step()
def update_action_values(self, guess: torch.Tensor, error, state_hash):
    """TD(lambda)-style parameter update using per-state eligibility traces.

    Backpropagates a unit gradient through ``guess``, folds the resulting
    parameter gradients into decayed eligibility traces keyed by state, and
    nudges every parameter by ``alpha * error * trace`` for each known state.

    :param guess: network output to backpropagate through (must require grad)
    :param error: TD error scaling the update
        (NOTE(review): multiplied elementwise with traces — presumably a
        float or 0-dim tensor; confirm against callers)
    :param state_hash: hashable key identifying the current state
    """
    # error = torch.from_numpy(error).float()
    # initial_gradient = torch.randn([1, 1, 3, 3])
    # guess.retain_grad()
    # guess.backward(initial_gradient)
    ## We now have the grad in each layer
    # torch.nn.utils.clip_grad_value_(self.conv1.weight, 0.1)
    # torch.nn.utils.clip_grad_value_(self.conv2.weight, 0.1)
    # torch.nn.utils.clip_grad_value_(self.conv3.weight, 0.1)
    # torch.nn.utils.clip_grad_value_(self.conv_final.weight, 0.1)
    # NOTE(review): device is hard-coded to 'cuda' — breaks on CPU-only hosts.
    initial_gradient = torch.ones([1, 1, 3, 3]).to('cuda')
    guess.retain_grad()
    # retain_graph keeps the graph alive for repeated calls on the same forward.
    guess.backward(initial_gradient, retain_graph=True)
    for name, parameter in self.named_parameters():
        if not parameter.requires_grad:
            continue
        # Lazily initialize the trace for this (parameter, state) pair.
        if state_hash not in self.e[name]:
            self.e[name][state_hash] = 0
        # Decay every state's trace, then accumulate the fresh gradient into
        # the current state's trace.
        for state in self.e[name].keys():
            self.e[name][state] = self.e[name][state] * self.lambd * self.gamma
        self.e[name][state_hash] += parameter.grad
        # Apply the TD update, scaled by each state's eligibility.
        for state in self.e[name].keys():
            parameter.data -= self.alpha * error * self.e[name][state]
        # Reset the gradient so the next backward starts clean.
        parameter.grad.zero_()
def compute_grad_receptive_field(mod: ModuleInfo, input: Tensor, output: Tensor, infos: ModuleInfoIndex) -> Tuple[int, ...]:
    """Measure ``mod``'s effective receptive field by gradient probing.

    Injects a one-hot gradient at the spatial center of ``output`` and
    backpropagates it; the bounding box of nonzero entries in ``input.grad``
    gives the receptive-field extent along each spatial dimension.

    Returns ``(-1,)`` when ``output`` is not a single Tensor, since there is
    then nothing to backpropagate from.
    """
    if not isinstance(output, Tensor):
        # We cannot backward() from this. Skip, we just won't be able
        # to compute the receptive field here without some other
        # affordance to combine the tensors in this output into a
        # "loss"
        return (-1, )
    # https://github.com/rogertrullo/Receptive-Field-in-Pytorch/blob/master/Receptive_Field.ipynb
    # NOTE(review): fake_grad is allocated on the default (CPU) device —
    # confirm output is on CPU too, otherwise backward() will complain.
    fake_grad = torch.zeros(output.shape)
    # batch=0, channel=0
    center_pos = (0, 0, *[i // 2 for i in output.shape[2:]])
    fake_grad[center_pos] = 1
    # zero_grad everything before and including this module, since backward() accumulates
    for i in range(0, mod.input_order + 1):
        infos.by_input_order[i].module.zero_grad()
    # retain_graph so we can run this multiple times and not drop the
    # intermediate results from forward()
    output.backward(gradient=fake_grad, retain_graph=True)
    # Find the extent of pixels affected; drop batch/channel
    nonzero_idxs = input.grad.nonzero(as_tuple=True)[2:]
    rf_dims = [(d.max() - d.min() + 1).item() for d in nonzero_idxs]
    return tuple(rf_dims)
def backward(self, trainer, loss: Tensor, optimizer: Optimizer, optimizer_idx: int) -> None:
    """Override backward with your own implementation if you need to.

    :param trainer: Pointer to the trainer
    :param loss: Loss is already scaled by accumulated grads
    :param optimizer: Current optimizer being used
    :param optimizer_idx: Index of the current optimizer being used

    Called to perform backward step. Feel free to override as needed.
    The loss passed in has already been scaled for accumulated gradients
    if requested.

    .. code-block:: python

        def backward(self, use_amp, loss, optimizer):
            if use_amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
    """
    if trainer.precision != 16:
        loss.backward()
        return
    # .backward is not special on 16-bit with TPUs: nothing to do here.
    if trainer.on_tpu:
        return
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
def _backprop_step(self, loss: Tensor) -> None: # Clean gradients self.optimizer.zero_grad() # Backpropate the loss loss.backward() # Update the params self.optimizer.step()
def after_loss_fn_new(_input: torch.Tensor, _label: torch.Tensor,
                      _output: torch.Tensor, loss: torch.Tensor,
                      optimizer: Optimizer,
                      loss_fn: Callable[..., torch.Tensor] = None,
                      amp: bool = False,
                      scaler: torch.cuda.amp.GradScaler = None, **kwargs):
    """Adversarial-training hook run after the clean loss is computed.

    For each PGD iteration: applies the gradients accumulated so far
    (optimizer/scaler step), crafts an adversarial example around ``_input``,
    recomputes the loss on it, optionally chains the previous hook, and
    backpropagates so the next iteration's step uses adversarial gradients.

    NOTE(review): relies on closure variables from the enclosing scope —
    ``self``, ``adv_train_epsilon`` and ``after_loss_fn_old`` — confirm when
    moving this function.
    """
    noise = torch.zeros_like(_input)
    adv_loss_fn = functools.partial(self.adv_loss, _label=_label)
    for m in range(self.pgd.iteration):
        # Apply the gradients from the previous backward before crafting the
        # next adversarial example.
        if amp:
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()
        # Craft in eval mode so dropout/batchnorm don't perturb the attack.
        self.eval()
        adv_x, _ = self.pgd.optimize(_input=_input, noise=noise,
                                     loss_fn=adv_loss_fn, iteration=1,
                                     epsilon=adv_train_epsilon)
        self.train()
        loss = loss_fn(adv_x, _label)
        # Chain the previously registered hook, if any.
        if callable(after_loss_fn_old):
            after_loss_fn_old(_input=_input, _label=_label, _output=_output,
                              loss=loss, optimizer=optimizer, loss_fn=loss_fn,
                              amp=amp, scaler=scaler, **kwargs)
        if amp:
            scaler.scale(loss).backward()
        else:
            loss.backward()
def backward(
    self,
    model: 'LightningModule',
    closure_loss: torch.Tensor,
    optimizer: 'Optimizer',
    opt_idx: int,
    should_accumulate: bool,
    *args: Any,
    **kwargs: Any,
) -> torch.Tensor:
    """Performs the actual backpropagation.

    Args:
        model: the model to be optimized
        closure_loss: the loss value obtained from the closure
        optimizer: the optimizer to perform the step lateron
        opt_idx: the optimizer's index
        should_accumulate: whether to accumulate gradients or not
    """
    if model.automatic_optimization:
        # Delegate to the LightningModule hook so users can override backward.
        model.backward(closure_loss, optimizer, opt_idx)
    else:
        closure_loss.backward(*args, **kwargs)
    # Once backward has been applied, release the autograd graph.
    return closure_loss.detach()
def update_gradient(self, loss: torch.Tensor):
    """
    1. back-propagation using auto-grad backward()
    2. update model using optimizer step

    :param loss: loss tensor from criterion output
    """
    # NOTE(review): no zero_grad here — gradients accumulate across calls;
    # presumably the caller resets them. Confirm against the training loop.
    loss.backward()
    self.optimizer.step()
def update_networks_on_loss(loss: torch.Tensor, *networks) -> None:
    """Backpropagate ``loss`` and step each network's optimizer.

    Skips the update when there is no loss to optimize. The previous
    ``if not loss:`` guard raised a RuntimeError for non-scalar tensors
    (tensor truthiness is only defined for one element); this version
    checks ``None`` and scalar-zero explicitly instead.

    :param loss: scalar loss tensor, or ``None`` to skip
    :param networks: modules exposing ``zero_grad()`` and an ``optimizer``
    """
    if loss is None or (loss.numel() == 1 and loss.item() == 0):
        return
    for network in networks:
        network.zero_grad()
    loss.backward()
    for network in networks:
        network.optimizer.step()
def minimize(
    self,
    loss: torch.Tensor,
    optimizer_name: str = "primary",
    *,
    retain_graph: bool = False,
    checkpoint_interval: Optional[float] = None,
    clip_grad_max_norm: Optional[float] = None,
) -> None:
    """Compute gradients and use them to minimize a loss function.

    Args:
        loss: scalar loss tensor to minimize.
        optimizer_name: which attached optimizer to step.
        retain_graph: forwarded to ``loss.backward``.
        checkpoint_interval: seconds between autocheckpoints; falls back to
            the configured default when ``None``; ``0`` disables.
        clip_grad_max_norm: if set, clip the gradient norm before stepping.
    """
    model = cast("Buddy", self)._model
    assert model is not None, "No model attached!"

    # Get optimizer (instantiated lazily)
    self._instantiate_optimizer(optimizer_name)
    optimizer: torch.optim.Optimizer = self._optimizer_dict[optimizer_name]

    # Update learning rate using scheduler if possible
    schedulers = self._optimizer_config.learning_rate_schedulers
    if optimizer_name in schedulers:
        self._set_learning_rate(
            schedulers[optimizer_name](self._optimizer_config.global_steps),
            optimizer_name,
        )

    # Take gradient step
    optimizer.zero_grad()
    loss.backward(retain_graph=retain_graph)  # type: ignore
    if clip_grad_max_norm is not None:
        torch.nn.utils.clip_grad_norm_(
            optimizer.param_groups[0]["params"],
            max_norm=clip_grad_max_norm,
        )
    optimizer.step()

    # Update global step count
    self._optimizer_config.global_steps += 1

    # Autocheckpoint procedure
    if checkpoint_interval is None:
        checkpoint_interval = self._optimizer_checkpoint_interval

    # Disable autocheckpoint if interval is 0
    if checkpoint_interval == 0:
        return

    if self._optimizer_last_checkpoint_time is None:
        # First iteration
        self._optimizer_last_checkpoint_time = time.time()
    elif (
        time.time() - cast(float, self._optimizer_last_checkpoint_time)
        # Bug fix: compare against the *effective* interval (which honors the
        # `checkpoint_interval` argument resolved above), not the configured
        # default, which silently ignored the caller's override.
        > checkpoint_interval
    ):  # pragma: no cover
        # Checkpoint!
        cast("_BuddyCheckpointing", self).save_checkpoint()
        self._optimizer_last_checkpoint_time = time.time()
def _get_gradient_from_torch(self, f: torch.Tensor): """ Get the gradient of f w.r.t. the policy's parameters. :param f: The parametric function. :return: the gradient. """ f.backward() g = self._get_gradient() self.zero_grad() return g
def backward(self, loss: torch.Tensor, optimizer: torch.optim.Optimizer,
             retain_graph: bool = False):
    r"""Use backward to scale the loss for mixed precision.

    :param loss: scalar loss to backpropagate
    :param optimizer: optimizer the amp loss scaling is registered against;
        also stepped here when ``retain_graph`` is False
        (annotation corrected from ``nn.Module`` — an optimizer is passed)
    :param retain_graph: keep the autograd graph alive for further backward
        passes on the same forward
    """
    if self.precision == "mixed":
        # apex amp: scale the loss so fp16 gradients do not underflow.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward(retain_graph=retain_graph)
    else:
        loss.backward(retain_graph=retain_graph)
    # NOTE(review): the optimizer step is coupled to ``retain_graph`` —
    # updating only once the graph is released. Looks intentional (multi-pass
    # backward defers the update), but confirm with callers.
    if not retain_graph:
        optimizer.step()
def _backprop_step(self, loss: Tensor, grad_clip: float = .1) -> None: # Clean gradients self.optimizer.zero_grad() # Backpropate the loss loss.backward() # Safeguard for Gradient explosion if isinstance(grad_clip, float): torch.nn.utils.clip_grad_norm_(self.model.parameters(), grad_clip) # Update the params self.optimizer.step()
def backward(self, loss: torch.Tensor):
    """Backward and update params.

    Args:
        loss (torch.Tensor): loss
    """
    optimizer = self.optim
    optimizer.zero_grad()  # drop stale gradients
    loss.backward()        # compute fresh ones
    optimizer.step()       # apply the update
def backward(self, tensor: torch.Tensor) -> None:
    """Computes the gradient of the specified tensor w.r.t. graph leaves.

    Args:
        tensor (torch.Tensor): Tensor of which the derivative will be computed.
    """
    if not self.amp_is_enabled:
        tensor.backward()
        return
    # AMP: scale first so fp16 gradients do not underflow.
    self.scaler.scale(tensor).backward()
def bw_step(self, loss: torch.Tensor, optimizer: optim.Optimizer):
    """Accumulation-aware backward step.

    Scales the loss by the cycle length, backpropagates, and only steps the
    optimizer at the end of an accumulation cycle.

    Fixes:
    - ``loss.backward(gradient=1 / self.steps)`` passed a float where
      autograd requires a Tensor (raises ``TypeError`` at runtime); the loss
      is scaled instead, which is equivalent for a scalar loss.
    - Gradients are cleared *before* the backward pass at the start of a
      cycle; previously ``zero_grad`` ran after ``backward`` and wiped the
      freshly computed gradients.
    """
    if optimizer is None:
        return
    if self.is_start_cycle:
        optimizer.zero_grad()
    (loss / self.steps).backward()
    if self.is_end_cycle:
        optimizer.step()
    self.inc_counter()
def _backward_pass(
    self,
    targets: Tensor,
    in_queue: GpuAwareQueue[LocalBackwardData],
    out_queue: GpuAwareQueue[LocalBackwardData],
    inputs: Tensor,
    activations: Tensor,
) -> None:
    """Pipeline-parallel backward step for this stage.

    Receives the downstream stage's gradients w.r.t. our activations,
    backpropagates them through this stage's subgraph, and forwards the
    resulting input gradients to the upstream stage.

    NOTE(review): ``targets`` is unused here — presumably kept for interface
    parity with the other backward-pass variants; confirm.
    """
    # Block until the downstream stage delivers its gradients.
    backward_data = cast(E2EBackwardData, in_queue.get(self.device))
    # Backpropagate through this stage; populates inputs.grad.
    activations.backward(gradient=backward_data.e2e_gradients)
    # Hand the input gradients to the upstream stage.
    out_queue.put(E2EBackwardData(inputs.grad))
def optimize(opt: Optimizer, loss: torch.Tensor):
    """
    Optimize the parameters based on the loss and the optimizer.

    Args:
        opt: optimizer
        loss: loss, a scalar
    """
    opt.zero_grad()  # discard gradients from the previous step
    loss.backward()  # compute fresh gradients
    opt.step()       # apply the parameter update
def backward(self, loss: torch.Tensor) -> None:
    """
    Compute gradients with respect to the loss.

    Calls :func:`zero_grad` and then computes the gradient using
    `torch.Tensor.backward <https://pytorch.org/docs/stable/
    tensors.html#torch.Tensor.backward>`_. See :mod:`torch.autograd` for
    more information.
    """
    # TODO (aadcock): Add gradient accumulation logic
    self.zero_grad()
    loss.backward()
def _backward( self, loss: Tensor, opt: torch.optim.Optimizer, params: Optional[Iterable[Tensor]] = None, grad_clip: Optional[float] = None, ) -> None: opt.zero_grad() loss.backward() grad_clip = grad_clip or self.config.grad_clip if params is not None and grad_clip is not None: nn.utils.clip_grad_norm_(params, grad_clip) opt.step()