Example #1
    def backward(
        self,
        loss: torch.Tensor,
        optimizer: torch.optim.Optimizer,
        scheduler,
        step: bool,
    ):
        """Handling back-propagation."""
        if self.cfg.fp16:
            from apex import amp

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if self.cfg.train.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), self.cfg.train.max_grad_norm
                )
        else:
            loss.backward()
            if self.cfg.train.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.cfg.train.max_grad_norm
                )

        if step:
            optimizer.step()
            scheduler.step()
            self.model.zero_grad()
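
Note that apex.amp is deprecated in favor of the torch.cuda.amp API that ships with recent PyTorch. For reference, a minimal sketch of the fp16 branch above using a native GradScaler (backward_fp16 is a hypothetical helper; with GradScaler, clipping must happen after scaler.unscale_, i.e. only on stepping iterations):

import torch

scaler = torch.cuda.amp.GradScaler()  # created once, next to the optimizer

def backward_fp16(loss, model, optimizer, scheduler, step, max_grad_norm):
    scaler.scale(loss).backward()
    if step:
        if max_grad_norm > 0:
            scaler.unscale_(optimizer)  # gradients must be unscaled before clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        scaler.step(optimizer)  # skips the update if any gradient overflowed
        scaler.update()
        scheduler.step()
        model.zero_grad()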
Example #2
def optimizer_step_spottune(optimizer_main: Optimizer,
                            optimizer_policy: Optimizer, loss: torch.Tensor,
                            **params) -> torch.Tensor:
    """
    Performs the backward pass with respect to ``loss``, as well as a gradient step.

    ``params`` is used to change the optimizer's parameters.

    Examples
    --------
    >>> optimizer = Adam(model.parameters(), lr=1)
    >>> optimizer_step(optimizer, loss) # perform a gradient step
    >>> optimizer_step(optimizer, loss, lr=1e-3) # set lr to 1e-3 and perform a gradient step
    >>> optimizer_step(optimizer, loss, betas=(0, 0)) # set betas to 0 and perform a gradient step

    Notes
    -----
    The incoming ``optimizer``'s parameters are not restored to their original values.
    """
    lr_main, lr_policy = params['lr_main'], params['lr_policy']

    set_params(optimizer_main, lr=lr_main)
    set_params(optimizer_policy, lr=lr_policy)

    optimizer_main.zero_grad()
    optimizer_policy.zero_grad()

    loss.backward()
    optimizer_main.step()
    optimizer_policy.step()

    return loss
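
The snippet relies on a set_params helper that is not shown here. A minimal sketch of such a helper, assuming it simply overwrites matching hyperparameters in every param group:

from torch.optim import Optimizer

def set_params(optimizer: Optimizer, **params) -> None:
    # Overwrite hyperparameters such as lr or betas in every param group.
    for param_group in optimizer.param_groups:
        param_group.update(params)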
Example #3
    def backward(self, loss: torch.Tensor, multi_optim: MultiOptim) -> None:
        if self.is_mixed():
            optim = self._get_optim(multi_optim)
            with self._amp.scale_loss(loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
Example #4
    def _run_backward(self, tensor: Tensor, model: Optional[Module],
                      *args: Any, **kwargs: Any) -> None:
        """Lightning-independent backward logic.

        Currently only used by Lightning Lite. Subject to further refactors.
        """
        tensor.backward(*args, **kwargs)
Example #5
def policy_svg(policy: TVLinearPolicy, value: Tensor) -> lqr.Linear:
    """Computes the policy SVG from the estimated return."""
    # pylint:disable=invalid-name
    policy.zero_grad(set_to_none=True)
    value.backward()
    K, k = policy.standard_form()
    return K.grad.clone(), k.grad.clone()
Example #6
    def _backpropagate(self, loss: torch.Tensor):
        self.optimizer.zero_grad()
        loss.backward()
        if self.gradient_clipping_value is not None:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           self.gradient_clipping_value)
        self.optimizer.step()
Example #7
    def backward_step(self, model: nn.Module, loss: torch.Tensor,
                      optimizer: optim.Optimizer, scaler: amp.GradScaler):
        if optimizer is None:
            return

        loss = loss / self.steps
        if scaler is not None:
            loss = scaler.scale(loss)

        if self.is_start_cycle:
            optimizer.zero_grad()

        if isinstance(
                model,
                nn.parallel.DistributedDataParallel) and not self.is_end_cycle:
            with model.no_sync():
                loss.backward()
        else:
            loss.backward()

        if self.is_end_cycle:
            if scaler is None:
                optimizer.step()
            else:
                scaler.step(optimizer)
                scaler.update()

        self.inc_counter()
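
For context, the same accumulation cycle can be written without the helper class. A standalone sketch with toy data (all names are hypothetical; accum_steps plays the role of self.steps, and the AMP/DDP details are omitted):

import torch
from torch import nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.MSELoss()
loader = [(torch.randn(8, 10), torch.randn(8, 1)) for _ in range(8)]

accum_steps = 4  # micro-batches per optimizer step
optimizer.zero_grad()
for i, (x, y) in enumerate(loader):
    loss = criterion(model(x), y) / accum_steps  # scale so gradients average
    loss.backward()                              # gradients accumulate in .grad
    if (i + 1) % accum_steps == 0:               # end of a cycle
        optimizer.step()
        optimizer.zero_grad()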
Example #8
def _sp_double_backward_update(pos_out: Tensor,
                               neg_out: Tensor,
                               param: Parameter,
                               gamma: float,
                               l1_reg: float,
                               l2_reg: float,
                               pos: Optional[Tensor] = None):
    param.grad = None
    # first backward: gradient of the "negative" part of the objective
    neg_out.backward()
    neg = param.grad.relu_().add_(eps)  # eps: small module-level constant, not shown here

    if pos is None:
        param.grad = None
        pos_out.backward()
        pos = param.grad.relu_().add_(eps)

    if l1_reg > 0:
        pos.add_(l1_reg)
    if l2_reg > 0:
        pos = pos.add(param.data, alpha=l2_reg)
    multiplier = neg.div_(pos)
    if gamma != 1:
        multiplier.pow_(gamma)
    param.data.mul_(multiplier)
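
This is a multiplicative (Lee-Seung style) update: the objective is split into pos_out, whose gradient is non-negative, and neg_out, which enters with a minus sign, and the parameter is scaled by the ratio of the two gradients so it stays non-negative. A minimal sketch of how the update might be driven for one NMF factor (hypothetical shapes; eps is assumed to be a small module-level constant such as 1e-8):

import torch
from torch.nn import Parameter

eps = 1e-8  # assumed module-level constant used by the update above

X = torch.rand(8, 3)           # non-negative data
H = torch.rand(8, 4)           # fixed non-negative factor
W = Parameter(torch.rand(4, 3))

# 0.5 * ||X - H @ W||^2 splits into a "positive" term (H@W).(H@W)/2 and a
# "negative" term (H@W).X; the classic rule W <- W * (H^T X) / (H^T H W)
# then follows from the two gradients computed by the function above.
pos_out = ((H @ W) * (H @ W)).sum() / 2
neg_out = ((H @ W) * X).sum()
_sp_double_backward_update(pos_out, neg_out, W, gamma=1.0, l1_reg=0.0, l2_reg=0.0)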
Example #9
def backprop(loss: torch.Tensor, model: torch.nn.Module, optimizer):
    optimizer.zero_grad()
    loss.backward()
    # for i, param in enumerate(model.parameters()):
    #     print(param.shape)
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #10
def step(iteration: int, loss: Tensor, optimizer: Adam,
         scheduler: ExponentialLR) -> None:
    """Do one training step."""
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step(iteration)
Example #11
    def optimize_agent_network(loss: Tensor) -> None:
        # since PPO optimizes multiple times per batch, logging is done in
        # a separate function, log_ppo_stats
        agent_optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(agent_network.parameters(), max_norm=0.5)
        agent_optimizer.step()
Example #12
    def update_action_values(self, guess: torch.Tensor, error, state_hash):
        # error = torch.from_numpy(error).float()
        # initial_gradient = torch.randn([1, 1, 3, 3])
        # guess.retain_grad()
        # guess.backward(initial_gradient) ##  We now have the grad in each layer
        # torch.nn.utils.clip_grad_value_(self.conv1.weight, 0.1)
        # torch.nn.utils.clip_grad_value_(self.conv2.weight, 0.1)
        # torch.nn.utils.clip_grad_value_(self.conv3.weight, 0.1)
        # torch.nn.utils.clip_grad_value_(self.conv_final.weight, 0.1)

        initial_gradient = torch.ones([1, 1, 3, 3]).to('cuda')
        guess.retain_grad()
        guess.backward(initial_gradient, retain_graph=True)

        # TD(lambda)-style eligibility traces: every parameter keeps one trace
        # per visited state. All traces decay by lambda * gamma, the current
        # gradient is added to the active state's trace, and each trace then
        # contributes to the update, scaled by the TD error and learning rate.
        for name, parameter in self.named_parameters():
            if not parameter.requires_grad:
                continue
            if state_hash not in self.e[name]:
                self.e[name][state_hash] = 0
            for state in self.e[name].keys():
                self.e[name][state] = self.e[name][state] * self.lambd * self.gamma
            self.e[name][state_hash] += parameter.grad
            for state in self.e[name].keys():
                parameter.data -= self.alpha * error * self.e[name][state]
            parameter.grad.zero_()
Example #13
def compute_grad_receptive_field(mod: ModuleInfo, input: Tensor,
                                 output: Tensor,
                                 infos: ModuleInfoIndex) -> Tuple[int, ...]:

    if not isinstance(output, Tensor):
        # We cannot backward() from this. Skip, we just won't be able
        # to compute the receptive field here without some other
        # affordance to combine the tensors in this output into a
        # "loss"
        return (-1, )

    # https://github.com/rogertrullo/Receptive-Field-in-Pytorch/blob/master/Receptive_Field.ipynb
    fake_grad = torch.zeros(output.shape)
    # batch=0, channel=0
    center_pos = (0, 0, *[i // 2 for i in output.shape[2:]])
    fake_grad[center_pos] = 1

    # zero_grad everything before and including this module, since backward() accumulates
    for i in range(0, mod.input_order + 1):
        infos.by_input_order[i].module.zero_grad()

    # retain_graph so we can run this multiple times and not drop the
    # intermediate results from forward()
    output.backward(gradient=fake_grad, retain_graph=True)

    # Find the extent of pixels affected; drop batch/channel
    nonzero_idxs = input.grad.nonzero(as_tuple=True)[2:]
    rf_dims = [(d.max() - d.min() + 1).item() for d in nonzero_idxs]
    return tuple(rf_dims)
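
A self-contained sketch of the same trick on a toy conv stack (hypothetical layers, independent of the ModuleInfo machinery above): seed a one-hot gradient at the centre output pixel and read off which input pixels receive non-zero gradient.

import torch
from torch import nn

# Two linear conv layers keep every path in the receptive field alive.
net = nn.Sequential(nn.Conv2d(1, 8, 3, padding=1),
                    nn.Conv2d(8, 1, 5, padding=2))
x = torch.zeros(1, 1, 32, 32, requires_grad=True)
out = net(x)

fake_grad = torch.zeros_like(out)
fake_grad[0, 0, out.shape[2] // 2, out.shape[3] // 2] = 1  # one output pixel
out.backward(gradient=fake_grad)

nonzero = x.grad.nonzero(as_tuple=True)[2:]  # drop batch/channel dims
print(tuple((d.max() - d.min() + 1).item() for d in nonzero))  # (7, 7)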
Example #14
    def backward(self, trainer, loss: Tensor, optimizer: Optimizer,
                 optimizer_idx: int) -> None:
        """Override backward with your own implementation if you need to

        :param trainer: Pointer to the trainer
        :param loss: Loss is already scaled by accumulated grads
        :param optimizer: Current optimizer being used
        :param optimizer_idx: Index of the current optimizer being used

        Called to perform backward step.
        Feel free to override as needed.

        The loss passed in has already been scaled for accumulated gradients if requested.

        .. code-block:: python

            def backward(self, use_amp, loss, optimizer):
                if use_amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

        """
        if trainer.precision == 16:

            # .backward is not special on 16-bit with TPUs
            if not trainer.on_tpu:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
        else:
            loss.backward()
Example #15
    def _backprop_step(self, loss: Tensor) -> None:
        # Clean gradients
        self.optimizer.zero_grad()
        # Backpropagate the loss
        loss.backward()
        # Update the params
        self.optimizer.step()
Example #16
            def after_loss_fn_new(_input: torch.Tensor, _label: torch.Tensor, _output: torch.Tensor,
                                  loss: torch.Tensor, optimizer: Optimizer, loss_fn: Callable[..., torch.Tensor] = None,
                                  amp: bool = False, scaler: torch.cuda.amp.GradScaler = None, **kwargs):
                noise = torch.zeros_like(_input)
                adv_loss_fn = functools.partial(self.adv_loss, _label=_label)

                for m in range(self.pgd.iteration):
                    if amp:
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    self.eval()
                    adv_x, _ = self.pgd.optimize(_input=_input, noise=noise,
                                                 loss_fn=adv_loss_fn,
                                                 iteration=1, epsilon=adv_train_epsilon)
                    self.train()
                    loss = loss_fn(adv_x, _label)
                    if callable(after_loss_fn_old):
                        after_loss_fn_old(_input=_input, _label=_label, _output=_output,
                                          loss=loss, optimizer=optimizer, loss_fn=loss_fn,
                                          amp=amp, scaler=scaler, **kwargs)
                    if amp:
                        scaler.scale(loss).backward()
                    else:
                        loss.backward()
Example #17
    def backward(
        self,
        model: 'LightningModule',
        closure_loss: torch.Tensor,
        optimizer: 'Optimizer',
        opt_idx: int,
        should_accumulate: bool,
        *args: Any,
        **kwargs: Any,
    ) -> torch.Tensor:
        """performs the actual backpropagation

        Args:
            model: the model to be optimized
            closure_loss: the loss value obtained from the closure
            optimizer: the optimizer to perform the step lateron
            opt_idx: the optimizer's index
            should_accumulate: whether to accumulate gradients or not

        """
        automatic_optimization = model.automatic_optimization

        # do backward pass
        if automatic_optimization:
            model.backward(closure_loss, optimizer, opt_idx)
        else:
            closure_loss.backward(*args, **kwargs)

        # once backward has been applied, release graph
        closure_loss = closure_loss.detach()

        return closure_loss
Example #18
    def update_gradient(self, loss: torch.Tensor):
        """
        1. back-propagation using auto-grad backward()
        2. update model using optimizer step
        :param loss: loss tensor from criterion output
        """
        loss.backward()
        self.optimizer.step()
Example #19
def update_networks_on_loss(loss: torch.Tensor, *networks) -> None:
    if not loss:  # skip zero losses (relies on scalar-tensor truthiness)
        return
    for network in networks:
        network.zero_grad()
    loss.backward()
    for network in networks:
        network.optimizer.step()
Example #20
    def minimize(
        self,
        loss: torch.Tensor,
        optimizer_name: str = "primary",
        *,
        retain_graph: bool = False,
        checkpoint_interval: Optional[float] = None,
        clip_grad_max_norm: Optional[float] = None,
    ) -> None:
        """Compute gradients and use them to minimize a loss function."""
        model = cast("Buddy", self)._model
        assert model is not None, "No model attached!"

        # Get optimizer
        self._instantiate_optimizer(optimizer_name)
        optimizer: torch.optim.Optimizer = self._optimizer_dict[optimizer_name]

        # Update learning rate using scheduler if possible
        schedulers = self._optimizer_config.learning_rate_schedulers
        if optimizer_name in schedulers:
            self._set_learning_rate(
                schedulers[optimizer_name](self._optimizer_config.global_steps),
                optimizer_name,
            )

        # Take gradient step
        optimizer.zero_grad()
        loss.backward(retain_graph=retain_graph)  # type: ignore
        if clip_grad_max_norm is not None:
            torch.nn.utils.clip_grad_norm_(
                optimizer.param_groups[0]["params"],
                max_norm=clip_grad_max_norm,
            )
        optimizer.step()

        # Update global step count
        self._optimizer_config.global_steps += 1

        # Autocheckpoint procedure
        if checkpoint_interval is None:
            checkpoint_interval = self._optimizer_checkpoint_interval

        # Disable autocheckpoint if interval is 0
        if checkpoint_interval == 0:
            return

        if self._optimizer_last_checkpoint_time is None:
            # First iteration
            self._optimizer_last_checkpoint_time = time.time()
        elif (
            time.time() - cast(float, self._optimizer_last_checkpoint_time)
            > checkpoint_interval
        ):  # pragma: no cover
            # Checkpoint!
            cast("_BuddyCheckpointing", self).save_checkpoint()
            self._optimizer_last_checkpoint_time = time.time()
Example #21
    def _get_gradient_from_torch(self, f: torch.Tensor):
        """
        Get the gradient of f w.r.t. the policy's parameters.
        :param f: The parametric function.
        :return: the gradient.
        """
        f.backward()
        g = self._get_gradient()
        self.zero_grad()
        return g
Example #22
    def backward(self, loss: torch.Tensor, optimizer: torch.optim.Optimizer,
                 retain_graph: bool = False):
        r"""Use backward to scale the loss for mixed precision."""
        if self.precision == "mixed":
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward(retain_graph=retain_graph)
        else:
            loss.backward(retain_graph=retain_graph)
        if not retain_graph:
            optimizer.step()
Example #23
    def _backprop_step(self, loss: Tensor, grad_clip: float = .1) -> None:
        # Clean gradients
        self.optimizer.zero_grad()
        # Backpropagate the loss
        loss.backward()
        # Safeguard against gradient explosion
        if isinstance(grad_clip, float):
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), grad_clip)
        # Update the params
        self.optimizer.step()
Example #24
    def backward(self, loss: torch.Tensor):
        """Backward and update params.

        Args:
            loss (torch.Tensor): loss
        """

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
Example #25
    def backward(self, tensor: torch.Tensor) -> None:
        """Computes the gradient of the specified tensor w.r.t. graph leaves.

        Args:
            tensor (torch.Tensor): Tensor of which the derivative will be computed.
        """
        if self.amp_is_enabled:
            self.scaler.scale(tensor).backward()
        else:
            tensor.backward()
Example #26
    def bw_step(self, loss: torch.Tensor, optimizer: optim.Optimizer):
        if optimizer is None:
            return

        if self.is_start_cycle:
            optimizer.zero_grad()

        # Scale so gradients average over the accumulation cycle;
        # Tensor.backward expects a Tensor gradient, not a float.
        (loss / self.steps).backward()

        if self.is_end_cycle:
            optimizer.step()

        self.inc_counter()
Example #27
    def _backward_pass(
        self,
        targets: Tensor,
        in_queue: GpuAwareQueue[LocalBackwardData],
        out_queue: GpuAwareQueue[LocalBackwardData],
        inputs: Tensor,
        activations: Tensor,
    ) -> None:
        backward_data = cast(E2EBackwardData, in_queue.get(self.device))
        activations.backward(gradient=backward_data.e2e_gradients)
        out_queue.put(E2EBackwardData(inputs.grad))
Example #28
    def optimize(opt: Optimizer, loss: torch.Tensor):
        """
        Optimize the parameters based on the loss and the optimizer.

        Args:
            opt: optimizer
            loss: loss, a scalar
        """
        opt.zero_grad()
        loss.backward()
        opt.step()
Example #29
    def backward(self, loss: torch.Tensor) -> None:
        """
        Compute gradients with respect to the loss.

        Calls :func:`zero_grad` and then computes the gradient using
        `torch.Tensor.backward <https://pytorch.org/docs/stable/
        tensors.html#torch.Tensor.backward>`_. See :mod:`torch.autograd` for
        more information.
        """
        # TODO (aadcock): Add gradient accumulation logic
        self.zero_grad()
        loss.backward()
Example #30
    def _backward(
        self,
        loss: Tensor,
        opt: torch.optim.Optimizer,
        params: Optional[Iterable[Tensor]] = None,
        grad_clip: Optional[float] = None,
    ) -> None:
        opt.zero_grad()
        loss.backward()
        grad_clip = grad_clip or self.config.grad_clip
        if params is not None and grad_clip is not None:
            nn.utils.clip_grad_norm_(params, grad_clip)
        opt.step()