Example #1
0
 def backward(ctx, *grad_output):
     """
     Compute the gradients for the scale and for every scaled tensor.

     ``ctx.saved_tensors`` is unpacked as the parameter tensors followed by
     the scale. ``grad_output`` holds one incoming gradient per parameter
     tensor. The result is the scale's gradient followed by one gradient per
     parameter, in the order of ``grad_output``.
     """
     *saved_params, saved_scale = ctx.saved_tensors

     # The scale multiplies every parameter, so its gradient is the total of
     # param * grad over all elements of all tensors.
     scale_grad = 0
     for weighted in torch._foreach_mul(saved_params, grad_output):
         scale_grad = scale_grad + weighted.sum()

     # Each parameter's gradient is the incoming gradient times the scale.
     param_grads = torch._foreach_mul(grad_output, float(saved_scale))
     return (scale_grad, ) + param_grads
Example #2
0
def _multi_tensor_rprop(params: List[Tensor], grads: List[Tensor],
                        prevs: List[Tensor], step_sizes: List[Tensor], *,
                        step_size_min: float, step_size_max: float,
                        etaminus: float, etaplus: float):
    r"""Apply one Rprop update to a list of tensors using foreach kernels.

    ``params``, ``step_sizes`` and ``prevs`` are updated in place. The
    ``grads`` list and the tensors it holds are left untouched: the masked
    gradients are built in a local list.

    Args:
        params: tensors to update.
        grads: gradient for each entry of ``params``.
        prevs: gradient stored by the previous call, one per parameter;
            overwritten with this call's masked gradient.
        step_sizes: per-element step sizes, one tensor per parameter.
        step_size_min: lower clamp applied to every step size.
        step_size_max: upper clamp applied to every step size.
        etaminus: factor applied to a step size where the gradient sign
            differs from the previous one.
        etaplus: factor applied to a step size where the gradient sign
            matches the previous one.
    """
    if len(params) == 0:
        return

    # sign(grad * prev) says whether the gradient kept its direction; the
    # sign is then replaced in place by the step-size multiplier.
    signs = [product.sign() for product in torch._foreach_mul(grads, prevs)]
    for sign in signs:
        sign[sign.gt(0)] = etaplus
        sign[sign.lt(0)] = etaminus
        sign[sign.eq(0)] = 1

    # update stepsizes with step size updates
    torch._foreach_mul_(step_sizes, signs)
    for step_size in step_sizes:
        step_size.clamp_(step_size_min, step_size_max)

    # for dir<0, dfdx=0
    # for dir>=0 dfdx=dfdx
    # Built in a fresh list so the caller's ``grads`` list is not rewritten.
    masked_grads = []
    for grad, sign in zip(grads, signs):
        masked = grad.clone(memory_format=torch.preserve_format)
        masked[sign.eq(etaminus)] = 0
        masked_grads.append(masked)

    # update parameters
    grad_signs = [masked.sign() for masked in masked_grads]
    torch._foreach_addcmul_(params, grad_signs, step_sizes, value=-1)

    for prev, masked in zip(prevs, masked_grads):
        prev.copy_(masked)
Example #3
0
def adagrad(params: List[Tensor], grads: List[Tensor],
            state_sums: List[Tensor], state_steps: List[int],
            has_sparse_grad: bool, *, lr: float, weight_decay: float,
            lr_decay: float, eps: float):
    r"""Functional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.

    ``params`` and ``state_sums`` are updated in place. The tensors in
    ``grads`` are never modified: weight decay is folded into a new set of
    gradient tensors.

    Args:
        params: tensors to update.
        grads: gradient for each entry of ``params``.
        state_sums: running sum of squared gradients, one per parameter.
        state_steps: step count for each parameter, used for the learning
            rate decay.
        has_sparse_grad: take the per-tensor sparse code path when true.
        lr: base learning rate.
        weight_decay: coefficient of ``param`` added to each gradient.
        lr_decay: learning rate decay applied per step.
        eps: term added to the square root of the state sum.

    Raises:
        RuntimeError: if ``weight_decay`` is non-zero together with
            ``has_sparse_grad``.
    """

    if weight_decay != 0:
        if has_sparse_grad:
            raise RuntimeError(
                "weight_decay option is not compatible with sparse gradients")
        # Out-of-place on purpose: the in-place variant would add the decay
        # term to the caller's gradient tensors, and it would then pile up
        # on every later call that reuses those gradients.
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in state_steps]

    if has_sparse_grad:
        # sparse is not supported by multi_tensor. Fall back to optim.adagrad
        # implementation for sparse gradients
        for param, grad, state_sum, clr in zip(params, grads, state_sums,
                                               minus_clr):
            # the update is non-linear so indices must be unique
            grad = grad.coalesce()
            grad_indices = grad._indices()
            grad_values = grad._values()

            state_sum.add_(_make_sparse(grad, grad_indices,
                                        grad_values.pow(2)))
            std_sparse = state_sum.sparse_mask(grad)
            std_sparse_values = std_sparse._values().sqrt_().add_(eps)
            param.add_(
                _make_sparse(grad, grad_indices,
                             grad_values / std_sparse_values),
                alpha=clr,
            )
        return

    # Complex tensors are processed through their real views; in-place
    # writes to a view land in the original complex tensor.
    grads = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in grads
    ]
    state_sums = [
        torch.view_as_real(x) if torch.is_complex(x) else x
        for x in state_sums
    ]
    torch._foreach_addcmul_(state_sums, grads, grads, value=1)
    std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
    updates = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
    # Convert the updates back so they line up with complex parameters.
    updates = [
        torch.view_as_complex(update) if torch.is_complex(param) else update
        for param, update in zip(params, updates)
    ]
    torch._foreach_add_(params, updates)
Example #4
0
 def forward(ctx, scale, *param_list):
     """
     Multiply every tensor in ``param_list`` by ``scale``.

     The parameter tensors, followed by the detached ``scale``, are stored
     on ``ctx`` through ``save_for_backward`` so the backward pass can read
     them. ``scale`` is converted with ``float()`` before the multiply.
     """
     detached_scale = scale.detach()
     ctx.save_for_backward(*param_list, detached_scale)

     factor = float(scale)
     scaled = torch._foreach_mul(param_list, factor)
     return scaled
Example #5
0
def _multi_tensor_rprop(params: List[Tensor],
                        grads: List[Tensor],
                        prevs: List[Tensor],
                        step_sizes: List[Tensor],
                        *,
                        step_size_min: float,
                        step_size_max: float,
                        etaminus: float,
                        etaplus: float,
                        maximize: bool):
    """Apply one Rprop update to a list of tensors using foreach kernels.

    ``params``, ``step_sizes`` and ``prevs`` are modified in place. Complex
    tensors are processed through their real views. When ``maximize`` is
    true the gradients are negated first. Returns ``None``.
    """
    if not params:
        return

    def _as_real(tensor):
        # A real view shares storage, so in-place writes reach the original.
        return torch.view_as_real(tensor) if torch.is_complex(tensor) else tensor

    grads = [_as_real(t) for t in grads]
    prevs = [_as_real(t) for t in prevs]
    params = [_as_real(t) for t in params]
    step_sizes = [_as_real(t) for t in step_sizes]

    if maximize:
        grads = torch._foreach_neg(grads)

    # Turn sign(grad * prev) into the per-element step-size multiplier:
    # same direction -> etaplus, flipped -> etaminus, zero product -> 1.
    factors = []
    for product in torch._foreach_mul(grads, prevs):
        factor = product.sign()
        factor[factor.gt(0)] = etaplus
        factor[factor.lt(0)] = etaminus
        factor[factor.eq(0)] = 1
        factors.append(factor)

    # Scale the step sizes, then keep them inside the allowed range.
    torch._foreach_mul_(step_sizes, factors)
    for size in step_sizes:
        size.clamp_(step_size_min, step_size_max)

    # Where the direction flipped, the gradient is treated as zero; work on
    # clones so the incoming gradient tensors stay as they are.
    kept_grads = []
    for grad, factor in zip(grads, factors):
        kept = grad.clone(memory_format=torch.preserve_format)
        kept[factor.eq(etaminus)] = 0
        kept_grads.append(kept)

    # Move each parameter against the sign of its kept gradient.
    directions = [kept.sign() for kept in kept_grads]
    torch._foreach_addcmul_(params, directions, step_sizes, value=-1)

    # Remember the kept gradients for the next call.
    for prev, kept in zip(prevs, kept_grads):
        prev.copy_(kept)
Example #6
0
def _multi_tensor_adagrad(params: List[Tensor], grads: List[Tensor],
                          state_sums: List[Tensor], state_steps: List[Tensor],
                          *, lr: float, weight_decay: float, lr_decay: float,
                          eps: float, has_sparse_grad: bool):
    """Apply one Adagrad update to a list of tensors using foreach kernels.

    ``params``, ``state_sums`` and ``state_steps`` are updated in place.
    The tensors in ``grads`` are never modified: weight decay is folded into
    a new set of gradient tensors. Sparse gradients are delegated to
    ``_single_tensor_adagrad``.

    Args:
        params: tensors to update.
        grads: gradient for each entry of ``params``.
        state_sums: running sum of squared gradients, one per parameter.
        state_steps: step counter tensors, incremented by one here on the
            dense path.
        lr: base learning rate.
        weight_decay: coefficient of ``param`` added to each gradient.
        lr_decay: learning rate decay applied per step.
        eps: term added to the square root of the state sum.
        has_sparse_grad: whether any gradient is sparse; ``None`` means
            "detect it from ``grads``".
    """

    # Foreach functions will throw errors if given empty lists
    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any(grad.is_sparse for grad in grads)

    if has_sparse_grad:
        return _single_tensor_adagrad(params,
                                      grads,
                                      state_sums,
                                      state_steps,
                                      lr=lr,
                                      weight_decay=weight_decay,
                                      lr_decay=lr_decay,
                                      eps=eps,
                                      has_sparse_grad=has_sparse_grad)

    # Update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        # Out-of-place on purpose: the in-place variant would add the decay
        # term to the caller's gradient tensors, and it would then pile up
        # on every later call that reuses those gradients.
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in state_steps]

    # Complex tensors are processed through their real views; in-place
    # writes to a view land in the original complex tensor.
    grads = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in grads
    ]
    state_sums = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in state_sums
    ]
    torch._foreach_addcmul_(state_sums, grads, grads, value=1)
    std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
    updates = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
    # Convert the updates back so they line up with complex parameters.
    updates = [
        torch.view_as_complex(update) if torch.is_complex(param) else update
        for param, update in zip(params, updates)
    ]
    torch._foreach_add_(params, updates)
Example #7
0
 def multi_tensor_scale(
     self,
     src: Sequence[torch.Tensor],
     dst: Sequence[torch.Tensor],
     scale: float,
 ) -> None:
     """Write ``src[i] * scale`` into ``dst[i]`` for every pair of tensors.

     Runs under ``torch.no_grad()``. The foreach path is taken when
     ``self._enable_foreach`` is set and the first source tensor is
     floating point; otherwise each pair is handled with ``copy_``.
     An empty ``src`` is a no-op on both paths.

     Args:
         src: tensors to read from.
         dst: tensors overwritten with the scaled values.
         scale: multiplier applied to every source tensor.
     """
     # Nothing to do. Returning here keeps ``src[0]`` below from raising
     # IndexError, matching the no-op the per-tensor loop already gives.
     if len(src) == 0:
         return

     with torch.no_grad():  # type: ignore[no-untyped-call]
         # _foreach_zero for long type is not supported in CUDA
         if self._enable_foreach and src[0].is_floating_point():
             # scale
             val = torch._foreach_mul(tuple(src), scale)
             # copy tensor: zero the destination, then add the scaled values
             torch._foreach_zero_(tuple(dst))
             torch._foreach_add_(tuple(dst), val)
         else:
             for s, d in zip(src, dst):
                 d.copy_(s * scale)
Example #8
0
    def step(self, closure=None):
        """Performs a single optimization step.

        Every parameter group is updated on its own, using that group's
        ``etas``, ``step_sizes`` and ``lr``. Parameters without a gradient
        are skipped, and a group in which no parameter has a gradient is
        left untouched. The per-parameter ``step`` counter in ``self.state``
        is incremented on every call.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            etaminus, etaplus = group['etas']
            step_size_min, step_size_max = group['step_sizes']

            # Collected per group: sharing these lists across groups would
            # update the earlier groups' parameters again, with this group's
            # hyper-parameters.
            grads = []
            states = []
            params_with_grad = []
            step_sizes = []

            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.is_sparse:
                    raise RuntimeError(
                        'Rprop does not support sparse gradients')

                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['prev'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    state['step_size'] = p.grad.new().resize_as_(
                        p.grad).fill_(group['lr'])

                # Counted on every call, not only when the state is created.
                state['step'] += 1

                states.append(state)
                step_sizes.append(state['step_size'])

            # Foreach functions are not called with empty lists.
            if not grads:
                continue

            # sign(grad * prev) becomes the step-size multiplier: same
            # direction -> etaplus, flipped -> etaminus, zero product -> 1.
            signs = torch._foreach_mul(grads, [s['prev'] for s in states])
            signs = [s.sign() for s in signs]
            for sign in signs:
                sign[sign.gt(0)] = etaplus
                sign[sign.lt(0)] = etaminus
                sign[sign.eq(0)] = 1

            # update stepsizes with step size updates
            torch._foreach_mul_(step_sizes, signs)
            for step_size in step_sizes:
                step_size.clamp_(step_size_min, step_size_max)

            # for dir<0, dfdx=0
            # for dir>=0 dfdx=dfdx
            # Clones keep the parameters' own .grad tensors unchanged.
            masked_grads = []
            for grad, sign in zip(grads, signs):
                masked = grad.clone(memory_format=torch.preserve_format)
                masked[sign.eq(etaminus)] = 0
                masked_grads.append(masked)

            # update parameters
            grad_signs = [masked.sign() for masked in masked_grads]
            torch._foreach_addcmul_(params_with_grad,
                                    grad_signs,
                                    step_sizes,
                                    value=-1)

            for state, masked in zip(states, masked_grads):
                state['prev'].copy_(masked)

        return loss
Example #9
0
def _multi_tensor_adamw(params: List[Tensor], grads: List[Tensor],
                        exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor],
                        max_exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor], *, amsgrad: bool,
                        beta1: float, beta2: float, lr: float,
                        weight_decay: float, eps: float, maximize: bool,
                        capturable: bool):
    """Apply one AdamW update to a list of tensors using foreach kernels.

    ``params``, ``exp_avgs``, ``exp_avg_sqs``, ``state_steps`` and, with
    ``amsgrad``, ``max_exp_avg_sqs`` are updated in place. Complex tensors
    are processed through their real views.

    Args:
        params: tensors to update.
        grads: gradient for each entry of ``params``.
        exp_avgs: running average of the gradient, one per parameter.
        exp_avg_sqs: running average of the squared gradient.
        max_exp_avg_sqs: running maximum of ``exp_avg_sqs``; only used when
            ``amsgrad`` is true.
        state_steps: step counter tensors, incremented by one here.
        amsgrad: normalize with the running maximum of the second moment.
        beta1: decay rate of ``exp_avgs``.
        beta2: decay rate of ``exp_avg_sqs``.
        lr: learning rate.
        weight_decay: decoupled weight decay coefficient.
        eps: term added to the denominator.
        maximize: negate the gradients before the update.
        capturable: keep the step-dependent math on tensors instead of
            reading the step counters with ``.item()``.

    Raises:
        AssertionError: if ``capturable`` is true and a parameter or step
            tensor is not a CUDA tensor.
    """
    if len(params) == 0:
        return

    if capturable:
        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
            "If capturable=True, params and state_steps must be CUDA tensors."

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    def _real_views(tensors):
        # A real view shares storage, so in-place writes reach the original.
        return [
            torch.view_as_real(x) if torch.is_complex(x) else x
            for x in tensors
        ]

    grads = _real_views(grads)
    exp_avgs = _real_views(exp_avgs)
    exp_avg_sqs = _real_views(exp_avg_sqs)
    params = _real_views(params)
    if amsgrad:
        # Must have the same layout as exp_avg_sqs for the element-wise
        # maximum below; left complex it cannot be combined with the real
        # view of exp_avg_sqs.
        max_exp_avg_sqs = _real_views(max_exp_avg_sqs)

    # update steps
    torch._foreach_add_(state_steps, 1)

    # Perform stepweight decay
    torch._foreach_mul_(params, 1 - lr * weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if amsgrad:
        # Maintains the maximum of all 2nd moment running avg. till now
        torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)
        # Use the max. for normalizing running avg. of gradient
        second_moment_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
    else:
        second_moment_sqrt = torch._foreach_sqrt(exp_avg_sqs)

    if capturable:
        # TODO: use foreach_pow if/when foreach_pow is added
        bias_correction1 = [torch.pow(beta1, step) for step in state_steps]
        bias_correction2 = [torch.pow(beta2, step) for step in state_steps]
        # foreach_sub doesn't allow a scalar as the first arg,
        # so 1 - beta**step is computed as -(beta**step - 1)
        torch._foreach_sub_(bias_correction1, 1)
        torch._foreach_sub_(bias_correction2, 1)
        torch._foreach_neg_(bias_correction1)
        torch._foreach_neg_(bias_correction2)

        # foreach_div doesn't allow a scalar as the first arg,
        # so -lr / bias_correction1 is computed as -(1 / (bias_correction1 / lr))
        step_size = torch._foreach_div(bias_correction1, lr)
        torch._foreach_reciprocal_(step_size)
        torch._foreach_neg_(step_size)

        bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2)

        # Folds in (admittedly ugly) 1-elem step_size math here to avoid extra param-set-sized read+write
        # (can't fold it into addcdiv_ below because addcdiv_ requires value is a Number, not a Tensor)
        torch._foreach_div_(
            second_moment_sqrt,
            torch._foreach_mul(bias_correction2_sqrt, step_size))
        eps_over_step_size = torch._foreach_div(step_size, eps)
        torch._foreach_reciprocal_(eps_over_step_size)
        denom = torch._foreach_add(second_moment_sqrt, eps_over_step_size)

        torch._foreach_addcdiv_(params, exp_avgs, denom)
    else:
        bias_correction1 = [1 - beta1**step.item() for step in state_steps]
        bias_correction2 = [1 - beta2**step.item() for step in state_steps]

        # Negative so that addcdiv_ below moves against the gradient.
        step_size = [(lr / bc) * -1 for bc in bias_correction1]

        bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2]

        torch._foreach_div_(second_moment_sqrt, bias_correction2_sqrt)
        denom = torch._foreach_add(second_moment_sqrt, eps)

        torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)