from typing import List

import torch
from torch import Tensor


def _multi_tensor_adadelta(params: List[Tensor],
                           grads: List[Tensor],
                           square_avgs: List[Tensor],
                           acc_deltas: List[Tensor],
                           *,
                           lr: float,
                           weight_decay: float,
                           rho: float,
                           eps: float,
                           maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        # Classic L2 regularization: fold the decay term into the gradients.
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the running average of squared gradients.
    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    # delta = sqrt(acc_delta + eps) / sqrt(square_avg + eps) * grad
    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    # Decay the running average of squared updates.
    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
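# Hedged smoke test for _multi_tensor_adadelta above. Shapes and
# hyperparameters are illustrative assumptions, not values from the source;
# maximize=True flips the gradients so the step ascends the objective.
example_params = [torch.randn(3), torch.randn(5)]
example_grads = [torch.randn_like(p) for p in example_params]
example_square_avgs = [torch.zeros_like(p) for p in example_params]
example_acc_deltas = [torch.zeros_like(p) for p in example_params]

_multi_tensor_adadelta(example_params, example_grads,
                       example_square_avgs, example_acc_deltas,
                       lr=1.0, weight_decay=0.0, rho=0.9, eps=1e-6,
                       maximize=True)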
def _multi_tensor_rmsprop(params: List[Tensor],
                          grads: List[Tensor],
                          square_avgs: List[Tensor],
                          grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor],
                          *,
                          lr: float,
                          alpha: float,
                          eps: float,
                          weight_decay: float,
                          momentum: float,
                          centered: bool):

    if len(params) == 0:
        return

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Update the running average of squared gradients.
    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        # Centered variant: normalize by an estimate of the gradient variance
        # rather than the raw second moment.
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs, grad_avgs, grad_avgs, value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)
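# Hedged usage sketch for the plain variant above: one non-centered step with
# momentum. Every value is illustrative; nothing here comes from the source.
rms_params = [torch.randn(4)]
rms_grads = [torch.randn(4)]
rms_square_avgs = [torch.zeros(4)]
rms_grad_avgs = [torch.zeros(4)]          # unused when centered=False
rms_momentum_buffers = [torch.zeros(4)]

_multi_tensor_rmsprop(rms_params, rms_grads, rms_square_avgs,
                      rms_grad_avgs, rms_momentum_buffers,
                      lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0.0,
                      momentum=0.9, centered=False)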
def adadelta(params: List[Tensor],
             grads: List[Tensor],
             square_avgs: List[Tensor],
             acc_deltas: List[Tensor],
             *,
             lr: float,
             weight_decay: float,
             rho: float,
             eps: float):
    r"""Functional API that performs Adadelta algorithm computation.

    See :class:`~torch.optim.Adadelta` for details.
    """

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
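# Hedged cross-check: the foreach-based adadelta above should agree with a
# plain per-tensor rendition of the same recurrences. Seed, shapes, and
# tolerances are arbitrary choices for this sketch.
torch.manual_seed(0)
p, g = torch.randn(4), torch.randn(4)
sq, acc = torch.rand(4), torch.rand(4)
p_ref, g_ref, sq_ref, acc_ref = p.clone(), g.clone(), sq.clone(), acc.clone()

adadelta([p], [g], [sq], [acc], lr=1.0, weight_decay=0.0, rho=0.9, eps=1e-6)

# Per-tensor reference: E[g^2] <- rho*E[g^2] + (1-rho)*g^2, then
# delta = sqrt(E[dx^2] + eps) / sqrt(E[g^2] + eps) * g, and so on.
rho, eps, lr = 0.9, 1e-6, 1.0
sq_ref.mul_(rho).addcmul_(g_ref, g_ref, value=1 - rho)
delta = (acc_ref + eps).sqrt_().div_((sq_ref + eps).sqrt_()).mul_(g_ref)
p_ref.add_(delta, alpha=-lr)
acc_ref.mul_(rho).addcmul_(delta, delta, value=1 - rho)

assert torch.allclose(p, p_ref) and torch.allclose(acc, acc_ref)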
def _multi_tensor_rmsprop(params: List[Tensor],
                          grads: List[Tensor],
                          square_avgs: List[Tensor],
                          grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor],
                          *,
                          lr: float,
                          alpha: float,
                          eps: float,
                          weight_decay: float,
                          momentum: float,
                          centered: bool,
                          maximize: bool,
                          differentiable: bool):

    if len(params) == 0:
        return

    assert not differentiable, "_foreach ops don't support autograd"

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    def _view_complex_as_real(tensor_list):
        # Foreach kernels operate on real tensors, so complex tensors are
        # routed through their real views.
        return [
            torch.view_as_real(t) if torch.is_complex(t) else t
            for t in tensor_list
        ]

    grads = _view_complex_as_real(grads)
    params = _view_complex_as_real(params)
    square_avgs = _view_complex_as_real(square_avgs)

    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        grad_avgs = _view_complex_as_real(grad_avgs)
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs, grad_avgs, grad_avgs, value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        momentum_buffer_list = _view_complex_as_real(momentum_buffer_list)
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)
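# Hedged usage sketch for the maximize/complex-aware variant above. A complex
# parameter exercises the view_as_real path; every value here is illustrative
# and the state layout (complex buffers mirroring the params) is an assumption.
cparams = [torch.randn(2, dtype=torch.complex64)]
cgrads = [torch.randn(2, dtype=torch.complex64)]
csq = [torch.zeros(2, dtype=torch.complex64)]
cga = [torch.zeros(2, dtype=torch.complex64)]
cbuf = [torch.zeros(2, dtype=torch.complex64)]

_multi_tensor_rmsprop(cparams, cgrads, csq, cga, cbuf,
                      lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0.0,
                      momentum=0.9, centered=True, maximize=True,
                      differentiable=False)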
def step(self, closure=None):
    """Performs a single optimization step.

    Args:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        grads = []
        params_with_grad = []
        states = []
        square_avg = []
        alpha = group['alpha']

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError('RMSprop does not support sparse gradients')
                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    if group['momentum'] > 0:
                        state['momentum_buffer'] = torch.zeros_like(
                            p, memory_format=torch.preserve_format)
                    if group['centered']:
                        state['grad_avg'] = torch.zeros_like(
                            p, memory_format=torch.preserve_format)

                state['step'] += 1
                states.append(state)
                square_avg.append(state['square_avg'])

        # Skip groups in which no parameter received a gradient; the foreach
        # ops below reject empty tensor lists.
        if len(grads) == 0:
            continue

        if group['weight_decay'] != 0:
            torch._foreach_add_(grads, params_with_grad, alpha=group['weight_decay'])

        torch._foreach_mul_(square_avg, alpha)
        torch._foreach_addcmul_(square_avg, grads, grads, value=1 - alpha)

        if group['centered']:
            grad_avgs = [s['grad_avg'] for s in states]
            torch._foreach_mul_(grad_avgs, alpha)
            torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
            avg = torch._foreach_addcmul(square_avg, grad_avgs, grad_avgs, value=-1)
            torch._foreach_sqrt_(avg)
            torch._foreach_add_(avg, group['eps'])
        else:
            avg = torch._foreach_sqrt(square_avg)
            torch._foreach_add_(avg, group['eps'])

        if group['momentum'] > 0:
            buf = [s['momentum_buffer'] for s in states]
            torch._foreach_mul_(buf, group['momentum'])
            torch._foreach_addcdiv_(buf, grads, avg)
            torch._foreach_add_(params_with_grad, buf, alpha=-group['lr'])
        else:
            torch._foreach_addcdiv_(params_with_grad, grads, avg, value=-group['lr'])

    return loss
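# Hedged usage sketch for a step() like the one above. The multi-tensor class
# this method belongs to is not shown, so the stock torch.optim.RMSprop is
# used as a stand-in; the model and data are placeholders.
import torch.nn.functional as F

model = torch.nn.Linear(10, 1)
opt = torch.optim.RMSprop(model.parameters(), lr=1e-3, momentum=0.9, centered=True)
x, y = torch.randn(8, 10), torch.randn(8, 1)

def closure():
    opt.zero_grad()
    loss = F.mse_loss(model(x), y)
    loss.backward()
    return loss

step_loss = opt.step(closure)  # the closure is re-evaluated under enable_grad()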
def step(self, closure=None):
    """Performs a single optimization step.

    Args:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        grads = []
        params_with_grad = []
        states = []
        square_avgs = []
        acc_deltas = []
        rho, eps = group['rho'], group['eps']

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError('Adadelta does not support sparse gradients')
                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    state['acc_delta'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)

                square_avgs.append(state['square_avg'])
                acc_deltas.append(state['acc_delta'])

                state['step'] += 1
                states.append(state)

        # Skip groups in which no parameter received a gradient; the foreach
        # ops below reject empty tensor lists.
        if len(grads) == 0:
            continue

        if group['weight_decay'] != 0:
            torch._foreach_add_(grads, params_with_grad, alpha=group['weight_decay'])

        torch._foreach_mul_(square_avgs, rho)
        torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

        std = torch._foreach_add(square_avgs, eps)
        torch._foreach_sqrt_(std)

        deltas = torch._foreach_add(acc_deltas, eps)
        torch._foreach_sqrt_(deltas)
        torch._foreach_div_(deltas, std)
        torch._foreach_mul_(deltas, grads)

        torch._foreach_add_(params_with_grad, deltas, alpha=-group['lr'])

        torch._foreach_mul_(acc_deltas, rho)
        torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)

    return loss
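# Hedged sketch of the no-closure path through a step() like the one above,
# with the stock torch.optim.Adadelta standing in for the multi-tensor class.
w = torch.nn.Parameter(torch.randn(5))
ada_opt = torch.optim.Adadelta([w], lr=1.0, rho=0.9, eps=1e-6)

(w ** 2).sum().backward()
ada_opt.step()       # applies the square_avg / acc_delta recurrences shown above
ada_opt.zero_grad()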