def test_add_scalar_with_different_scalar_type(self, device):
    # int tensor with float scalar
    # should go 'slow' route
    scalar = 1.1
    tensors = [torch.tensor([1], dtype=torch.int, device=device)]
    res = torch._foreach_add(tensors, scalar)
    self.assertEqual(res, [torch.tensor([2.1], device=device)])

    # float tensor with int scalar
    # should go 'fast' route
    scalar = 1
    tensors = [torch.tensor([1.1], device=device)]
    res = torch._foreach_add(tensors, scalar)
    self.assertEqual(res, [torch.tensor([2.1], device=device)])

    # bool tensor with int scalar
    # should go 'slow' route
    scalar = 1
    tensors = [torch.tensor([False], device=device)]
    res = torch._foreach_add(tensors, scalar)
    self.assertEqual(res, [torch.tensor([1], device=device)])

    # bool tensor with float scalar
    # should go 'slow' route
    scalar = 1.1
    tensors = [torch.tensor([False], device=device)]
    res = torch._foreach_add(tensors, scalar)
    self.assertEqual(res, [torch.tensor([1.1], device=device)])
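# Standalone illustration (not part of the test class above, values are made up):
# the promotion behaviour the assertions rely on mirrors per-tensor torch.add,
# so an int tensor plus a float scalar produces a float result.
import torch

ts = [torch.tensor([1], dtype=torch.int)]
out = torch._foreach_add(ts, 1.1)
print(out[0].dtype)  # torch.float32, matching torch.add(ts[0], 1.1)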
def adadelta(params: List[Tensor],
             grads: List[Tensor],
             square_avgs: List[Tensor],
             acc_deltas: List[Tensor],
             *,
             lr: float,
             weight_decay: float,
             rho: float,
             eps: float):
    r"""Functional API that performs Adadelta algorithm computation.

    See :class:`~torch.optim.Adadelta` for details.
    """
    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
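# Minimal usage sketch of the functional adadelta API above (toy values,
# not from the original source): parameters and optimizer state lists are
# updated in place.
import torch

toy_params = [torch.ones(3)]
toy_grads = [torch.full((3,), 0.1)]
toy_square_avgs = [torch.zeros(3)]
toy_acc_deltas = [torch.zeros(3)]

adadelta(toy_params, toy_grads, toy_square_avgs, toy_acc_deltas,
         lr=1.0, weight_decay=0.0, rho=0.9, eps=1e-6)
print(toy_params[0])  # parameters moved opposite the (positive) gradient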
def _multi_tensor_adadelta(params: List[Tensor],
                           grads: List[Tensor],
                           square_avgs: List[Tensor],
                           acc_deltas: List[Tensor],
                           *,
                           lr: float,
                           weight_decay: float,
                           rho: float,
                           eps: float,
                           maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
def test_complex_scalar(self, device, dtype):
    tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)]
    complex_scalar = 3 + 5j

    # adding a complex scalar promotes real dtypes to a complex result type
    expected = [torch.add(complex_scalar, torch.zeros(10, 10, device=device, dtype=dtype))
                for _ in range(10)]

    if dtype in [torch.float16, torch.float32, torch.float64, torch.bfloat16] and device == 'cuda:0':
        # value cannot be converted to dtype without overflow
        self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar))
        self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, complex_scalar))
        return

    res = torch._foreach_add(tensors, complex_scalar)
    self.assertEqual(res, expected)

    if dtype not in [torch.complex64, torch.complex128]:
        # in-place add cannot store a complex result in a non-complex tensor
        self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar))
    else:
        torch._foreach_add_(tensors, complex_scalar)
        self.assertEqual(res, tensors)
def _multi_tensor_adam(params: List[Tensor],
                       grads: List[Tensor],
                       exp_avgs: List[Tensor],
                       exp_avg_sqs: List[Tensor],
                       max_exp_avg_sqs: List[Tensor],
                       state_steps: List[Tensor],
                       *,
                       amsgrad: bool,
                       beta1: float,
                       beta2: float,
                       lr: float,
                       weight_decay: float,
                       eps: float,
                       maximize: bool):
    if len(params) == 0:
        return

    # update steps
    torch._foreach_add_(state_steps, 1)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
    bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if amsgrad:
        # Maintains the maximum of all 2nd moment running avg. till now
        max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]

        # Use the max. for normalizing running avg. of gradient
        max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
    else:
        exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size = [(lr / bc) * -1 for bc in bias_correction1]
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
def test_int_scalar(self, device, dtype):
    tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)]
    int_scalar = 1

    # bool tensor + 1 will result in int64 tensor
    if dtype == torch.bool:
        expected = [torch.ones(10, 10, device=device, dtype=torch.int64) for _ in range(10)]
    else:
        expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)]

    res = torch._foreach_add(tensors, int_scalar)
    self.assertEqual(res, expected)

    if dtype in [torch.bool]:
        with self.assertRaisesRegex(RuntimeError,
                                    "result type Long can't be cast to the desired output type Bool"):
            torch._foreach_add_(tensors, int_scalar)
    else:
        torch._foreach_add_(tensors, int_scalar)
        self.assertEqual(res, tensors)
def test_float_scalar(self, device, dtype):
    tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)]
    float_scalar = 1.

    # float scalar + integral tensor will result in float tensor
    if dtype in [torch.uint8, torch.int8, torch.int16,
                 torch.int32, torch.int64, torch.bool]:
        expected = [torch.ones(10, 10, device=device, dtype=torch.float32) for _ in range(10)]
    else:
        expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)]

    res = torch._foreach_add(tensors, float_scalar)
    self.assertEqual(res, expected)

    if dtype in [torch.uint8, torch.int8, torch.int16,
                 torch.int32, torch.int64, torch.bool]:
        self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, float_scalar))
    else:
        torch._foreach_add_(tensors, float_scalar)
        self.assertEqual(res, tensors)
def test_add_scalar_with_different_size_tensors(self, device, dtype):
    N = 20
    H = 20
    W = 20

    tensors = []
    size_change = 0
    for _ in range(N):
        tensors.append(torch.zeros(H + size_change, W + size_change, device=device, dtype=dtype))
        size_change += 1

    res = torch._foreach_add(tensors, 1)

    size_change = 0
    for t in res:
        if dtype == torch.bool:
            dtype = torch.int64
        self.assertEqual(t, torch.ones(H + size_change, W + size_change, device=device, dtype=dtype))
        size_change += 1
def _multi_tensor_sgd(params: List[Tensor],
                      grads: List[Tensor],
                      momentum_buffer_list: List[Optional[Tensor]],
                      *,
                      weight_decay: float,
                      momentum: float,
                      lr: float,
                      dampening: float,
                      nesterov: bool,
                      maximize: bool,
                      has_sparse_grad: bool):

    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any(grad.is_sparse for grad in grads)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    if weight_decay != 0:
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    if momentum != 0:
        bufs = []

        all_states_with_momentum_buffer = True
        for i in range(len(momentum_buffer_list)):
            if momentum_buffer_list[i] is None:
                all_states_with_momentum_buffer = False
                break
            else:
                bufs.append(momentum_buffer_list[i])

        if all_states_with_momentum_buffer:
            torch._foreach_mul_(bufs, momentum)
            torch._foreach_add_(bufs, grads, alpha=1 - dampening)
        else:
            bufs = []
            for i in range(len(momentum_buffer_list)):
                if momentum_buffer_list[i] is None:
                    buf = momentum_buffer_list[i] = torch.clone(grads[i]).detach()
                else:
                    buf = momentum_buffer_list[i]
                    buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)

                bufs.append(buf)

        if nesterov:
            torch._foreach_add_(grads, bufs, alpha=momentum)
        else:
            grads = bufs

    if not has_sparse_grad:
        torch._foreach_add_(params, grads, alpha=-lr)
    else:
        # foreach APIs don't support sparse
        for i in range(len(params)):
            params[i].add_(grads[i], alpha=-lr)
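# Minimal usage sketch of the multi-tensor SGD helper above (toy tensors,
# not from the original source). Momentum buffers start out as None and are
# created in place on the first call.
import torch

toy_params = [torch.ones(4), torch.ones(2)]
toy_grads = [torch.full((4,), 0.5), torch.full((2,), 0.5)]
toy_momentum_buffers = [None, None]   # populated by the first call

_multi_tensor_sgd(toy_params, toy_grads, toy_momentum_buffers,
                  weight_decay=0.0, momentum=0.9, lr=0.1,
                  dampening=0.0, nesterov=False, maximize=False,
                  has_sparse_grad=False)
print(toy_params[0])  # each element is now 1 - 0.1 * 0.5 = 0.95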
def test_add_list_slow_path(self, device, dtype):
    # different strides
    tensor1 = torch.zeros(10, 10, device=device, dtype=dtype)
    tensor2 = torch.ones(10, 10, device=device, dtype=dtype)
    res = torch._foreach_add([tensor1], [tensor2.t()])
    torch._foreach_add_([tensor1], [tensor2])
    self.assertEqual(res, [tensor1])

    # non contiguous
    tensor1 = torch.randn(5, 2, 1, 3, device=device)[:, 0]
    tensor2 = torch.randn(5, 2, 1, 3, device=device)[:, 0]
    self.assertFalse(tensor1.is_contiguous())
    self.assertFalse(tensor2.is_contiguous())
    res = torch._foreach_add([tensor1], [tensor2])
    torch._foreach_add_([tensor1], [tensor2])
    self.assertEqual(res, [tensor1])
def test_add_list_different_sizes(self, device, dtype):
    tensors1 = [torch.zeros(10 + n, 10 + n, device=device, dtype=dtype) for n in range(10)]
    tensors2 = [torch.ones(10 + n, 10 + n, device=device, dtype=dtype) for n in range(10)]

    res = torch._foreach_add(tensors1, tensors2)
    torch._foreach_add_(tensors1, tensors2)
    self.assertEqual(res, tensors1)
    self.assertEqual(res, [torch.ones(10 + n, 10 + n, device=device, dtype=dtype) for n in range(10)])
def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype):
    # TODO: enable empty list case
    for tensors in [[torch.randn([0])]]:
        res = torch._foreach_add(tensors, 1)
        self.assertEqual(res, tensors)

        torch._foreach_add_(tensors, 1)
        self.assertEqual(res, tensors)
def adagrad(params: List[Tensor],
            grads: List[Tensor],
            state_sums: List[Tensor],
            state_steps: List[int],
            has_sparse_grad: bool,
            *,
            lr: float,
            weight_decay: float,
            lr_decay: float,
            eps: float):
    r"""Functional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.
    """
    if weight_decay != 0:
        if has_sparse_grad:
            raise RuntimeError("weight_decay option is not compatible with sparse gradients")
        torch._foreach_add_(grads, params, alpha=weight_decay)

    minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in state_steps]

    if has_sparse_grad:
        # sparse is not supported by multi_tensor. Fall back to optim.adagrad
        # implementation for sparse gradients
        for i, (param, grad, state_sum, step) in enumerate(zip(params, grads, state_sums, state_steps)):
            grad = grad.coalesce()  # the update is non-linear so indices must be unique
            grad_indices = grad._indices()
            grad_values = grad._values()
            size = grad.size()

            state_sum.add_(_make_sparse(grad, grad_indices, grad_values.pow(2)))
            std_sparse = state_sum.sparse_mask(grad)
            std_sparse_values = std_sparse._values().sqrt_().add_(eps)
            param.add_(
                _make_sparse(grad, grad_indices, grad_values / std_sparse_values),
                alpha=minus_clr[i],
            )
    else:
        grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
        state_sums = [torch.view_as_real(x) if torch.is_complex(x) else x for x in state_sums]
        torch._foreach_addcmul_(state_sums, grads, grads, value=1)
        std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
        toAdd = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
        toAdd = [torch.view_as_complex(x) if torch.is_complex(params[i]) else x
                 for i, x in enumerate(toAdd)]
        torch._foreach_add_(params, toAdd)
        state_sums = [torch.view_as_complex(x) if torch.is_complex(params[i]) else x
                      for i, x in enumerate(state_sums)]
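# Quick standalone illustration of the complex handling used above (toy data):
# foreach kernels operate on real views that share storage with the original
# complex tensors, so in-place updates through the view are reflected in the
# complex tensor, and view_as_complex reinterprets the result back.
import torch

z = torch.tensor([1 + 2j, 3 + 4j])
r = torch.view_as_real(z)        # real view of shape (2, 2), shares storage with z
r.mul_(2)                        # updates z in place through the real view
z_back = torch.view_as_complex(r)
print(z, z_back)                 # both show the doubled complex values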
def test_add_scalar_with_overlapping_tensors(self, device, dtype):
    tensors = [torch.ones(1, 1, device=device, dtype=dtype).expand(2, 1, 3)]
    expected = [torch.tensor([[[2, 2, 2]], [[2, 2, 2]]], dtype=dtype, device=device)]

    # bool tensor + 1 will result in int64 tensor
    if dtype == torch.bool:
        expected[0] = expected[0].to(torch.int64).add(1)

    res = torch._foreach_add(tensors, 1)
    self.assertEqual(res, expected)
def test_bool_scalar(self, device, dtype):
    tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)]
    bool_scalar = True

    expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)]

    res = torch._foreach_add(tensors, bool_scalar)
    self.assertEqual(res, expected)

    torch._foreach_add_(tensors, bool_scalar)
    self.assertEqual(res, tensors)
def test_add_list_same_size(self, device, dtype):
    tensors1 = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)]
    tensors2 = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)]

    res = torch._foreach_add(tensors1, tensors2)
    torch._foreach_add_(tensors1, tensors2)
    self.assertEqual(res, tensors1)
    self.assertEqual(res[0], torch.ones(10, 10, device=device, dtype=dtype))
def test_add_scalar_with_different_tensor_dtypes(self, device):
    tensors = [torch.tensor([1], dtype=torch.float, device=device),
               torch.tensor([1], dtype=torch.int, device=device)]

    expected = [torch.tensor([2], dtype=torch.float, device=device),
                torch.tensor([2], dtype=torch.int, device=device)]
    res = torch._foreach_add(tensors, 1)
    self.assertEqual(res, expected)
def test_add_scalar_with_same_size_tensors(self, device, dtype):
    N = 20
    H = 20
    W = 20

    tensors = []
    for _ in range(N):
        tensors.append(torch.zeros(H, W, device=device, dtype=dtype))

    res = torch._foreach_add(tensors, 1)
    for t in res:
        if dtype == torch.bool:
            dtype = torch.int64
        self.assertEqual(t, torch.ones(H, W, device=device, dtype=dtype))
def _multi_tensor_nadam(params: List[Tensor],
                        grads: List[Tensor],
                        exp_avgs: List[Tensor],
                        exp_avg_sqs: List[Tensor],
                        mu_products: List[Tensor],
                        state_steps: List[Tensor],
                        *,
                        beta1: float,
                        beta2: float,
                        lr: float,
                        weight_decay: float,
                        momentum_decay: float,
                        eps: float):

    if len(params) == 0:
        return

    # update steps
    torch._foreach_add_(state_steps, 1)

    bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
    bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]
    mus = [beta1 * (1. - 0.5 * (0.96 ** (step.item() * momentum_decay))) for step in state_steps]
    mu_nexts = [beta1 * (1. - 0.5 * (0.96 ** ((step.item() + 1) * momentum_decay)))
                for step in state_steps]

    # update mu_products
    torch._foreach_mul_(mu_products, mus)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
    bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
    torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
    denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size_grads = [(lr * (1. - mu) / (1. - mu_product.item())) * -1
                       for mu_product, mu in zip(mu_products, mus)]
    step_size_expavg = [(lr * mu_next / (1. - mu_product.item() * mu_next)) * -1
                        for mu_product, mu_next in zip(mu_products, mu_nexts)]

    torch._foreach_addcdiv_(params, grads, denom, step_size_grads)
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size_expavg)
def _multi_tensor_adagrad(params: List[Tensor],
                          grads: List[Tensor],
                          state_sums: List[Tensor],
                          state_steps: List[Tensor],
                          *,
                          lr: float,
                          weight_decay: float,
                          lr_decay: float,
                          eps: float,
                          has_sparse_grad: bool):

    # Foreach functions will throw errors if given empty lists
    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any([grad.is_sparse for grad in grads])

    if has_sparse_grad:
        return _single_tensor_adagrad(params,
                                      grads,
                                      state_sums,
                                      state_steps,
                                      lr=lr,
                                      weight_decay=weight_decay,
                                      lr_decay=lr_decay,
                                      eps=eps,
                                      has_sparse_grad=has_sparse_grad)

    # Update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in state_steps]

    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
    state_sums = [torch.view_as_real(x) if torch.is_complex(x) else x for x in state_sums]
    torch._foreach_addcmul_(state_sums, grads, grads, value=1)
    std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
    toAdd = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
    toAdd = [torch.view_as_complex(x) if torch.is_complex(params[i]) else x
             for i, x in enumerate(toAdd)]
    torch._foreach_add_(params, toAdd)
    state_sums = [torch.view_as_complex(x) if torch.is_complex(params[i]) else x
                  for i, x in enumerate(state_sums)]
def nadam(params: List[Tensor],
          grads: List[Tensor],
          exp_avg: List[Tensor],
          exp_avg_sq: List[Tensor],
          mu_products: List[Tensor],
          states: List[Dict],
          *,
          beta1: float,
          beta2: float,
          lr: float,
          weight_decay: float,
          momentum_decay: float,
          eps: float):
    r"""Functional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    """
    bias_correction1 = [1 - beta1 ** state['step'] for state in states]
    bias_correction2 = [1 - beta2 ** state['step'] for state in states]
    mus = [beta1 * (1. - 0.5 * (0.96 ** (state['step'] * momentum_decay))) for state in states]
    mu_nexts = [beta1 * (1. - 0.5 * (0.96 ** ((state['step'] + 1) * momentum_decay)))
                for state in states]
    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avg, beta1)
    torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sq, beta2)
    torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

    exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq)
    bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
    torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
    denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size_grads = [(lr * (1. - mu) / (1. - mu_product)) * -1
                       for mu_product, mu in zip(mu_products, mus)]
    step_size_expavg = [(lr * mu_next / (1. - mu_product * mu_next)) * -1
                        for mu_product, mu_next in zip(mu_products, mu_nexts)]
    torch._foreach_addcdiv_(params, grads, denom, step_size_grads)
    torch._foreach_addcdiv_(params, exp_avg, denom, step_size_expavg)
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        amsgrad = group['amsgrad']

        grads = []
        states = []
        exp_avg = []
        exp_avg_sq = []
        max_exp_avg_sq = []
        params_with_grad = []

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                params_with_grad.append(p)
                grads.append(p.grad)

        for p in params_with_grad:
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

            exp_avg.append(state['exp_avg'])
            exp_avg_sq.append(state['exp_avg_sq'])

            if amsgrad:
                max_exp_avg_sq.append(state['max_exp_avg_sq'])

            state['step'] += 1
            states.append(state)

        beta1, beta2 = group['betas']

        bias_correction1 = [1 - beta1 ** state['step'] for state in states]
        bias_correction2 = [1 - beta2 ** state['step'] for state in states]

        if group['weight_decay'] != 0:
            grads = torch._foreach_add(grads, params_with_grad, alpha=group['weight_decay'])

        #
        # Decay the first and second moment running average coefficient
        #
        torch._foreach_mul_(exp_avg, beta1)
        torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

        torch._foreach_mul_(exp_avg_sq, beta2)
        torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            max_exp_avg_sq = torch._foreach_maximum(max_exp_avg_sq, exp_avg_sq)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sq)
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, group['eps'])
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq)
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, group['eps'])

        step_size = [(group['lr'] / bc) * -1 for bc in bias_correction1]
        torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size)

    return loss
def test_add_list_error_cases(self, device):
    tensors1 = []
    tensors2 = []

    # Empty lists
    with self.assertRaises(RuntimeError):
        torch._foreach_add(tensors1, tensors2)
    with self.assertRaises(RuntimeError):
        torch._foreach_add_(tensors1, tensors2)

    # One empty list
    tensors1.append(torch.tensor([1], device=device))
    with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."):
        torch._foreach_add(tensors1, tensors2)
    with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."):
        torch._foreach_add_(tensors1, tensors2)

    # Lists have different amount of tensors
    tensors2.append(torch.tensor([1], device=device))
    tensors2.append(torch.tensor([1], device=device))
    with self.assertRaisesRegex(RuntimeError, "Tensor lists must have the same number of tensors, got 1 and 2"):
        torch._foreach_add(tensors1, tensors2)
    with self.assertRaisesRegex(RuntimeError, "Tensor lists must have the same number of tensors, got 1 and 2"):
        torch._foreach_add_(tensors1, tensors2)

    # Different dtypes
    tensors1 = [torch.zeros(10, 10, device=device, dtype=torch.float) for _ in range(10)]
    tensors2 = [torch.ones(10, 10, device=device, dtype=torch.int) for _ in range(10)]

    with self.assertRaisesRegex(RuntimeError, "All tensors in the tensor list must have the same dtype."):
        torch._foreach_add(tensors1, tensors2)
    with self.assertRaisesRegex(RuntimeError, "All tensors in the tensor list must have the same dtype."):
        torch._foreach_add_(tensors1, tensors2)

    # different devices
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        tensor1 = torch.zeros(10, 10, device="cuda:0")
        tensor2 = torch.ones(10, 10, device="cuda:1")
        with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"):
            torch._foreach_add([tensor1], [tensor2])
        with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"):
            torch._foreach_add_([tensor1], [tensor2])

    # Corresponding tensors with different sizes
    tensors1 = [torch.zeros(10, 10, device=device) for _ in range(10)]
    tensors2 = [torch.ones(11, 11, device=device) for _ in range(10)]
    with self.assertRaisesRegex(RuntimeError, "Corresponding tensors in lists must have the same size"):
        torch._foreach_add(tensors1, tensors2)
    with self.assertRaisesRegex(RuntimeError, r", got \[10, 10\] and \[11, 11\]"):
        torch._foreach_add_(tensors1, tensors2)
def test_bin_op_scalar_with_different_tensor_dtypes(self, device):
    tensors = [torch.tensor([1.1], dtype=torch.float, device=device),
               torch.tensor([1], dtype=torch.long, device=device)]
    self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, 1))
def step(self, closure=None):
    """Performs a single optimization step.

    Args:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        amsgrad = group["amsgrad"]

        grads = []
        states = []
        exp_avg = []
        exp_avg_sq = []
        max_exp_avg_sq = []
        params_with_grad = []

        for p in group["params"]:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError("AdamW does not support sparse gradients")

                # Perform stepweight decay
                p.mul_(1 - group["lr"] * group["weight_decay"])

                params_with_grad.append(p)
                grads.append(p.grad)

        for p in params_with_grad:
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state["step"] = 0
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state["max_exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)

            exp_avg.append(state["exp_avg"])
            exp_avg_sq.append(state["exp_avg_sq"])

            if amsgrad:
                max_exp_avg_sq.append(state["max_exp_avg_sq"])

            state["step"] += 1
            states.append(state)

        beta1, beta2 = group["betas"]

        bias_correction1 = [1 - beta1 ** state["step"] for state in states]
        bias_correction2 = [1 - beta2 ** state["step"] for state in states]

        #
        # Decay the first and second moment running average coefficient
        #
        torch._foreach_mul_(exp_avg, beta1)
        torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

        torch._foreach_mul_(exp_avg_sq, beta2)
        torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            max_exp_avg_sq = torch._foreach_maximum(max_exp_avg_sq, exp_avg_sq)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(torch._foreach_add(max_exp_avg_sq, group["eps"]))
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            denom = torch._foreach_div(max_exp_avg_sq_sqrt, bias_correction_sqrt)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(torch._foreach_add(exp_avg_sq, group["eps"]))
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            denom = torch._foreach_div(exp_avg_sq_sqrt, bias_correction_sqrt)

        step_size = [-1 * (group["lr"] / bc) for bc in bias_correction1]
        torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size)

    return loss
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        grads = []
        params_with_grad = []
        states = []
        has_sparse_grad = False

        for p in group['params']:
            if p.grad is not None:
                grads.append(p.grad)
                params_with_grad.append(p)
                states.append(self.state[p])

                if p.grad.is_sparse:
                    has_sparse_grad = True

                    if momentum != 0:
                        raise RuntimeError('SGD does not support momentum for sparse gradients')

        if grads == []:
            return loss

        if weight_decay != 0:
            grads = torch._foreach_add(grads, params_with_grad, alpha=weight_decay)

        if momentum != 0:
            bufs = []

            all_states_with_momentum_buffer = True
            for i in range(len(states)):
                if 'momentum_buffer' not in states[i]:
                    all_states_with_momentum_buffer = False
                    break
                else:
                    bufs.append(states[i]['momentum_buffer'])

            if all_states_with_momentum_buffer:
                torch._foreach_mul_(bufs, momentum)
                torch._foreach_add_(bufs, grads, alpha=1 - dampening)
            else:
                bufs = []
                for i in range(len(states)):
                    if 'momentum_buffer' not in states[i]:
                        buf = states[i]['momentum_buffer'] = torch.clone(grads[i]).detach()
                    else:
                        buf = states[i]['momentum_buffer']
                        buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)

                    bufs.append(buf)

            if nesterov:
                torch._foreach_add_(grads, bufs, alpha=momentum)
            else:
                grads = bufs

        if not has_sparse_grad:
            torch._foreach_add_(params_with_grad, grads, alpha=-group['lr'])
        else:
            # foreach APIs don't support sparse
            for i in range(len(params_with_grad)):
                params_with_grad[i].add_(grads[i], alpha=-group['lr'])

    return loss
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        grads = []
        params_with_grad = []
        states = []
        square_avgs = []
        acc_deltas = []

        rho, eps = group['rho'], group['eps']

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError('Adadelta does not support sparse gradients')

                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    state['acc_delta'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                square_avgs.append(state['square_avg'])
                acc_deltas.append(state['acc_delta'])

                state['step'] += 1
                states.append(state)

        if group['weight_decay'] != 0:
            torch._foreach_add_(grads, params_with_grad, alpha=group['weight_decay'])

        torch._foreach_mul_(square_avgs, rho)
        torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

        std = torch._foreach_add(square_avgs, eps)
        torch._foreach_sqrt_(std)

        deltas = torch._foreach_add(acc_deltas, eps)
        torch._foreach_sqrt_(deltas)
        torch._foreach_div_(deltas, std)
        torch._foreach_mul_(deltas, grads)

        torch._foreach_add_(params_with_grad, deltas, alpha=-group['lr'])

        torch._foreach_mul_(acc_deltas, rho)
        torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)

    return loss
def test_add_scalar_with_empty_list(self, device, dtype):
    tensors = []
    with self.assertRaises(RuntimeError):
        torch._foreach_add(tensors, 1)
def _multi_tensor_adamw(params: List[Tensor],
                        grads: List[Tensor],
                        exp_avgs: List[Tensor],
                        exp_avg_sqs: List[Tensor],
                        max_exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor],
                        *,
                        amsgrad: bool,
                        beta1: float,
                        beta2: float,
                        lr: float,
                        weight_decay: float,
                        eps: float,
                        maximize: bool,
                        capturable: bool):
    if len(params) == 0:
        return

    if capturable:
        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
            "If capturable=True, params and state_steps must be CUDA tensors."

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
    exp_avgs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avgs]
    exp_avg_sqs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avg_sqs]
    params = [torch.view_as_real(x) if torch.is_complex(x) else x for x in params]

    # update steps
    torch._foreach_add_(state_steps, 1)

    # Perform stepweight decay
    torch._foreach_mul_(params, 1 - lr * weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if capturable:
        # TODO: use foreach_pow if/when foreach_pow is added
        bias_correction1 = [torch.pow(beta1, step) for step in state_steps]
        bias_correction2 = [torch.pow(beta2, step) for step in state_steps]
        # foreach_sub doesn't allow a scalar as the first arg
        torch._foreach_sub_(bias_correction1, 1)
        torch._foreach_sub_(bias_correction2, 1)
        torch._foreach_neg_(bias_correction1)
        torch._foreach_neg_(bias_correction2)

        # foreach_div doesn't allow a scalar as the first arg
        step_size = torch._foreach_div(bias_correction1, lr)
        torch._foreach_reciprocal_(step_size)
        torch._foreach_neg_(step_size)

        bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)

            # Folds in (admittedly ugly) 1-elem step_size math here to avoid extra param-set-sized read+write
            # (can't fold it into addcdiv_ below because addcdiv_ requires value is a Number, not a Tensor)
            torch._foreach_div_(max_exp_avg_sq_sqrt,
                                torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps_over_step_size)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt,
                                torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps_over_step_size)

        torch._foreach_addcdiv_(params, exp_avgs, denom)
    else:
        bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
        bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

        step_size = [(lr / bc) * -1 for bc in bias_correction1]

        bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2]

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

        torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
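# Small sanity sketch (illustrative values only, not part of the file):
# the capturable branch's in-place tensor algebra for step_size reproduces
# the eager branch's -(lr / bias_correction1) for a scalar step count.
import torch

beta1, lr = 0.9, 1e-3
step = torch.tensor(5.0)

# eager path: plain Python math
eager = -(lr / (1 - beta1 ** step.item()))

# capturable path: same algebra expressed with tensor ops
bc1 = torch.pow(beta1, step)      # beta1 ** step
bc1 = -(bc1 - 1)                  # 1 - beta1 ** step
cap = -(1 / (bc1 / lr))           # == -(lr / bc1)

assert torch.isclose(cap, torch.tensor(eager))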