def test_min_max(self, device, dtype):
    """Compare ``_foreach_maximum`` / ``_foreach_minimum`` with per-tensor references.

    For every ``N`` in ``N_values`` two lists of test tensors are fetched and
    the foreach results are checked against ``torch.max`` / ``torch.min``
    applied to the first ``N`` tensor pairs.
    """
    for N in N_values:
        lhs = self._get_test_data(device, dtype, N)
        rhs = self._get_test_data(device, dtype, N)

        # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32
        # and casts output back to fp16/bf16.
        if self.device_type == 'cuda' and dtype in (torch.float16, torch.bfloat16):
            control_dtype = torch.float32
        else:
            control_dtype = dtype

        def reference(op):
            # Element-wise reference computed in the control dtype, then
            # cast back to the dtype under test.
            return [
                op(lhs[i].to(dtype=control_dtype),
                   rhs[i].to(dtype=control_dtype)).to(dtype=dtype)
                for i in range(N)
            ]

        expected_max = reference(torch.max)
        expected_min = reference(torch.min)

        self.assertEqual(torch._foreach_maximum(lhs, rhs), expected_max)
        self.assertEqual(torch._foreach_minimum(lhs, rhs), expected_min)
def _multi_tensor_adam(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor], state_steps: List[Tensor], *, amsgrad: bool, beta1: float, beta2: float, lr: float, weight_decay: float, eps: float, maximize: bool): if len(params) == 0: return # update steps torch._foreach_add_(state_steps, 1) if maximize: grads = torch._foreach_neg(tuple(grads)) # type: ignore[assignment] bias_correction1 = [1 - beta1 ** step.item() for step in state_steps] bias_correction2 = [1 - beta2 ** step.item() for step in state_steps] if weight_decay != 0: torch._foreach_add_(grads, params, alpha=weight_decay) torch._foreach_mul_(exp_avgs, beta1) torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1) torch._foreach_mul_(exp_avg_sqs, beta2) torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2) if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs) # type: ignore[assignment] # Use the max. for normalizing running avg. of gradient max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs) bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt) denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps) else: exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs) bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt) denom = torch._foreach_add(exp_avg_sq_sqrt, eps) step_size = [(lr / bc) * -1 for bc in bias_correction1] torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
def test_max_min_inf_nan(self, device, dtype):
    """Check foreach maximum/minimum on single-element inf / -inf / nan tensors.

    The expected values come from ``torch.max`` / ``torch.min`` applied to
    each pair of tensors.
    """
    # (left operand, right operand) for each tensor pair.
    pairs = [(inf, -inf), (-inf, inf), (nan, inf), (nan, nan)]
    a = [torch.tensor([lhs], device=device, dtype=dtype) for lhs, _ in pairs]
    b = [torch.tensor([rhs], device=device, dtype=dtype) for _, rhs in pairs]

    checks = (
        (torch.max, torch._foreach_maximum),
        (torch.min, torch._foreach_minimum),
    )
    for elementwise, foreach in checks:
        expected = [elementwise(x, y) for x, y in zip(a, b)]
        actual = foreach(a, b)
        self.assertEqual(expected, actual)
def test_max_min_float_inf_nan(self, device, dtype):
    """Check foreach maximum/minimum on tensors built from float inf / nan.

    Each list holds four single-element tensors; the foreach results are
    compared with ``torch.max`` / ``torch.min`` applied pair by pair.
    """
    def scalar(value):
        # One-element tensor on the device / dtype under test.
        return torch.tensor([value], device=device, dtype=dtype)

    a = [
        scalar(float('inf')),
        scalar(-float('inf')),
        scalar(float('nan')),
        scalar(float('nan')),
    ]
    b = [
        scalar(-float('inf')),
        scalar(float('inf')),
        scalar(float('inf')),
        scalar(float('nan')),
    ]

    reference_max = [torch.max(x, y) for x, y in zip(a, b)]
    self.assertEqual(reference_max, torch._foreach_maximum(a, b))

    reference_min = [torch.min(x, y) for x, y in zip(a, b)]
    self.assertEqual(reference_min, torch._foreach_minimum(a, b))
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.

    Returns:
        The value returned by ``closure``, or ``None`` when no closure is
        given.

    Raises:
        RuntimeError: if any parameter has a sparse gradient.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        amsgrad = group['amsgrad']

        grads = []
        states = []
        exp_avg = []
        exp_avg_sq = []
        max_exp_avg_sq = []
        params_with_grad = []

        for p in group['params']:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError(
                        'Adam does not support sparse gradients, please consider SparseAdam instead'
                    )
                params_with_grad.append(p)
                grads.append(p.grad)

        # Nothing to update in this group; also keeps empty tensor lists
        # away from the foreach kernels below.
        if not params_with_grad:
            continue

        for p in params_with_grad:
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)

            exp_avg.append(state['exp_avg'])
            exp_avg_sq.append(state['exp_avg_sq'])

            if amsgrad:
                max_exp_avg_sq.append(state['max_exp_avg_sq'])

            state['step'] += 1
            states.append(state)

        beta1, beta2 = group['betas']

        bias_correction1 = [1 - beta1**state['step'] for state in states]
        bias_correction2 = [1 - beta2**state['step'] for state in states]

        if group['weight_decay'] != 0:
            # Out of place, so p.grad itself is left untouched.
            grads = torch._foreach_add(grads,
                                       params_with_grad,
                                       alpha=group['weight_decay'])

        #
        # Decay the first and second moment running average coefficient
        #
        torch._foreach_mul_(exp_avg, beta1)
        torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

        torch._foreach_mul_(exp_avg_sq, beta2)
        torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now.
            # Written into the tensors stored in self.state so the maximum
            # is still there on the next call.
            for running_max, current in zip(max_exp_avg_sq, exp_avg_sq):
                torch.maximum(running_max, current, out=running_max)
            # Use the max. for normalizing running avg. of gradient
            second_moments = max_exp_avg_sq
        else:
            second_moments = exp_avg_sq

        second_moment_sqrt = torch._foreach_sqrt(second_moments)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(second_moment_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(second_moment_sqrt, group['eps'])

        step_size = [(group['lr'] / bc) * -1 for bc in bias_correction1]
        torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size)

    return loss
def _multi_tensor_adamw(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor], state_steps: List[Tensor], *, amsgrad: bool, beta1: float, beta2: float, lr: float, weight_decay: float, eps: float, maximize: bool, capturable: bool): if len(params) == 0: return if capturable: assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \ "If capturable=True, params and state_steps must be CUDA tensors." if maximize: grads = torch._foreach_neg(tuple(grads)) # type: ignore[assignment] # update steps torch._foreach_add_(state_steps, 1) # Perform stepweight decay torch._foreach_mul_(params, 1 - lr * weight_decay) # Decay the first and second moment running average coefficient torch._foreach_mul_(exp_avgs, beta1) torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1) torch._foreach_mul_(exp_avg_sqs, beta2) torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2) if capturable: # TODO: use foreach_pow if/when foreach_pow is added bias_correction1 = [torch.pow(beta1, step) for step in state_steps] bias_correction2 = [torch.pow(beta2, step) for step in state_steps] # foreach_sub doesn't allow a scalar as the first arg torch._foreach_sub_(bias_correction1, 1) torch._foreach_sub_(bias_correction2, 1) torch._foreach_neg_(bias_correction1) torch._foreach_neg_(bias_correction2) # foreach_div doesn't allow a scalar as the first arg step_size = torch._foreach_div(bias_correction1, lr) torch._foreach_reciprocal_(step_size) torch._foreach_neg_(step_size) bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2) if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now max_exp_avg_sqs = torch._foreach_maximum( max_exp_avg_sqs, exp_avg_sqs) # type: ignore[assignment] # Use the max. for normalizing running avg. 
of gradient max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs) # Folds in (admittedly ugly) 1-elem step_size math here to avoid extra param-set-sized read+write # (can't fold it into addcdiv_ below because addcdiv_ requires value is a Number, not a Tensor) torch._foreach_div_( max_exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size)) eps_over_step_size = torch._foreach_div(step_size, eps) torch._foreach_reciprocal_(eps_over_step_size) denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps_over_step_size) else: exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs) torch._foreach_div_( exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size)) eps_over_step_size = torch._foreach_div(step_size, eps) torch._foreach_reciprocal_(eps_over_step_size) denom = torch._foreach_add(exp_avg_sq_sqrt, eps_over_step_size) torch._foreach_addcdiv_(params, exp_avgs, denom) else: bias_correction1 = [1 - beta1**step.item() for step in state_steps] bias_correction2 = [1 - beta2**step.item() for step in state_steps] step_size = [(lr / bc) * -1 for bc in bias_correction1] bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2] if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now max_exp_avg_sqs = torch._foreach_maximum( max_exp_avg_sqs, exp_avg_sqs) # type: ignore[assignment] # Use the max. for normalizing running avg. of gradient max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs) torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction2_sqrt) denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps) else: exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs) torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt) denom = torch._foreach_add(exp_avg_sq_sqrt, eps) torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
def step(self, closure=None):
    """Performs a single optimization step.

    Args:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.

    Returns:
        The value returned by ``closure``, or ``None`` when no closure is
        given.

    Raises:
        RuntimeError: if any parameter has a sparse gradient.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        amsgrad = group["amsgrad"]

        grads = []
        states = []
        exp_avg = []
        exp_avg_sq = []
        max_exp_avg_sq = []
        params_with_grad = []

        for p in group["params"]:
            if p.grad is not None:
                if p.grad.is_sparse:
                    raise RuntimeError(
                        "AdamW does not support sparse gradients")

                # Perform stepweight decay
                p.mul_(1 - group["lr"] * group["weight_decay"])

                params_with_grad.append(p)
                grads.append(p.grad)

        # Nothing to update in this group; also keeps empty tensor lists
        # away from the foreach kernels below.
        if not params_with_grad:
            continue

        for p in params_with_grad:
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state["step"] = 0
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values.
                # NOTE: this buffer starts at ones here (original remark:
                # "torch init to zeros").
                state["exp_avg_sq"] = torch.ones_like(
                    p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state["max_exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)

            exp_avg.append(state["exp_avg"])
            exp_avg_sq.append(state["exp_avg_sq"])

            if amsgrad:
                max_exp_avg_sq.append(state["max_exp_avg_sq"])

            state["step"] += 1
            states.append(state)

        beta1, beta2 = group["betas"]

        bias_correction1 = [1 - beta1**state["step"] for state in states]
        bias_correction2 = [1 - beta2**state["step"] for state in states]

        #
        # Decay the first and second moment running average coefficient
        #
        torch._foreach_mul_(exp_avg, beta1)
        torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

        torch._foreach_mul_(exp_avg_sq, beta2)
        torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now.
            # Written into the tensors stored in self.state so the maximum
            # is still there on the next call.
            for running_max, current in zip(max_exp_avg_sq, exp_avg_sq):
                torch.maximum(running_max, current, out=running_max)
            # Use the max. for normalizing running avg. of gradient
            second_moments = max_exp_avg_sq
        else:
            second_moments = exp_avg_sq

        # eps is added before the square root in this implementation.
        second_moment_sqrt = torch._foreach_sqrt(
            torch._foreach_add(second_moments, group["eps"]))
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        denom = torch._foreach_div(second_moment_sqrt, bias_correction_sqrt)

        step_size = [-1 * (group["lr"] / bc) for bc in bias_correction1]
        torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size)

    return loss