def _unscale_main_grads_and_check_for_nan(self):
    main_grads = []
    # fp32 params from float16 ones.
    for main_group in self.fp32_from_float16_groups:
        for main_param in main_group:
            if main_param.grad is not None:
                main_grads.append(main_param.grad.data)
    # Append fp32 parameters.
    for main_group in self.fp32_from_fp32_groups:
        for main_param in main_group:
            if main_param.grad is not None:
                main_grads.append(main_param.grad.data)
    # Reset found inf.
    self.found_inf.fill_(0.0)
    # Unscale and set found inf/nan.
    torch._amp_foreach_non_finite_check_and_unscale_(
        main_grads, self.found_inf, self.grad_scaler.inv_scale)
    # Update across all model-parallel instances.
    torch.distributed.all_reduce(self.found_inf,
                                 op=torch.distributed.ReduceOp.MAX,
                                 group=mpu.get_model_parallel_group())
    # Check for nan.
    found_inf_flag = (self.found_inf.item() > 0)
    return found_inf_flag
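
# A minimal sketch of how the returned flag is typically consumed inside the
# enclosing optimizer's step() (assumed, following the usual mixed-precision
# pattern; grad_scaler.update() is an assumed API of the dynamic grad scaler,
# not shown in this snippet):
#
#     found_inf_flag = self._unscale_main_grads_and_check_for_nan()
#     self.grad_scaler.update(found_inf_flag)  # shrink/grow the loss scale
#     if found_inf_flag:
#         return False  # skip this iteration's parameter update
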
def _unscale_grads_(
        self, optimizer: SGD, inv_scale: torch.Tensor,
        found_inf: torch.Tensor,
        allow_fp16: bool = True) -> Dict[torch.device, torch.Tensor]:
    per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
    per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

    # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
    # There could be thousands of grads, so we'd like to iterate through them just once.
    # However, we don't know their devices or dtypes in advance.
    # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
    # mypy struggles with defaultdict type annotations.
    per_device_and_dtype_grads = defaultdict(
        lambda: defaultdict(list))  # type: ignore[var-annotated]
    with torch.no_grad():
        for group in optimizer.param_groups:
            for param in group["params"]:
                if param.grad is None:
                    continue
                if (not allow_fp16) and param.grad.dtype == torch.float16:
                    raise ValueError(
                        "Attempting to unscale FP16 gradients.")
                if param.grad.is_sparse:
                    # is_coalesced() == False means the sparse grad has values with duplicate indices.
                    # coalesce() deduplicates indices and adds all values that have the same index.
                    # For scaled fp16 values, there's a good chance coalescing will cause overflow,
                    # so we should check the coalesced _values().
                    if param.grad.dtype is torch.float16:
                        # coalesce() is not supported for torch.float16.
                        param_grad_fp32 = param.grad.type(
                            torch.float32).coalesce()
                        param.grad = param_grad_fp32.type(torch.float16)
                    to_unscale = param.grad._values()
                else:
                    to_unscale = param.grad

                per_device_and_dtype_grads[to_unscale.device][
                    to_unscale.dtype].append(to_unscale)

        for device, per_dtype_grads in per_device_and_dtype_grads.items():
            for grads in per_dtype_grads.values():
                if grads[0].device.type == "cpu":
                    self._foreach_non_finite_check_and_unscale_cpu_(
                        grads,
                        per_device_found_inf.get(device),
                        per_device_inv_scale.get(device),
                    )
                else:
                    torch._amp_foreach_non_finite_check_and_unscale_(
                        grads,
                        per_device_found_inf.get(device),
                        per_device_inv_scale.get(device),
                    )

    return per_device_found_inf._per_device_tensors
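
# The CPU fallback _foreach_non_finite_check_and_unscale_cpu_ is referenced
# above but not shown in this snippet. Below is a minimal sketch, assuming it
# mirrors the semantics of torch._amp_foreach_non_finite_check_and_unscale_:
# flag found_inf if any gradient contains inf/nan, and multiply every
# gradient by inv_scale in place.
def _foreach_non_finite_check_and_unscale_cpu_(self, grads, found_inf, inv_scale):
    for grad in grads:
        # Flag non-finite values (inf/nan) in this gradient.
        if not torch.isfinite(grad).all():
            found_inf.fill_(1.0)
        # Unscale in place.
        grad.mul_(inv_scale)
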
def check_overflow_for_grads(grad_data):
    found_inf = torch.cuda.FloatTensor([0.0])
    scaler = torch.cuda.FloatTensor([1.0])
    # Unscale and set found inf/nan.
    torch._amp_foreach_non_finite_check_and_unscale_(grad_data, found_inf,
                                                     scaler)
    # Check for nan.
    overflow = found_inf.item() > 0
    return overflow
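
# Example usage (hypothetical `model`/`optimizer`): test the CUDA gradients
# for inf/nan before stepping. With scaler fixed at 1.0, the call above is a
# pure check and leaves the gradients unchanged.
#
#     grad_data = [p.grad.data for p in model.parameters()
#                  if p.grad is not None]
#     if check_overflow_for_grads(grad_data):
#         optimizer.zero_grad()  # overflow: skip this update
#     else:
#         optimizer.step()
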
def _unscale_main_grads_and_check_for_nan(self):
    main_grads = []
    # fp32 params from float16 ones.
    for main_group in self.fp32_from_float16_groups:
        for main_param in main_group:
            if main_param.grad is not None:
                main_grads.append(main_param.grad.data)
    # Append fp32 parameters.
    for main_group in self.fp32_from_fp32_groups:
        for main_param in main_group:
            if main_param.grad is not None:
                main_grads.append(main_param.grad.data)
    # Reset found inf.
    self.found_inf.fill_(0.0)
    # Unscale and set found inf/nan.
    if hasattr(torch, "_amp_foreach_non_finite_check_and_unscale_"):
        torch._amp_foreach_non_finite_check_and_unscale_(
            main_grads, self.found_inf, self.grad_scaler.inv_scale)
    else:
        # Fall back to apex's fused multi-tensor scale, which reports
        # overflow through _overflow_buf.
        if self.grad_scaler.inv_scale != 1.0:
            grads = [
                main_grad for main_grad in main_grads
                if main_grad is not None
            ]
            _overflow_buf = torch.cuda.IntTensor([0])
            multi_tensor_applier(
                amp_C.multi_tensor_scale,
                _overflow_buf,
                [grads, grads],
                self.grad_scaler.inv_scale,
            )
            self.found_inf[0] = _overflow_buf[0]
    # Update across all model-parallel instances. The smp process group
    # replaces mpu.get_model_parallel_group() here.
    torch.distributed.all_reduce(self.found_inf,
                                 op=torch.distributed.ReduceOp.MAX,
                                 group=smp.get_mp_process_group())
    # Check for nan.
    found_inf_flag = self.found_inf.item() > 0
    return found_inf_flag

def unscale(self, tensors):
    """Unscale the given tensors.

    Returns True if any of the gradients were infinite.
    """
    if not self._enabled:
        return False
    # Invert scale (in higher precision).
    inv_scale = self._scale.double().reciprocal().float()
    # Apply unscaling to tensors, per device.
    tensors_per_device = self._tensors_per_device(tensors)
    for device, device_tensors in tensors_per_device.items():
        found_inf_device = torch.full((1,), 0.0, device=device)
        inv_scale_device = inv_scale.to(device=device)
        torch._amp_foreach_non_finite_check_and_unscale_(
            device_tensors, found_inf_device, inv_scale_device
        )
        self._found_inf += found_inf_device.to(self._found_inf.device)
    return bool(self._found_inf != 0)
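
# Example usage (assumed: `scaler` is an instance of the class hosting
# unscale(), with _enabled/_scale/_found_inf initialized; `model` and
# `optimizer` are hypothetical): unscale after backward() and skip the step
# when any gradient overflowed.
#
#     grads = [p.grad for p in model.parameters() if p.grad is not None]
#     if scaler.unscale(grads):
#         optimizer.zero_grad()  # inf/nan detected: drop this step
#     else:
#         optimizer.step()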