Example #1
    def _unscale_main_grads_and_check_for_nan(self):
        main_grads = []
        # fp32 params from float16 ones.
        for main_group in self.fp32_from_float16_groups:
            for main_param in main_group:
                if main_param.grad is not None:
                    main_grads.append(main_param.grad.data)
        # Append fp32 parameters.
        for main_group in self.fp32_from_fp32_groups:
            for main_param in main_group:
                if main_param.grad is not None:
                    main_grads.append(main_param.grad.data)
        # Reset found inf.
        self.found_inf.fill_(0.0)
        # Unscale and set found inf/nan
        torch._amp_foreach_non_finite_check_and_unscale_(
            main_grads, self.found_inf, self.grad_scaler.inv_scale)
        # Update across all model parallel instances.
        torch.distributed.all_reduce(self.found_inf,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=mpu.get_model_parallel_group())

        # Check for nan.
        found_inf_flag = (self.found_inf.item() > 0)
        return found_inf_flag
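For reference, the core primitive shared by all of these examples, torch._amp_foreach_non_finite_check_and_unscale_, multiplies each gradient tensor in place by inv_scale and sets found_inf to 1.0 if any element is inf or NaN. A minimal self-contained sketch (my own illustration with made-up values, assuming a CUDA device as in the example above):

import torch

def demo_unscale():
    # Made-up gradients; the second tensor deliberately contains an inf.
    grads = [torch.tensor([2.0, 4.0], device="cuda"),
             torch.tensor([float("inf")], device="cuda")]
    found_inf = torch.zeros(1, device="cuda")
    inv_scale = torch.full((1,), 0.5, device="cuda")
    # In place: grads[0] becomes [1.0, 2.0]; found_inf becomes 1.0 because of the inf.
    torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale)
    return found_inf.item() > 0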
Example #2
    def _unscale_grads_(
            self,
            optimizer: SGD,
            inv_scale: torch.Tensor,
            found_inf: torch.Tensor,
            allow_fp16: bool = True) -> Dict[torch.device, torch.Tensor]:
        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
        # There could be thousands of grads, so we'd like to iterate through them just once.
        # However, we don't know their devices or dtypes in advance.

        # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
        # Google says mypy struggles with defaultdicts type annotations.
        per_device_and_dtype_grads = defaultdict(
            lambda: defaultdict(list))  # type: ignore[var-annotated]
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError(
                            "Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # is_coalesced() == False means the sparse grad has values with duplicate indices.
                        # coalesce() deduplicates indices and adds all values that have the same index.
                        # For scaled fp16 values, there's a good chance coalescing will cause overflow,
                        # so we should check the coalesced _values().
                        if param.grad.dtype is torch.float16:
                            # coalesce() is not supported for torch.float16
                            param_grad_fp32 = param.grad.type(
                                torch.float32).coalesce()
                            param.grad = param_grad_fp32.type(torch.float16)
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    if grads[0].device.type == "cpu":
                        self._foreach_non_finite_check_and_unscale_cpu_(
                            grads,
                            per_device_found_inf.get(device),
                            per_device_inv_scale.get(device),
                        )
                    else:
                        torch._amp_foreach_non_finite_check_and_unscale_(
                            grads,
                            per_device_found_inf.get(device),
                            per_device_inv_scale.get(device),
                        )
        return per_device_found_inf._per_device_tensors
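The CPU branch above calls self._foreach_non_finite_check_and_unscale_cpu_, a helper that is not shown in this snippet. A plausible standalone sketch of such a fallback (my own illustration, not the project's actual implementation) reproduces the semantics of the fused op one tensor at a time:

import torch

def foreach_non_finite_check_and_unscale_cpu_(grads, found_inf, inv_scale):
    # Pure-Python fallback: flag non-finite values, then unscale in place.
    for grad in grads:
        if torch.isinf(grad).any() or torch.isnan(grad).any():
            found_inf.fill_(1.0)
        grad.mul_(inv_scale)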
Example #3
def check_overflow_for_grads(grad_data):
    found_inf = torch.cuda.FloatTensor([0.0])
    scaler = torch.cuda.FloatTensor([1.0])  # inv_scale of 1.0: check for non-finite values without rescaling
    # Unscale and set found inf/nan
    torch._amp_foreach_non_finite_check_and_unscale_(grad_data, found_inf, scaler)

    # Check for nan.
    overflow = found_inf.item() > 0
    return overflow
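Because the scale passed to the op is 1.0, this helper only checks the gradients for inf/NaN without actually rescaling them. A hypothetical call site (model, optimizer, and loss are placeholders, not part of the example):

loss.backward()
grads = [p.grad.data for p in model.parameters() if p.grad is not None]
if check_overflow_for_grads(grads):
    optimizer.zero_grad()  # overflow detected: drop this step
else:
    optimizer.step()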
Example #4
    def _unscale_main_grads_and_check_for_nan(self):
        main_grads = []
        # fp32 params from float16 ones.
        for main_group in self.fp32_from_float16_groups:
            for main_param in main_group:
                if main_param.grad is not None:
                    main_grads.append(main_param.grad.data)
        # Append fp32 parameters.
        for main_group in self.fp32_from_fp32_groups:
            for main_param in main_group:
                if main_param.grad is not None:
                    main_grads.append(main_param.grad.data)
        # Reset found inf.
        self.found_inf.fill_(0.0)
        # Unscale and set found inf/nan
        if hasattr(torch, "_amp_foreach_non_finite_check_and_unscale_"):
            torch._amp_foreach_non_finite_check_and_unscale_(
                main_grads, self.found_inf, self.grad_scaler.inv_scale)
        else:
            if self.grad_scaler.inv_scale != 1.0:
                grads = [
                    main_grad for main_grad in main_grads
                    if main_grad is not None
                ]
                _overflow_buf = torch.cuda.IntTensor([0])
                multi_tensor_applier(
                    amp_C.multi_tensor_scale,
                    _overflow_buf,
                    [grads, grads],
                    self.grad_scaler.inv_scale,
                )
                self.found_inf[0] = _overflow_buf[0]

        # Update across all model parallel instances.
        """
        torch.distributed.all_reduce(self.found_inf,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=mpu.get_model_parallel_group())
        """
        torch.distributed.all_reduce(self.found_inf,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=smp.get_mp_process_group())

        # Check for nan.
        found_inf_flag = self.found_inf.item() > 0
        return found_inf_flag

    def unscale(self, tensors):
        """Unscale the given tensors. Returns True if any of the gradients were infinite."""
        if not self._enabled:
            return False

        # Invert scale (in higher precision).
        inv_scale = self._scale.double().reciprocal().float()

        # Apply unscaling to tensors, per device.
        tensors_per_device = self._tensors_per_device(tensors)
        for device, device_tensors in tensors_per_device.items():
            found_inf_device = torch.full((1,), 0.0, device=device)
            inv_scale_device = inv_scale.to(device=device)

            torch._amp_foreach_non_finite_check_and_unscale_(
                device_tensors, found_inf_device, inv_scale_device
            )

            self._found_inf += found_inf_device.to(self._found_inf.device)

        return bool(self._found_inf != 0)
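A hypothetical way this unscale method might be driven from a training step; scaler, model, optimizer, and loss are placeholders, and the example assumes the factor applied to the loss matches the scaler's internal _scale:

scale = 2.0 ** 16                 # example loss scale, assumed equal to scaler._scale
(loss * scale).backward()         # scaled backward pass
grads = [p.grad for p in model.parameters() if p.grad is not None]
if scaler.unscale(grads):         # unscales in place, returns True on inf/nan
    optimizer.zero_grad()         # overflow: skip the update
else:
    optimizer.step()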