Example #1
    def unscale_method(self, optimizer):
        if not self._enable:
            return
        param_grads = []
        param_grads_fp16 = []
        param_grads_fp32 = []
        if hasattr(optimizer, "update_slice"):
            optimizer.update_slice()
            optimizer.update_scaler = True

        if getattr(optimizer._optim, '_param_groups', None) and isinstance(
                optimizer._optim._param_groups[0], dict):

            for group in optimizer._optim._param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param_grads.append(param.grad)
                        if param.grad.dtype in [
                                core.VarDesc.VarType.FP16, paddle.float16
                        ]:
                            param_grads_fp16.append(param.grad)
                        else:
                            param_grads_fp32.append(param.grad)
        else:
            for param in optimizer._optim._parameter_list:
                if param.grad is not None:
                    param_grads.append(param.grad)
                    if param.grad.dtype in [
                            core.VarDesc.VarType.FP16, paddle.float16
                    ]:
                        param_grads_fp16.append(param.grad)
                    else:
                        param_grads_fp32.append(param.grad)

        temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_))
        temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_))

        device = "cpu" if optimizer.offload else "gpu"
        dev_id = 0 if device == "cpu" else int(paddle.get_device().split(":")[
            1])

        with device_guard(dev_id, device):
            if len(param_grads_fp16):
                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                                param_grads_fp16,
                                                temp_found_inf_fp16)
            if len(param_grads_fp32):
                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                                param_grads_fp32,
                                                temp_found_inf_fp32)

        self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
        is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")

        paddle.distributed.all_reduce(
            is_found_inf,
            op=paddle.distributed.ReduceOp.MAX,
            group=optimizer._group)
        self._found_inf = is_found_inf.numpy()[0]
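Example #1 defines unscale_method as a standalone function taking a `self` parameter rather than as a method on the scaler class, so in practice it is usually bound onto an existing scaler instance. A minimal sketch of that binding, assuming a `paddle.amp.GradScaler` named `scaler` and a sharded optimizer wrapper that exposes the `_optim`, `_group`, and `offload` attributes used above:

from types import MethodType

import paddle

scaler = paddle.amp.GradScaler(init_loss_scaling=2.**16)
# Replace the scaler's built-in unscale step with the custom implementation
# above, so sharded/offloaded gradients are unscaled on the right device.
scaler._unscale = MethodType(unscale_method, scaler)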
Example #2
    def _unscale(self, optimizer):
        if not self._enable:
            return

        param_grads_dict = defaultdict(list)
        dist_param_grads_dict = defaultdict(list)
        if getattr(optimizer, '_param_groups', None) and isinstance(
                optimizer._param_groups[0], dict):
            for group in optimizer._param_groups:
                for param in group['params']:
                    if not param.is_distributed:
                        if param._grad_ivar() is not None:
                            param_grads_dict[param._grad_ivar().dtype].append(
                                param._grad_ivar())
                    else:
                        if param._grad_ivar() is not None:
                            dist_param_grads_dict[
                                param._grad_ivar().dtype].append(
                                    param._grad_ivar())
        else:
            for param in optimizer._parameter_list:
                if not param.is_distributed:
                    if param._grad_ivar() is not None:
                        param_grads_dict[param._grad_ivar().dtype].append(
                            param._grad_ivar())
                else:
                    if param._grad_ivar() is not None:
                        dist_param_grads_dict[param._grad_ivar().dtype].append(
                            param._grad_ivar())
        for dtype in dist_param_grads_dict:
            for grad in dist_param_grads_dict[dtype]:
                self._found_inf = paddle.logical_not(
                    paddle.all(paddle.isfinite(grad)))
                if self._found_inf:
                    print('Found inf or nan in classifier, dtype is', dtype)
                    return

        for dtype in param_grads_dict:
            param_grads = param_grads_dict[dtype]
            _C_ops.check_finite_and_unscale(param_grads, self._scale,
                                            param_grads, self._found_inf)
            if self._found_inf:
                print('Found inf or nan in backbone, dtype is', dtype)
                break
Example #3
    def _unscale(self, optimizer):
        if not self._enable:
            return
        param_grads = [
            param._grad_ivar() for param in optimizer._parameter_list
            if param._grad_ivar() is not None
        ]
        _C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads,
                                        self._found_inf)
        # allreduce_max found_inf in check_group
        if not self._use_dp_mode:
            self._found_inf = paddle.cast(self._found_inf, dtype="int32")
            # TODO(shenliang03) Since the minimize call in the optimizer is
            # after the gradscaler, check_finite needs to synchronize global
            # information. In the future, we should use check_group
            paddle.distributed.all_reduce(self._found_inf,
                                          op=paddle.distributed.ReduceOp.MAX,
                                          group=None)
            self._found_inf = paddle.cast(self._found_inf, dtype="bool")
Example #4
def unscale_method(self, optimizer):
    if not self._enable:
        return

    if getattr(optimizer, '_param_groups', None) and isinstance(
            optimizer._param_groups[0], dict):
        param_grads_fp16 = []
        param_grads_fp32 = []
        for group in optimizer._param_groups:
            for param in group['params']:
                if param._grad_ivar() is not None:
                    if param._grad_ivar().dtype == core.VarDesc.VarType.FP16:
                        param_grads_fp16.append(param._grad_ivar())
                    else:
                        param_grads_fp32.append(param._grad_ivar())
    else:
        param_grads_fp16 = [
            param._grad_ivar() for param in optimizer._parameter_list
            if (param._grad_ivar() is not None) and (
                param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
        ]
        param_grads_fp32 = [
            param._grad_ivar() for param in optimizer._parameter_list
            if (param._grad_ivar() is not None) and (
                param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
        ]
    temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_))
    temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_))

    if len(param_grads_fp16):
        _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                        param_grads_fp16, temp_found_inf_fp16)
    if len(param_grads_fp32):
        _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                        param_grads_fp32, temp_found_inf_fp32)
    self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0

    if dist.get_world_size() > 1:
        is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
        paddle.distributed.all_reduce(is_found_inf,
                                      op=paddle.distributed.ReduceOp.MAX,
                                      group=None)
        self._found_inf = is_found_inf.numpy()[0]
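Example #4 only performs the collective reduction when dist.get_world_size() is greater than one, so it assumes the default process group has already been initialized. A short sketch of that setup, assuming the standard paddle.distributed launch flow:

import paddle.distributed as dist

# Initialize the default communication group before using any scaler that
# calls paddle.distributed.all_reduce(..., group=None).
dist.init_parallel_env()
print("world size:", dist.get_world_size())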
Example #5
    def sync_gradient_and_unscale(self, optimizer):
        if self.world_size <= 1 and self.grad_norm_clip is None and not self._enable:
            return

        # data parallel
        param_grads_dict = defaultdict(list)
        # model parallel
        dist_param_grads_dict = defaultdict(list)

        if getattr(optimizer, '_param_groups', None) and isinstance(
                optimizer._param_groups[0], dict):
            for group in optimizer._param_groups:
                for param in group['params']:
                    if not param.is_distributed:
                        if param._grad_ivar() is not None:
                            param_grads_dict[param._grad_ivar().dtype].append(
                                param._grad_ivar())
                    else:
                        if param._grad_ivar() is not None:
                            dist_param_grads_dict[param._grad_ivar(
                            ).dtype].append(param._grad_ivar())
                        elif getattr(param, 'sparse_grad', None) is not None:
                            grad = getattr(param, 'sparse_grad')
                            dist_param_grads_dict[grad.dtype].append(grad)
        else:
            for param in optimizer._parameter_list:
                if not param.is_distributed:
                    if param._grad_ivar() is not None:
                        param_grads_dict[param._grad_ivar().dtype].append(
                            param._grad_ivar())
                else:
                    if param._grad_ivar() is not None:
                        dist_param_grads_dict[param._grad_ivar().dtype].append(
                            param._grad_ivar())
                    elif getattr(param, 'sparse_grad', None) is not None:
                        grad = getattr(param, 'sparse_grad')
                        dist_param_grads_dict[grad.dtype].append(grad)

        if self._enable:
            for dtype in dist_param_grads_dict:
                for grad in dist_param_grads_dict[dtype]:
                    self._found_inf = paddle.logical_not(
                        paddle.all(paddle.isfinite(grad)))
                    if self._found_inf:
                        print(
                            'Found inf or nan of distributed parameter, dtype is',
                            dtype)
                        return

        grads_fp32 = []
        grads_fp16 = []
        if len(param_grads_dict[paddle.float32]) > 0:
            coalesced_grads_and_vars_fp32 = \
                paddle.fluid.dygraph.parallel.build_groups(param_grads_dict[paddle.float32], 128 * 1024 * 1024)
            for coalesced_grad, _, _ in coalesced_grads_and_vars_fp32:
                if self.world_size > 1:
                    paddle.distributed.all_reduce(coalesced_grad)
                grads_fp32.append(coalesced_grad)

            if self._enable:
                _C_ops.check_finite_and_unscale(grads_fp32, self._scale,
                                                grads_fp32, self._found_inf)
                if self._found_inf:
                    print(
                        'Found inf or nan of non distributed parameter, dtype is',
                        paddle.float32)
                    return

        if len(param_grads_dict[paddle.float16]) > 0:
            coalesced_grads_and_vars_fp16 = \
                paddle.fluid.dygraph.parallel.build_groups(param_grads_dict[paddle.float16], 128 * 1024 * 1024)
            for coalesced_grad, _, _ in coalesced_grads_and_vars_fp16:
                if self.world_size > 1:
                    paddle.distributed.all_reduce(coalesced_grad)
                grads_fp16.append(coalesced_grad)

            if self._enable:
                _C_ops.check_finite_and_unscale(grads_fp16, self._scale,
                                                grads_fp16, self._found_inf)
                if self._found_inf:
                    print(
                        'Found inf or nan of non distributed parameter, dtype is',
                        paddle.float16)
                    return

        if self.grad_norm_clip is not None:
            clip_grad_norm_(grads_fp32, grads_fp16, self.grad_norm_clip,
                            self.grad_norm_clip_max)

        if len(param_grads_dict[paddle.float16]) > 0:
            paddle.fluid.dygraph.parallel._split_tensors(
                coalesced_grads_and_vars_fp16)
        if len(param_grads_dict[paddle.float32]) > 0:
            paddle.fluid.dygraph.parallel._split_tensors(
                coalesced_grads_and_vars_fp32)
Example #6
    def _unscale(self, optimizer):
        """
        Unscale the gradients of parameters, i.e. multiply the gradients of parameters by 1/(loss scaling ratio).
        If this instance of :class:`GradScaler` is not enabled, the outputs are returned unmodified.
        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
        Returns:
            The unscaled parameters or original parameters.
        """
        if not self._enable:
            return

        optimizer_state = self._optimizer_states[id(optimizer)]

        if optimizer_state["state"] is OptimizerState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["state"] is OptimizerState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        if getattr(optimizer, '_param_groups', None) and isinstance(
                optimizer._param_groups[0], dict):
            param_grads = []
            param_grads_fp16 = []
            param_grads_fp32 = []
            for group in optimizer._param_groups:
                for param in group['params']:
                    if param._grad_ivar() is not None:
                        param_grads.append(param._grad_ivar())
                        if param._grad_ivar(
                        ).dtype == core.VarDesc.VarType.FP16:
                            param_grads_fp16.append(param._grad_ivar())
                        else:
                            param_grads_fp32.append(param._grad_ivar())
        else:
            param_grads = [
                param._grad_ivar() for param in optimizer._parameter_list
                if param._grad_ivar() is not None
            ]
            param_grads_fp16 = [
                param._grad_ivar() for param in optimizer._parameter_list
                if (param._grad_ivar() is not None) and (
                    param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
            ]
            param_grads_fp32 = [
                param._grad_ivar() for param in optimizer._parameter_list
                if (param._grad_ivar() is not None) and (
                    param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
            ]
        if core.is_compiled_with_npu():
            float_status = _C_ops.alloc_float_status()
            _C_ops.clear_float_status(float_status, float_status)

            if len(param_grads_fp16):
                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                                float_status, param_grads_fp16,
                                                self._temp_found_inf_fp16)
            if len(param_grads_fp32):
                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                                float_status, param_grads_fp32,
                                                self._temp_found_inf_fp32)
        else:
            if len(param_grads_fp16):
                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                                param_grads_fp16,
                                                self._temp_found_inf_fp16)
            if len(param_grads_fp32):
                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                                param_grads_fp32,
                                                self._temp_found_inf_fp32)

        if len(param_grads_fp16) and len(param_grads_fp32):
            self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32
        elif len(param_grads_fp16):
            self._found_inf = self._temp_found_inf_fp16
        else:
            self._found_inf = self._temp_found_inf_fp32

        optimizer_state["state"] = OptimizerState.UNSCALED
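The docstring in Example #6 summarizes where _unscale sits in the AMP workflow: the loss, and therefore every gradient, is multiplied by the loss-scaling ratio during backward, and _unscale divides the gradients back before the optimizer step while recording whether any inf/nan was found. A sketch of that end-to-end flow using the public paddle.amp.GradScaler API (the model, data, and optimizer below are placeholders):

import paddle

model = paddle.nn.Linear(10, 1)                     # placeholder model
optimizer = paddle.optimizer.SGD(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

data = paddle.rand([4, 10])
with paddle.amp.auto_cast():
    loss = model(data).mean()

scaled = scaler.scale(loss)         # multiply the loss by the current scale
scaled.backward()                   # gradients are produced in scaled form
scaler.minimize(optimizer, scaled)  # unscales, checks for inf/nan, steps, updates the scale
optimizer.clear_grad()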