def unscale_method(self, optimizer):
    if not self._enable:
        return
    param_grads = []
    param_grads_fp16 = []
    param_grads_fp32 = []
    if hasattr(optimizer, "update_slice"):
        optimizer.update_slice()
        optimizer.update_scaler = True

    # Collect gradients by dtype from either a param-group optimizer or a flat
    # parameter-list optimizer (the sharded wrapper exposes the inner one as _optim).
    if getattr(optimizer._optim, '_param_groups', None) and isinstance(
            optimizer._optim._param_groups[0], dict):
        for group in optimizer._optim._param_groups:
            for param in group['params']:
                if param.grad is not None:
                    param_grads.append(param.grad)
                    if param.grad.dtype in [
                            core.VarDesc.VarType.FP16, paddle.float16
                    ]:
                        param_grads_fp16.append(param.grad)
                    else:
                        param_grads_fp32.append(param.grad)
    else:
        for param in optimizer._optim._parameter_list:
            if param.grad is not None:
                param_grads.append(param.grad)
                if param.grad.dtype in [
                        core.VarDesc.VarType.FP16, paddle.float16
                ]:
                    param_grads_fp16.append(param.grad)
                else:
                    param_grads_fp32.append(param.grad)

    # Use the builtin bool: np.bool was removed in recent NumPy releases.
    temp_found_inf_fp16 = to_variable(np.array([0]).astype(bool))
    temp_found_inf_fp32 = to_variable(np.array([0]).astype(bool))

    # Run check_finite_and_unscale on the device that actually holds the
    # gradients (CPU when the optimizer offloads them).
    device = "cpu" if optimizer.offload else "gpu"
    dev_id = 0 if device == "cpu" else int(paddle.get_device().split(":")[1])
    with device_guard(dev_id, device):
        if len(param_grads_fp16):
            _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                            param_grads_fp16,
                                            temp_found_inf_fp16)
        if len(param_grads_fp32):
            _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                            param_grads_fp32,
                                            temp_found_inf_fp32)

    self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0

    # Agree on found_inf across the sharding group so every rank skips the step together.
    is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
    paddle.distributed.all_reduce(is_found_inf,
                                  op=paddle.distributed.ReduceOp.MAX,
                                  group=optimizer._group)
    self._found_inf = is_found_inf.numpy()[0]
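# Hedged usage sketch for unscale_method above: in sharding setups this kind of
# override is bound onto a paddle.amp.GradScaler with types.MethodType so that
# scaler.step()/scaler.minimize() drive the sharded unscale + found_inf sync.
# The sharded optimizer wrapper (providing ._optim, .offload and ._group) is an
# assumption here and is represented only by the placeholder name below.
import types
import paddle

scaler = paddle.amp.GradScaler(init_loss_scaling=2.0**15)
scaler._unscale = types.MethodType(unscale_method, scaler)
# Typical training step (placeholders, shown for context only):
#   scaler.scale(loss).backward()
#   scaler.step(sharded_optimizer)
#   scaler.update()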
def _unscale(self, optimizer):
    if not self._enable:
        return
    # Split gradients into plain (data-parallel) ones and model-parallel
    # (distributed) ones, keyed by dtype.
    param_grads_dict = defaultdict(list)
    dist_param_grads_dict = defaultdict(list)
    if getattr(optimizer, '_param_groups', None) and isinstance(
            optimizer._param_groups[0], dict):
        for group in optimizer._param_groups:
            for param in group['params']:
                if not param.is_distributed:
                    if param._grad_ivar() is not None:
                        param_grads_dict[param._grad_ivar().dtype].append(
                            param._grad_ivar())
                else:
                    if param._grad_ivar() is not None:
                        dist_param_grads_dict[
                            param._grad_ivar().dtype].append(
                                param._grad_ivar())
    else:
        for param in optimizer._parameter_list:
            if not param.is_distributed:
                if param._grad_ivar() is not None:
                    param_grads_dict[param._grad_ivar().dtype].append(
                        param._grad_ivar())
            else:
                if param._grad_ivar() is not None:
                    dist_param_grads_dict[param._grad_ivar().dtype].append(
                        param._grad_ivar())

    # Distributed (classifier) gradients are only inspected for inf/nan here;
    # they are not unscaled by check_finite_and_unscale in this method.
    for dtype in dist_param_grads_dict:
        for grad in dist_param_grads_dict[dtype]:
            self._found_inf = paddle.logical_not(
                paddle.all(paddle.isfinite(grad)))
            if self._found_inf:
                print('Found inf or nan in classifier, dtype is', dtype)
                return

    for dtype in param_grads_dict:
        param_grads = param_grads_dict[dtype]
        _C_ops.check_finite_and_unscale(param_grads, self._scale,
                                        param_grads, self._found_inf)
        if self._found_inf:
            print('Found inf or nan in backbone, dtype is', dtype)
            break
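# Context sketch (hedged): the `is_distributed` flag is what routes a gradient
# into dist_param_grads_dict above. Model-parallel layers typically mark their
# shard parameters this way; the class below is an illustrative placeholder,
# not part of the original code.
import paddle

class ShardedClassifier(paddle.nn.Layer):
    def __init__(self, feat_dim, num_classes_per_rank):
        super().__init__()
        self.weight = self.create_parameter(shape=[feat_dim, num_classes_per_rank])
        # Mark the shard so the scaler above only inf-checks its gradient and
        # leaves unscaling/updating to the classifier's own code path.
        self.weight.is_distributed = True

    def forward(self, feat):
        return paddle.matmul(feat, self.weight)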
def _unscale(self, optimizer):
    if not self._enable:
        return
    param_grads = [
        param._grad_ivar() for param in optimizer._parameter_list
        if param._grad_ivar() is not None
    ]
    _C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads,
                                    self._found_inf)
    # allreduce_max found_inf in check_group
    if not self._use_dp_mode:
        self._found_inf = paddle.cast(self._found_inf, dtype="int32")
        # TODO(shenliang03) Since the minimize call in the optimizer is
        # after the gradscaler, check_finite needs to synchronize global
        # information. In the future, we should use check_group
        paddle.distributed.all_reduce(self._found_inf,
                                      op=paddle.distributed.ReduceOp.MAX,
                                      group=None)
        self._found_inf = paddle.cast(self._found_inf, dtype="bool")
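# Context sketch (hedged): this _unscale matches the hybrid-parallel scaler
# pattern, where the wrapper returned by fleet.distributed_scaler knows whether
# the topology is pure data parallelism (`_use_dp_mode`) and therefore whether
# the extra found_inf all_reduce is needed. Placeholder setup only.
import paddle
from paddle.distributed import fleet

fleet.init(is_collective=True)
scaler = paddle.amp.GradScaler(init_loss_scaling=2.0**16)
scaler = fleet.distributed_scaler(scaler)  # wraps the scaler for the hybrid topology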
def unscale_method(self, optimizer):
    if not self._enable:
        return
    if getattr(optimizer, '_param_groups', None) and isinstance(
            optimizer._param_groups[0], dict):
        param_grads_fp16 = []
        param_grads_fp32 = []
        for group in optimizer._param_groups:
            for param in group['params']:
                if param._grad_ivar() is not None:
                    if param._grad_ivar().dtype == core.VarDesc.VarType.FP16:
                        param_grads_fp16.append(param._grad_ivar())
                    else:
                        param_grads_fp32.append(param._grad_ivar())
    else:
        param_grads_fp16 = [
            param._grad_ivar() for param in optimizer._parameter_list
            if (param._grad_ivar() is not None) and (
                param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
        ]
        param_grads_fp32 = [
            param._grad_ivar() for param in optimizer._parameter_list
            if (param._grad_ivar() is not None) and (
                param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
        ]

    # Use the builtin bool: np.bool was removed in recent NumPy releases.
    temp_found_inf_fp16 = to_variable(np.array([0]).astype(bool))
    temp_found_inf_fp32 = to_variable(np.array([0]).astype(bool))
    if len(param_grads_fp16):
        _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                        param_grads_fp16,
                                        temp_found_inf_fp16)
    if len(param_grads_fp32):
        _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                        param_grads_fp32,
                                        temp_found_inf_fp32)
    self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0

    # Agree on found_inf across ranks (assumes `import paddle.distributed as dist`).
    if dist.get_world_size() > 1:
        is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
        paddle.distributed.all_reduce(is_found_inf,
                                      op=paddle.distributed.ReduceOp.MAX,
                                      group=None)
        self._found_inf = is_found_inf.numpy()[0]
def sync_gradient_and_unscale(self, optimizer):
    if self.world_size <= 1 and self.grad_norm_clip is None and not self._enable:
        return

    # data parallel
    param_grads_dict = defaultdict(list)
    # model parallel
    dist_param_grads_dict = defaultdict(list)

    if getattr(optimizer, '_param_groups', None) and isinstance(
            optimizer._param_groups[0], dict):
        for group in optimizer._param_groups:
            for param in group['params']:
                if not param.is_distributed:
                    if param._grad_ivar() is not None:
                        param_grads_dict[param._grad_ivar().dtype].append(
                            param._grad_ivar())
                else:
                    if param._grad_ivar() is not None:
                        dist_param_grads_dict[param._grad_ivar(
                        ).dtype].append(param._grad_ivar())
                    elif getattr(param, 'sparse_grad', None) is not None:
                        grad = getattr(param, 'sparse_grad')
                        dist_param_grads_dict[grad.dtype].append(grad)
    else:
        for param in optimizer._parameter_list:
            if not param.is_distributed:
                if param._grad_ivar() is not None:
                    param_grads_dict[param._grad_ivar().dtype].append(
                        param._grad_ivar())
            else:
                if param._grad_ivar() is not None:
                    dist_param_grads_dict[param._grad_ivar().dtype].append(
                        param._grad_ivar())
                elif getattr(param, 'sparse_grad', None) is not None:
                    grad = getattr(param, 'sparse_grad')
                    dist_param_grads_dict[grad.dtype].append(grad)

    if self._enable:
        # Model-parallel gradients are only checked for inf/nan here.
        for dtype in dist_param_grads_dict:
            for grad in dist_param_grads_dict[dtype]:
                self._found_inf = paddle.logical_not(
                    paddle.all(paddle.isfinite(grad)))
                if self._found_inf:
                    print('Found inf or nan of distributed parameter, dtype is',
                          dtype)
                    return

    grads_fp32 = []
    grads_fp16 = []
    if len(param_grads_dict[paddle.float32]) > 0:
        coalesced_grads_and_vars_fp32 = \
            paddle.fluid.dygraph.parallel.build_groups(
                param_grads_dict[paddle.float32], 128 * 1024 * 1024)
        for coalesced_grad, _, _ in coalesced_grads_and_vars_fp32:
            if self.world_size > 1:
                paddle.distributed.all_reduce(coalesced_grad)
            grads_fp32.append(coalesced_grad)
        if self._enable:
            _C_ops.check_finite_and_unscale(grads_fp32, self._scale,
                                            grads_fp32, self._found_inf)
            if self._found_inf:
                print('Found inf or nan of non distributed parameter, dtype is',
                      paddle.float32)
                return

    if len(param_grads_dict[paddle.float16]) > 0:
        coalesced_grads_and_vars_fp16 = \
            paddle.fluid.dygraph.parallel.build_groups(
                param_grads_dict[paddle.float16], 128 * 1024 * 1024)
        for coalesced_grad, _, _ in coalesced_grads_and_vars_fp16:
            if self.world_size > 1:
                paddle.distributed.all_reduce(coalesced_grad)
            grads_fp16.append(coalesced_grad)
        if self._enable:
            _C_ops.check_finite_and_unscale(grads_fp16, self._scale,
                                            grads_fp16, self._found_inf)
            if self._found_inf:
                print('Found inf or nan of non distributed parameter, dtype is',
                      paddle.float16)
                return

    if self.grad_norm_clip is not None:
        clip_grad_norm_(grads_fp32, grads_fp16, self.grad_norm_clip,
                        self.grad_norm_clip_max)

    # Scatter the fused buffers back into the original per-parameter gradients.
    if len(param_grads_dict[paddle.float16]) > 0:
        paddle.fluid.dygraph.parallel._split_tensors(
            coalesced_grads_and_vars_fp16)
    if len(param_grads_dict[paddle.float32]) > 0:
        paddle.fluid.dygraph.parallel._split_tensors(
            coalesced_grads_and_vars_fp32)
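# The helper clip_grad_norm_ used above is not defined in this snippet. Below is
# a minimal, hedged sketch of a global-norm clip over the fused fp32/fp16
# gradient buffers; the real helper may differ, and reading `clip_norm_max` as a
# hard cap on the clip threshold is an assumption, not the original behaviour.
import paddle

def clip_grad_norm_(grads_fp32, grads_fp16, clip_norm, clip_norm_max=None):
    if clip_norm_max is not None:
        clip_norm = min(clip_norm, clip_norm_max)
    # Global norm across both precisions, accumulated in fp32.
    squares = [paddle.sum(g.astype('float32') ** 2) for g in grads_fp32 + grads_fp16]
    if not squares:
        return
    global_norm = paddle.sqrt(paddle.add_n(squares))
    clip_coef = paddle.clip(clip_norm / (global_norm + 1e-6), max=1.0)
    # Scale the fused buffers in place.
    for g in grads_fp32:
        paddle.assign(g * clip_coef, output=g)
    for g in grads_fp16:
        paddle.assign(g * clip_coef.astype('float16'), output=g)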
def _unscale(self, optimizer): """ Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. Args: optimizer(Optimizer): The optimizer used to update parameters. Returns: The unscaled parameters or original parameters. """ if not self._enable: return optimizer_state = self._optimizer_states[id(optimizer)] if optimizer_state["state"] is OptimizerState.UNSCALED: raise RuntimeError( "unscale_() has already been called on this optimizer since the last update()." ) elif optimizer_state["state"] is OptimizerState.STEPPED: raise RuntimeError("unscale_() is being called after step().") if getattr(optimizer, '_param_groups', None) and isinstance( optimizer._param_groups[0], dict): param_grads = [] param_grads_fp16 = [] param_grads_fp32 = [] for group in optimizer._param_groups: for param in group['params']: if param._grad_ivar() is not None: param_grads.append(param._grad_ivar()) if param._grad_ivar( ).dtype == core.VarDesc.VarType.FP16: param_grads_fp16.append(param._grad_ivar()) else: param_grads_fp32.append(param._grad_ivar()) else: param_grads = [ param._grad_ivar() for param in optimizer._parameter_list if param._grad_ivar() is not None ] param_grads_fp16 = [ param._grad_ivar() for param in optimizer._parameter_list if (param._grad_ivar() is not None) and ( param._grad_ivar().dtype == core.VarDesc.VarType.FP16) ] param_grads_fp32 = [ param._grad_ivar() for param in optimizer._parameter_list if (param._grad_ivar() is not None) and ( param._grad_ivar().dtype == core.VarDesc.VarType.FP32) ] if core.is_compiled_with_npu(): float_status = _C_ops.alloc_float_status() _C_ops.clear_float_status(float_status, float_status) if len(param_grads_fp16): _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, float_status, param_grads_fp16, self._temp_found_inf_fp16) if len(param_grads_fp32): _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, float_status, param_grads_fp32, self._temp_found_inf_fp32) else: if len(param_grads_fp16): _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, param_grads_fp16, self._temp_found_inf_fp16) if len(param_grads_fp32): _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, param_grads_fp32, self._temp_found_inf_fp32) if len(param_grads_fp16) and len(param_grads_fp32): self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32 elif len(param_grads_fp16): self._found_inf = self._temp_found_inf_fp16 else: self._found_inf = self._temp_found_inf_fp32 optimizer_state["state"] = OptimizerState.UNSCALED