def __init__(self, logits, name=None):
    """
    Args:
        logits(list|tuple|numpy.ndarray|Tensor): The logits input of
            categorical distribution. The data type is float32 or float64.
        name(str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.
    """
    if not _non_static_mode():
        check_type(logits, 'logits',
                   (np.ndarray, tensor.Variable, list, tuple), 'Categorical')

    self.name = name if name is not None else 'Categorical'
    self.dtype = 'float32'

    if self._validate_args(logits):
        self.logits = logits
        self.dtype = convert_dtype(logits.dtype)
    else:
        if isinstance(logits, np.ndarray) and str(
                logits.dtype) in ['float32', 'float64']:
            self.dtype = logits.dtype
        self.logits = self._to_tensor(logits)[0]
        if self.dtype != convert_dtype(self.logits.dtype):
            self.logits = tensor.cast(self.logits, dtype=self.dtype)
    dist_sum = paddle.sum(self.logits, axis=-1, keepdim=True)
    self._prob = self.logits / dist_sum
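# A minimal usage sketch (an assumption for illustration, not taken from the
# source): constructing a Categorical from unnormalized float32 logits via the
# public paddle.distribution API and drawing a few samples.
import paddle

logits = paddle.to_tensor([0.1, 0.4, 0.5], dtype='float32')
cat = paddle.distribution.Categorical(logits)
samples = cat.sample([4])   # 4 draws over the 3 categories
entropy = cat.entropy()     # entropy of the normalized distribution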
def update_model_kwargs_for_generation(outputs, model_kwargs): """ Update the model inputs during generation. Note that If `token_type_ids` and `attention_mask` in `model_kwargs` and they contain pad value, the result vectors updated by this method may be different from expected. In this case, you need to rewrite the method. """ # update cache if isinstance(outputs, tuple): model_kwargs["cache"] = outputs[1] # update token_type_ids with last value if "token_type_ids" in model_kwargs: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.concat( [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], axis=-1) # update position_ids if "position_ids" in model_kwargs: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.concat([ position_ids, paddle.max(position_ids, axis=-1, keepdim=True) + 1 ], axis=-1) # update attention_mask if "attention_mask" in model_kwargs: attention_mask = model_kwargs["attention_mask"] # nn.Pad2D don't support the data type `bool` if convert_dtype(attention_mask.dtype) == 'bool': attention_mask = paddle.cast(attention_mask, 'int64') attention_mask = nn.Pad2D([0, 0, 0, 1], mode='replicate')(attention_mask) attention_mask = nn.Pad2D([0, 1, 0, 0], value=-1e9)(attention_mask) dtype = convert_dtype(attention_mask.dtype) if 'int' in dtype: attention_mask[:, :, -1, -1] = 1 elif 'float' in dtype: attention_mask[:, :, -1, -1] = 0.0 else: raise ValueError( 'The data type of input `attention_mask` must ' 'be bool, int or float') model_kwargs["attention_mask"] = attention_mask return model_kwargs
def __check_out__(self, out):
    data_type = convert_dtype(out.dtype)
    self.assertEqual(
        data_type, self.dst_dtype,
        'dtype should be %s, but get %s' % (self.dst_dtype, data_type))

    shape = out.shape
    self.assertTupleEqual(
        shape, self.dst_shape,
        'shape should be %s, but get %s' % (self.dst_shape, shape))

    if data_type in ['float32', 'float64', 'int32', 'int64']:
        max_value = np.nanmax(out)
        min_value = np.nanmin(out)
        always_non_full_zero = max_value > min_value
        always_full_zero = max_value == 0.0 and min_value == 0.0
        self.assertTrue(always_full_zero or always_non_full_zero,
                        'always_full_zero or always_non_full_zero.')
    elif data_type in ['bool']:
        total_num = out.size
        true_num = np.sum(out == True)
        false_num = np.sum(out == False)
        self.assertTrue(total_num == true_num + false_num,
                        'The value should always be True or False.')
    else:
        self.assertTrue(False, 'invalid data type')
def update_model_kwargs_for_generation(outputs, model_kwargs):
    # update cache
    if isinstance(outputs, tuple):
        model_kwargs["cache"] = outputs[1]

    # update token_type_ids with last value
    if "token_type_ids" in model_kwargs:
        token_type_ids = model_kwargs["token_type_ids"]
        model_kwargs["token_type_ids"] = paddle.concat(
            [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], axis=-1)

    # update position_ids
    if "position_ids" in model_kwargs:
        position_ids = model_kwargs["position_ids"]
        model_kwargs["position_ids"] = paddle.concat(
            [position_ids, position_ids[:, -1].unsqueeze(-1) + 1], axis=-1)

    # update attention_mask
    if "attention_mask" in model_kwargs:
        attention_mask = model_kwargs["attention_mask"]
        # TODO
        attention_mask = nn.Pad2D(
            [0, 0, 0, 1], mode='replicate')(attention_mask)
        attention_mask = nn.Pad2D(
            [0, 1, 0, 0], value=-1e9)(attention_mask)
        dtype = convert_dtype(attention_mask.dtype)
        if dtype == 'bool':
            attention_mask[:, :, -1, -1] = True
        elif 'int' in dtype:
            attention_mask[:, :, -1, -1] = 1
        else:
            attention_mask[:, :, -1, -1] = 0.0
        model_kwargs["attention_mask"] = attention_mask

    return model_kwargs
def to_string(var, prefix='Tensor'):
    indent = len(prefix) + 1
    _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})"

    tensor = var.value().get_tensor()
    if not tensor._is_initialized():
        return "Tensor(Not initialized)"

    np_var = var.numpy()

    if len(var.shape) == 0:
        size = 0
    else:
        size = 1
        for dim in var.shape:
            size *= dim

    summary = False
    if size > DEFAULT_PRINT_OPTIONS.threshold:
        summary = True

    max_width, signed = _get_max_width(_to_summary(np_var))

    data = _format_tensor(
        np_var, summary, indent=indent, max_width=max_width, signed=signed)

    return _template.format(
        prefix=prefix,
        shape=var.shape,
        dtype=convert_dtype(var.dtype),
        place=var._place_str,
        stop_gradient=var.stop_gradient,
        indent=' ' * indent,
        data=data)
def _check_values_dtype_in_probs(self, param, value):
    """
    The ``log_prob`` and ``probs`` methods take an input ``value``; if the
    dtype of ``value`` differs from that of ``param``, convert ``value`` so
    that its dtype is consistent with ``param``'s dtype.

    Args:
        param (Tensor): low and high in Uniform class, loc and scale in Normal class.
        value (Tensor): The input tensor.

    Returns:
        value (Tensor): Change value's dtype if value's dtype is different from param.
    """
    if _non_static_mode():
        if value.dtype != param.dtype and convert_dtype(
                value.dtype) in ['float32', 'float64']:
            warnings.warn(
                "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."
            )
            return _C_ops.cast(value, 'in_dtype', value.dtype, 'out_dtype',
                               param.dtype)
        return value

    check_variable_and_dtype(value, 'value', ['float32', 'float64'],
                             'log_prob')
    if value.dtype != param.dtype:
        warnings.warn(
            "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."
        )
        return tensor.cast(value, dtype=param.dtype)
    return value
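# Hedged illustration of the dtype check above: calling log_prob with a value
# whose dtype differs from the distribution parameters triggers the warning
# and an implicit cast (paddle.distribution.Normal is assumed here).
import paddle

normal = paddle.distribution.Normal(loc=0.0, scale=1.0)   # float32 parameters
value = paddle.to_tensor([0.5], dtype='float64')          # mismatched dtype
log_p = normal.log_prob(value)                            # value is cast to float32 with a warning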
def _convert_attention_mask(attn_mask, dtype):
    if attn_mask is not None and attn_mask.dtype != dtype:
        attn_mask_dtype = convert_dtype(attn_mask.dtype)
        if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype:
            attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9
        else:
            attn_mask = paddle.cast(attn_mask, dtype)
    return attn_mask
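# A small sketch of the conversion performed by _convert_attention_mask: a
# boolean (or integer) mask is turned into an additive float mask, so allowed
# positions contribute 0.0 and masked positions contribute -1e9 to the scores.
import paddle

bool_mask = paddle.to_tensor([[True, True, False]])
additive = (paddle.cast(bool_mask, 'float32') - 1.0) * 1e9
# additive is approximately [[0., 0., -1e9]] and can be added to attention logits.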
def __init__(self, loc, scale, name=None):
    if not _non_static_mode():
        check_type(loc, 'loc',
                   (int, float, np.ndarray, tensor.Variable, list, tuple),
                   'Normal')
        check_type(scale, 'scale',
                   (int, float, np.ndarray, tensor.Variable, list, tuple),
                   'Normal')

    self.batch_size_unknown = False
    self.all_arg_is_float = False
    self.name = name if name is not None else 'Normal'
    self.dtype = 'float32'

    if isinstance(loc, int):
        loc = float(loc)
    if isinstance(scale, int):
        scale = float(scale)

    if self._validate_args(loc, scale):
        self.batch_size_unknown = True
        self.loc = loc
        self.scale = scale
        self.dtype = convert_dtype(loc.dtype)
    else:
        if isinstance(loc, float) and isinstance(scale, float):
            self.all_arg_is_float = True
        if isinstance(loc, np.ndarray) and str(
                loc.dtype) in ['float32', 'float64']:
            self.dtype = loc.dtype
        elif isinstance(scale, np.ndarray) and str(
                scale.dtype) in ['float32', 'float64']:
            self.dtype = scale.dtype
        # pylint: disable=unbalanced-tuple-unpacking
        self.loc, self.scale = self._to_tensor(loc, scale)
        if self.dtype != convert_dtype(self.loc.dtype):
            self.loc = tensor.cast(self.loc, dtype=self.dtype)
            self.scale = tensor.cast(self.scale, dtype=self.dtype)
    super(Normal, self).__init__(self.loc.shape)
def __init__(self, low, high, name=None):
    if not _non_static_mode():
        check_type(low, 'low',
                   (int, float, np.ndarray, tensor.Variable, list, tuple),
                   'Uniform')
        check_type(high, 'high',
                   (int, float, np.ndarray, tensor.Variable, list, tuple),
                   'Uniform')

    self.all_arg_is_float = False
    self.batch_size_unknown = False
    self.name = name if name is not None else 'Uniform'
    self.dtype = 'float32'

    if isinstance(low, int):
        low = float(low)
    if isinstance(high, int):
        high = float(high)

    if self._validate_args(low, high):
        self.batch_size_unknown = True
        self.low = low
        self.high = high
        self.dtype = convert_dtype(low.dtype)
    else:
        if isinstance(low, float) and isinstance(high, float):
            self.all_arg_is_float = True
        if isinstance(low, np.ndarray) and str(
                low.dtype) in ['float32', 'float64']:
            self.dtype = low.dtype
        elif isinstance(high, np.ndarray) and str(
                high.dtype) in ['float32', 'float64']:
            self.dtype = high.dtype
        # pylint: disable=unbalanced-tuple-unpacking
        self.low, self.high = self._to_tensor(low, high)
        if self.dtype != convert_dtype(self.low.dtype):
            self.low = tensor.cast(self.low, dtype=self.dtype)
            self.high = tensor.cast(self.high, dtype=self.dtype)
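# A brief usage sketch (assumed, not from the source): integer bounds are
# promoted to float and the dtype defaults to float32, as the __init__ above
# implements.
import paddle

uniform = paddle.distribution.Uniform(low=0, high=10)   # ints become 0.0 and 10.0
draws = uniform.sample([3])                             # float32 samples from [0, 10)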
def convert_var_dtype(var, dtype):
    if isinstance(var, Variable):
        src_dtype = convert_dtype(var.dtype)
        assert src_dtype in [
            'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'uint8'
        ], "The dtype of var {} is {}, which is not supported in the cast op.".format(
            var.name, src_dtype)
        assert dtype in [
            'bool', 'int', 'float'
        ], "The casted target dtype is {}, which is not supported in type casting.".format(
            dtype)
        cast_map = {
            'bool': 'bool',
            'int': 'int32',
            'float': 'float32',
        }
        return cast(var, dtype=cast_map[dtype])
    else:
        return eval('{}(var)'.format(dtype))
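# The non-Variable branch of convert_var_dtype falls back to the Python
# builtin constructors, which behave as below; the Variable branch instead
# emits a cast op using cast_map ('int' -> 'int32', 'float' -> 'float32',
# 'bool' -> 'bool').
assert int(3.7) == 3        # what the non-Variable branch does for dtype='int'
assert bool(0.0) is False   # what the non-Variable branch does for dtype='bool'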
def tensor_to_string(tensor, prefix='Tensor'):
    indent = len(prefix) + 1

    dtype = convert_dtype(tensor.dtype)
    if tensor.dtype == core.VarDesc.VarType.BF16:
        dtype = 'bfloat16'

    _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})"

    if tensor.is_sparse():
        return sparse_tensor_to_string(tensor, prefix)

    if not tensor._is_dense_tensor_hold_allocation():
        return "Tensor(Not initialized)"
    else:
        data = _format_dense_tensor(tensor, indent)
        return _template.format(
            prefix=prefix,
            shape=tensor.shape,
            dtype=dtype,
            place=tensor._place_str,
            stop_gradient=tensor.stop_gradient,
            indent=' ' * indent,
            data=data)
def test_convert_dtype(self):
    self.assertEqual(
        convert_dtype(core.VarDesc.VarType.COMPLEX64), "complex64")
    self.assertEqual(
        convert_dtype(core.VarDesc.VarType.COMPLEX128), "complex128")
def cast_bool_if_necessary(var):
    assert isinstance(var, Variable)
    if convert_dtype(var.dtype) not in ['bool']:
        var = cast(var, dtype="bool")
    return var
def _handle_dtype(data, dtype):
    if dtype:
        if convert_dtype(dtype) != convert_dtype(data.dtype):
            return data.astype(convert_dtype(dtype))
    return data
def load(self,
         backbone: paddle.nn.Layer,
         classifier: paddle.nn.Layer = None,
         optimizer=None,
         for_train=True,
         dtype=None):
    assert os.path.exists(self.checkpoint_dir)
    checkpoint_dir = os.path.abspath(self.checkpoint_dir)

    type_dict = {}
    for name, param in backbone.state_dict().items():
        type_dict[param.name] = convert_dtype(param.dtype)

    if classifier is not None:
        # for dist params, we need to save them at all ranks.
        for name, param in classifier.state_dict().items():
            type_dict[param.name] = convert_dtype(param.dtype)

    if for_train:
        assert optimizer is not None
        opt_state_dict = optimizer.state_dict()
        lr_state_dict = opt_state_dict['LR_Scheduler']
        for name, opt in opt_state_dict.items():
            if name == 'LR_Scheduler' or '@GRAD' in name:
                continue
            type_dict[name] = convert_dtype(opt.dtype)

    param_state_dict = {}
    opt_state_dict = {}
    dist_param_state_dict = {}
    dist_weight_state_dict = {}
    dist_weight_velocity_state_dict = {}
    dist_bias_state_dict = {}
    dist_bias_velocity_state_dict = {}

    for path in os.listdir(checkpoint_dir):
        path = os.path.join(checkpoint_dir, path)
        if not os.path.isfile(path):
            continue

        basename = os.path.basename(path)
        name, ext = os.path.splitext(basename)

        if ext not in ['.pdopt', '.pdparam']:
            continue

        if not for_train and ext == '.pdopt':
            continue

        if classifier is None and 'dist@' in name and '@rank@' in name:
            continue

        tensor = paddle.load(path, return_numpy=True)

        if dtype:
            assert dtype in ['float32', 'float16']
            tensor = tensor.astype(dtype)
        else:
            tensor = tensor.astype(type_dict[name])

        if 'dist@' in name and '@rank@' in name:
            if '.w' in name and 'velocity' not in name:
                dist_weight_state_dict[name] = tensor
            elif '.w' in name and 'velocity' in name:
                dist_weight_velocity_state_dict[name] = tensor
            elif '.b' in name and 'velocity' not in name:
                dist_bias_state_dict[name] = tensor
            elif '.b' in name and 'velocity' in name:
                dist_bias_velocity_state_dict[name] = tensor
        else:
            if ext == '.pdparam':
                param_state_dict[name] = tensor
            else:
                opt_state_dict[name] = tensor

    if classifier is not None and for_train:
        meta_file = os.path.join(checkpoint_dir, 'meta.json')
        if not os.path.exists(meta_file):
            logging.error(
                "Please make sure the checkpoint dir {} exists, and "
                "the parameters in that dir are valid.".format(
                    checkpoint_dir))
            exit()

        with open(meta_file, 'r') as handle:
            extra_info = json.load(handle)

        # Preprocess distributed parameters.
        pretrain_world_size = extra_info['pretrain_world_size']
        assert pretrain_world_size > 0
        embedding_size = extra_info['embedding_size']
        assert embedding_size == self.embedding_size
        num_classes = extra_info['num_classes']
        assert num_classes == self.num_classes
        logging.info(
            "Parameters for pre-training: pretrain_world_size ({}), "
            "embedding_size ({}), and num_classes ({}).".format(
                pretrain_world_size, embedding_size, num_classes))
        logging.info("Parameters for inference or fine-tuning: "
                     "world_size ({}).".format(self.world_size))

        rank_str = '%05d' % self.rank
        dist_weight_state_dict = rearrange_weight(
            dist_weight_state_dict, pretrain_world_size, self.world_size)
        dist_bias_state_dict = rearrange_weight(
            dist_bias_state_dict, pretrain_world_size, self.world_size)
        for name, value in dist_weight_state_dict.items():
            if rank_str in name:
                dist_param_state_dict[name] = value
        for name, value in dist_bias_state_dict.items():
            if rank_str in name:
                dist_param_state_dict[name] = value

        if for_train:
            dist_weight_velocity_state_dict = rearrange_weight(
                dist_weight_velocity_state_dict, pretrain_world_size,
                self.world_size)
            dist_bias_velocity_state_dict = rearrange_weight(
                dist_bias_velocity_state_dict, pretrain_world_size,
                self.world_size)
            for name, value in dist_weight_velocity_state_dict.items():
                if rank_str in name:
                    opt_state_dict[name] = value
            for name, value in dist_bias_velocity_state_dict.items():
                if rank_str in name:
                    opt_state_dict[name] = value

    def map_actual_param_name(state_dict, load_state_dict):
        for name, param in state_dict.items():
            state_dict[name] = load_state_dict[param.name]
        return state_dict

    logging.info("Load checkpoint from '{}'. ".format(checkpoint_dir))
    param_state_dict = map_actual_param_name(backbone.state_dict(),
                                             param_state_dict)
    backbone.set_state_dict(param_state_dict)

    if classifier is not None:
        dist_param_state_dict = map_actual_param_name(
            classifier.state_dict(), dist_param_state_dict)
        classifier.set_state_dict(dist_param_state_dict)

    if for_train:
        assert optimizer is not None
        optimizer.set_state_dict(opt_state_dict)

    if classifier is not None and for_train:
        return extra_info
    else:
        return {}
def load(self, program, for_train=True, dtype=None):
    assert os.path.exists(self.checkpoint_dir)
    checkpoint_dir = os.path.abspath(self.checkpoint_dir)

    param_state_dict = program.state_dict(mode='param')
    opt_state_dict = program.state_dict(mode='opt')

    type_dict = {}
    shape_dict = {}
    for name, param in param_state_dict.items():
        type_dict[name] = convert_dtype(param._dtype())
        shape_dict[name] = param.shape()
    for name, opt in opt_state_dict.items():
        type_dict[name] = convert_dtype(opt._dtype())
        shape_dict[name] = opt.shape()

    state_dict = {}
    dist_weight_state_dict = {}
    dist_weight_velocity_state_dict = {}
    dist_bias_state_dict = {}
    dist_bias_velocity_state_dict = {}

    for path in os.listdir(checkpoint_dir):
        path = os.path.join(checkpoint_dir, path)
        if not os.path.isfile(path):
            continue

        basename = os.path.basename(path)
        name, ext = os.path.splitext(basename)

        if ext not in ['.pdopt', '.pdparam']:
            continue

        if not for_train and ext == '.pdopt':
            continue

        if name not in type_dict:
            continue

        tensor = paddle.load(path, return_numpy=True)

        if dtype:
            assert dtype in ['float32', 'float16']
            tensor = tensor.astype(dtype)
        elif name in type_dict:
            tensor = tensor.astype(type_dict[name])
        else:
            pass

        if list(shape_dict[name]) != list(tensor.shape):
            # for prelu NHWC [1, 1, 1, C] and NCHW [1, C, 1, 1]
            expect_shape = list(shape_dict[name])
            actual_shape = list(tensor.shape)
            if len(expect_shape) == len(actual_shape) and \
                    expect_shape[0] == actual_shape[0] and expect_shape[0] == 1 and \
                    expect_shape[2] == actual_shape[2] and expect_shape[2] == 1 and \
                    expect_shape[1] == actual_shape[3]:
                if actual_shape[3] != 1:
                    tensor = tensor.transpose([0, 3, 1, 2])
                elif actual_shape[1] != 1:
                    tensor = tensor.transpose([0, 2, 3, 1])

        if 'dist@' in name and '@rank@' in name:
            if '.w' in name and 'velocity' not in name:
                dist_weight_state_dict[name] = tensor
            elif '.w' in name and 'velocity' in name:
                dist_weight_velocity_state_dict[name] = tensor
            elif '.b' in name and 'velocity' not in name:
                dist_bias_state_dict[name] = tensor
            elif '.b' in name and 'velocity' in name:
                dist_bias_velocity_state_dict[name] = tensor
        else:
            state_dict[name] = tensor

    if for_train:
        meta_file = os.path.join(checkpoint_dir, 'meta.json')
        if not os.path.exists(meta_file):
            logging.error(
                "Please make sure the checkpoint dir {} exists, and "
                "the parameters in that dir are valid.".format(
                    checkpoint_dir))
            exit()

        with open(meta_file, 'r') as handle:
            extra_info = json.load(handle)

        # Preprocess distributed parameters.
        pretrain_world_size = extra_info['pretrain_world_size']
        assert pretrain_world_size > 0
        embedding_size = extra_info['embedding_size']
        assert embedding_size == self.embedding_size
        num_classes = extra_info['num_classes']
        assert num_classes == self.num_classes
        logging.info(
            "Parameters for pre-training: pretrain_world_size ({}), "
            "embedding_size ({}), and num_classes ({}).".format(
                pretrain_world_size, embedding_size, num_classes))
        logging.info("Parameters for inference or fine-tuning: "
                     "world_size ({}).".format(self.world_size))

        rank_str = '%05d' % self.rank
        dist_weight_state_dict = rearrange_weight(
            dist_weight_state_dict, pretrain_world_size, self.world_size)
        dist_bias_state_dict = rearrange_weight(
            dist_bias_state_dict, pretrain_world_size, self.world_size)
        for name, value in dist_weight_state_dict.items():
            if rank_str in name:
                state_dict[name] = value
        for name, value in dist_bias_state_dict.items():
            if rank_str in name:
                state_dict[name] = value

        if for_train:
            dist_weight_velocity_state_dict = rearrange_weight(
                dist_weight_velocity_state_dict, pretrain_world_size,
                self.world_size)
            dist_bias_velocity_state_dict = rearrange_weight(
                dist_bias_velocity_state_dict, pretrain_world_size,
                self.world_size)
            for name, value in dist_weight_velocity_state_dict.items():
                if rank_str in name:
                    state_dict[name] = value
            for name, value in dist_bias_velocity_state_dict.items():
                if rank_str in name:
                    state_dict[name] = value

    program.set_state_dict(state_dict)
    logging.info("Load checkpoint from '{}'. ".format(checkpoint_dir))

    if for_train:
        return extra_info
    else:
        return {}