def sequence_mask(seq_hidden, mask, mode='zero'):
    """
    Args:
        seq_hidden (Tensor): the sequence hidden states to be masked.
        mask (Tensor): 1 for un-mask tokens, and 0 for mask tokens.
        mode (str): zero/-inf/+inf, the value that masked positions are set to.

    Returns:
        Tensor: the masked hidden states, with the same shape as ``seq_hidden``.

    Raises:
        ValueError: if ``mode`` is not one of zero/-inf/+inf.
    """
    dtype = seq_hidden.dtype
    while len(mask.shape) < len(seq_hidden.shape):
        mask = mask.unsqueeze([-1])
    mask = mask.cast(dtype=dtype)

    masked = paddle.multiply(seq_hidden, mask)
    if mode == 'zero':
        return masked

    if mode == '-inf':
        scale_size = +1e5
    elif mode == '+inf':
        scale_size = -1e5
    else:
        raise ValueError(
            f'mask mode setting error. expect zero/-inf/+inf, but got {mode}')

    add_mask = paddle.scale(mask - 1, scale=scale_size)
    masked = paddle.add(masked, add_mask)
    return masked
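# A minimal usage sketch for sequence_mask above, run in eager mode; the
# tensors and shapes are illustrative only, not part of the original snippet.
import paddle

seq_hidden = paddle.ones([2, 3, 4])  # (batch, seq_len, hidden)
mask = paddle.to_tensor([[1, 1, 0], [1, 0, 0]], dtype='float32')

zeroed = sequence_mask(seq_hidden, mask, mode='zero')   # masked positions -> 0
neg_inf = sequence_mask(seq_hidden, mask, mode='-inf')  # masked positions -> -1e5
print(zeroed.shape, float(neg_inf[0, 2, 0]))            # [2, 3, 4] -100000.0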
def entropy(self):
    """Shannon entropy in nats.

    Returns:
        Tensor: Shannon entropy of Categorical distribution. The data type is float32.

    Examples:
        .. code-block:: python

            import paddle
            from paddle.distribution import Categorical

            paddle.seed(100) # on CPU device
            x = paddle.rand([6])
            print(x)
            # [0.5535528  0.20714243 0.01162981
            #  0.51577556 0.36369765 0.2609165 ]

            cat = Categorical(x)

            cat.entropy()
            # [1.77528]

    """
    name = self.name + '_entropy'
    logits = self.logits - \
        paddle.max(self.logits, axis=-1, keepdim=True)
    e_logits = ops.exp(logits)
    z = paddle.sum(e_logits, axis=-1, keepdim=True)
    prob = e_logits / z
    neg_entropy = paddle.sum(prob * (logits - paddle.log(z)), axis=-1)

    entropy = paddle.scale(neg_entropy, scale=-1.0, name=name)
    return entropy
def paddle_scale_tensor(name: str, x, scale, bias, attrs: dict, data_type):
    import paddle
    paddle.enable_static()

    with paddle.static.program_guard(paddle.static.Program(),
                                     paddle.static.Program()):
        node_x = paddle.static.data(name='x', shape=x.shape, dtype=data_type)
        node_scale = paddle.static.data(name='scale', shape=[1],
                                        dtype='float32')
        out = paddle.scale(x=node_x, scale=node_scale, bias=bias,
                           bias_after_scale=attrs['bias_after_scale'])
        # FuzzyTest only supports FP32 now, so cast the result to fp32
        out = paddle.cast(out, "float32")
        cpu = paddle.static.cpu_places(1)
        exe = paddle.static.Executor(cpu[0])
        # startup program will call initializer to initialize the parameters.
        exe.run(paddle.static.default_startup_program())

        outs = exe.run(feed={'x': x, 'scale': scale}, fetch_list=[out])

        saveModel(name, exe, feedkeys=['x', 'scale'], fetchlist=[out],
                  inputs=[x, np.array([scale]).astype('float32')],
                  outputs=[outs[0]], target_dir=sys.argv[1])

    return outs[0]
def forward(self, batch_size, user_sparse_inputs, mov_sparse_inputs,
            label_input):
    user_sparse_embed_seq = []
    for s_input in user_sparse_inputs:
        emb = self.embedding(s_input)
        emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
        user_sparse_embed_seq.append(emb)

    mov_sparse_embed_seq = []
    for s_input in mov_sparse_inputs:
        s_input = paddle.reshape(s_input, shape=[batch_size, -1])
        emb = self.embedding(s_input)
        emb = paddle.sum(emb, axis=1)
        emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
        mov_sparse_embed_seq.append(emb)

    features = paddle.concat(user_sparse_embed_seq + mov_sparse_embed_seq,
                             axis=1)

    for n_layer in self._layers:
        features = n_layer(features)

    predict = paddle.scale(features, scale=5)
    return predict
def forward(self, x, condition=None):
    """Forward pass of ``ResidualNet``.

    Parameters
    ----------
    x : Tensor [shape=(B, C, T)]
        The input.
    condition : Tensor, optional [shape=(B, C_cond, T)]
        The condition, it has been upsampled in time steps, so it has the
        same time steps as the input does. Defaults to None.

    Returns
    -------
    Tensor [shape=(B, C, T)]
        The output.
    """
    for i, func in enumerate(self):
        x, skip = func(x, condition)
        if i == 0:
            skip_connections = skip
        else:
            skip_connections = paddle.scale(skip_connections + skip,
                                            math.sqrt(0.5))
    return skip_connections
def add_input(self, x, condition=None):
    """Take a step input and return a step output.

    This method works similarly with ``forward`` but in a
    ``step-in-step-out`` fashion.

    Parameters
    ----------
    x : Tensor [shape=(B, C)]
        Input for a step.
    condition : Tensor, optional [shape=(B, C_cond)]
        Condition for a step. Defaults to None.

    Returns
    -------
    Tensor [shape=(B, C)]
        The skip connection for a step. This output is accumulated with
        that of other ResidualBlocks.
    """
    for i, func in enumerate(self):
        x, skip = func.add_input(x, condition)
        if i == 0:
            skip_connections = skip
        else:
            skip_connections = paddle.scale(skip_connections + skip,
                                            math.sqrt(0.5))
    return skip_connections
def forward(self, words, feats=None):
    """Forward network"""
    # batch_size, seq_len = words.shape
    # get embedding
    words, x = self.embed(words, feats)
    mask = layers.logical_and(words != self.args.pad_index,
                              words != self.args.eos_index)

    # apply MLPs to the BiLSTM output states
    arc_h = self.mlp_arc_h(x)
    arc_d = self.mlp_arc_d(x)
    rel_h = self.mlp_rel_h(x)
    rel_d = self.mlp_rel_d(x)

    # get arc and rel scores from the bilinear attention
    # [batch_size, seq_len, seq_len]
    s_arc = self.arc_attn(arc_d, arc_h)
    # [batch_size, seq_len, seq_len, n_rels]
    s_rel = layers.transpose(self.rel_attn(rel_d, rel_h), perm=(0, 2, 3, 1))

    # set the scores that exceed the length of each sentence to -1e5
    s_arc_mask = paddle.unsqueeze(mask, 1)
    s_arc = s_arc * s_arc_mask + paddle.scale(
        paddle.cast(s_arc_mask, 'int32'), scale=1e5, bias=-1,
        bias_after_scale=False)

    return s_arc, s_rel, words
def forward(self, words, wp):
    words, x = self.embed(words, wp)
    mask = paddle.logical_and(words != self.pad_index,
                              words != self.eos_index)

    arc_h = self.mlp_arc_h(x)
    arc_d = self.mlp_arc_d(x)
    rel_h = self.mlp_rel_h(x)
    rel_d = self.mlp_rel_d(x)

    # Get arc and rel scores from the bilinear attention
    # Shape: (batch_size, seq_len, seq_len)
    s_arc = self.arc_attn(arc_d, arc_h)
    # Shape: (batch_size, seq_len, seq_len, n_rels)
    s_rel = paddle.transpose(self.rel_attn(rel_d, rel_h), perm=[0, 2, 3, 1])

    # Set the scores that exceed the length of each sentence to -1e5
    s_arc_mask = paddle.unsqueeze(mask, 1)
    s_arc = s_arc * s_arc_mask + paddle.scale(
        paddle.cast(s_arc_mask, 'int32'), scale=1e5, bias=-1,
        bias_after_scale=False)

    mask = paddle.cast(
        paddle.logical_and(
            paddle.logical_and(words != self.pad_index,
                               words != self.bos_index),
            words != self.eos_index,
        ), 'int32')
    arc_preds = paddle.argmax(s_arc, axis=-1)
    rel_preds = paddle.argmax(s_rel, axis=-1)

    return arc_preds, rel_preds, s_arc, mask
def forward(self, batch_size, user_sparse_inputs, mov_sparse_inputs,
            label_input):
    user_sparse_embed_seq = []
    for s_input in user_sparse_inputs:
        emb = self.embedding(s_input)
        emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
        user_sparse_embed_seq.append(emb)

    mov_sparse_embed_seq = []
    for s_input in mov_sparse_inputs:
        s_input = paddle.reshape(s_input, shape=[batch_size, -1])
        emb = self.embedding(s_input)
        emb = paddle.sum(emb, axis=1)
        emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
        mov_sparse_embed_seq.append(emb)

    user_features = paddle.concat(user_sparse_embed_seq, axis=1)
    mov_features = paddle.concat(mov_sparse_embed_seq, axis=1)

    for n_layer in self._user_layers:
        user_features = n_layer(user_features)

    for n_layer in self._movie_layers:
        mov_features = n_layer(mov_features)

    sim = F.cosine_similarity(user_features, mov_features,
                              axis=1).reshape([-1, 1])
    predict = paddle.scale(sim, scale=5)
    return predict
def softmax_with_cross_entropy(self, shard_logit, shard_one_hot):
    shard_max = paddle.max(shard_logit, axis=1, keepdim=True)
    global_max = shard_max
    paddle.distributed.all_reduce(global_max,
                                  op=paddle.distributed.ReduceOp.MAX)
    shard_logit_new = paddle.subtract(shard_logit, global_max)

    shard_exp = paddle.exp(shard_logit_new)
    shard_demon = paddle.sum(shard_exp, axis=1, keepdim=True)
    global_demon = shard_demon
    paddle.distributed.all_reduce(global_demon,
                                  op=paddle.distributed.ReduceOp.SUM)

    global_log_demon = paddle.log(global_demon)
    shard_log_prob = shard_logit_new - global_log_demon
    shard_prob = paddle.exp(shard_log_prob)

    target_log_prob = paddle.min(shard_log_prob * shard_one_hot,
                                 axis=1, keepdim=True)
    shard_loss = paddle.scale(target_log_prob, scale=-1.0)
    # TODO paddle.distributed.reducescatter not found
    global_loss = paddle.fluid.layers.collective._c_reducescatter(
        shard_loss, nranks=self.nranks, use_calc_stream=True)
    return global_loss, shard_prob
def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
                                 training=True):
    r"""Scaled dot product attention with masking.

    Assume that q, k, v all have the same leading dimensions (denoted as \*
    in descriptions below). Dropout is applied to attention weights before
    the weighted sum of values.

    Parameters
    ----------
    q : Tensor [shape=(\*, T_q, d)]
        the query tensor.
    k : Tensor [shape=(\*, T_k, d)]
        the key tensor.
    v : Tensor [shape=(\*, T_k, d_v)]
        the value tensor.
    mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
        the mask tensor, zeros correspond to paddings. Defaults to None.

    Returns
    -------
    out : Tensor [shape=(\*, T_q, d_v)]
        the context vector.
    attn_weights : Tensor [shape=(\*, T_q, T_k)]
        the attention weights.
    """
    d = q.shape[-1]  # we only support imperative execution
    qk = paddle.matmul(q, k, transpose_y=True)
    scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))

    if mask is not None:
        scaled_logit += paddle.scale((1.0 - mask), -1e9)  # hard coded here

    attn_weights = F.softmax(scaled_logit, axis=-1)
    attn_weights = F.dropout(attn_weights, dropout, training=training)
    out = paddle.matmul(attn_weights, v)
    return out, attn_weights
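# A quick smoke test for scaled_dot_product_attention above, exercised in
# eager mode. The shapes and random inputs are illustrative only.
import math
import paddle
import paddle.nn.functional as F

q = paddle.randn([2, 5, 8])    # (batch, T_q, d)
k = paddle.randn([2, 6, 8])    # (batch, T_k, d)
v = paddle.randn([2, 6, 16])   # (batch, T_k, d_v)
mask = paddle.ones([2, 5, 6])  # 1 = keep, 0 = padding

out, attn = scaled_dot_product_attention(q, k, v, mask=mask, dropout=0.0,
                                         training=False)
print(out.shape, attn.shape)   # [2, 5, 16] [2, 5, 6]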
def forward(self, query, key):
    q = self.q_proj(query)
    k = self.k_proj(key)
    q = paddle.reshape(q, shape=[0, 0, self.num_heads, self.embed_dim])
    k = paddle.reshape(k, shape=[0, 0, self.num_heads, self.embed_dim])
    q = paddle.transpose(q, perm=[0, 2, 1, 3])
    k = paddle.transpose(k, perm=[0, 2, 1, 3])
    scores = paddle.matmul(q, k, transpose_y=True)
    scores = paddle.scale(scores, scale=self.scale_value)
    return scores
def forward(self, usr_var, mov_var):
    # compute user features and movie features
    user_features = self.get_usr_feat(usr_var)
    mov_features = self.get_mov_feat(mov_var)

    # use the cosine-similarity op to measure how similar the user and movie are
    sim = F.cosine_similarity(user_features, mov_features,
                              axis=1).reshape([-1, 1])
    # scale the similarity to the same range as the movie ratings
    res = paddle.scale(sim, scale=5)
    return user_features, mov_features, res
def forward(self, x):
    """
    forward
    """
    scale = self.config["scale"]
    if self.config['isTensor']:
        scale = paddle.to_tensor(scale)
    x = paddle.scale(x, scale=scale, bias=self.config["bias"],
                     bias_after_scale=self.config["bias_after_scale"])
    return x
def _margin_softmax(input, label, out_dim, param_attr, margin1, margin2,
                    margin3, scale, sample_ratio):
    input_norm = paddle.sqrt(
        paddle.sum(paddle.square(input), axis=1, keepdim=True))
    input = paddle.divide(input, input_norm)

    if param_attr is None:
        param_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierNormal(fan_in=0.0))
    weight = paddle.static.create_parameter(
        shape=[input.shape[1], out_dim],
        dtype='float32',
        name=unique_name.generate('final_fc_w'),
        attr=param_attr)

    if sample_ratio < 1.0:
        # partial fc sample process
        label, sampled_class_index = class_center_sample(
            label, out_dim, ratio=sample_ratio, ignore_label=-1)
        sampled_class_index.stop_gradient = True
        weight = paddle.gather(weight, sampled_class_index, axis=1)
        out_dim = paddle.shape(sampled_class_index)

    weight_norm = paddle.sqrt(
        paddle.sum(paddle.square(weight), axis=0, keepdim=True))
    weight = paddle.divide(weight, weight_norm)
    cos = paddle.matmul(input, weight)

    theta = paddle.acos(cos)
    if margin1 != 1.0:
        theta = margin1 * theta
    if margin2 != 0.0:
        theta = theta + margin2
    margin_cos = paddle.cos(theta)
    if margin3 != 0.0:
        margin_cos = margin_cos - margin3

    one_hot = paddle.nn.functional.one_hot(label, num_classes=out_dim)
    diff = paddle.multiply(paddle.subtract(margin_cos, cos), one_hot)
    target_cos = paddle.add(cos, diff)
    logit = paddle.scale(target_cos, scale=scale)

    loss, prob = paddle.nn.functional.softmax_with_cross_entropy(
        logits=logit,
        label=paddle.reshape(label, (-1, 1)),
        return_softmax=True)
    avg_loss = paddle.mean(x=loss)

    one_hot.stop_gradient = True

    return avg_loss, prob
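# The margin branch above applies the combined-margin form
# cos(margin1 * theta + margin2) - margin3 used by ArcFace/CosFace-style
# losses before rescaling the logit. A small numeric sketch of that transform
# on a single cosine value; the margins follow the common ArcFace setting
# (m1=1.0, m2=0.5, m3=0.0) and the input value is illustrative only.
import paddle

cos = paddle.to_tensor([0.8])
theta = paddle.acos(cos)                    # angle between feature and class center
margin_cos = paddle.cos(1.0 * theta + 0.5)  # cos(theta + 0.5) ~= 0.41
print(margin_cos.numpy())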
def forward(self, src_word, src_pos):
    src_word_emb = src_word
    src_word_emb = fluid.layers.cast(src_word_emb, 'float32')
    src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
    src_pos = paddle.squeeze(src_pos, axis=-1)
    src_pos_enc = self.emb(src_pos)
    src_pos_enc.stop_gradient = True
    enc_input = src_word_emb + src_pos_enc
    if self.dropout_rate:
        out = F.dropout(x=enc_input, p=self.dropout_rate,
                        mode="downscale_in_infer")
    else:
        out = enc_input
    return out
def forward(self, usr_var, mov_var):
    """Forward computation of the personalized recommendation model."""
    # compute user features and movie features
    user_features = self.get_usr_feature(usr_var)
    mov_features = self.get_movie_feature(mov_var)

    # compute the similarity of the two features; reshape to ease the loss computation
    sim = F.common.cosine_similarity(user_features,
                                     mov_features).reshape([-1, 1])
    # use the cosine-similarity op to measure how similar the user and movie are
    # sim = F.cosine_similarity(user_features, mov_features, axis=1).reshape([-1, 1])

    # scale the similarity to the same range as the movie ratings
    res = paddle.scale(sim, scale=5)
    return user_features, mov_features, res
def forward(self, x):
    B, N, C = x.shape
    qkv = self.qkv(x)
    qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads))
    qkv = qkv.transpose((2, 0, 3, 1, 4))
    q, k, v = qkv[0], qkv[1], qkv[2]

    attn = paddle.scale(paddle.matmul(q, k, transpose_y=True),
                        scale=self.scale)
    attn = F.softmax(attn, axis=-1)
    attn = self.attn_drop(attn)

    x = paddle.matmul(attn, v)
    x = x.transpose((0, 2, 1, 3))
    x = x.reshape((B, N, C))
    x = self.proj(x)
    x = self.proj_drop(x)
    return x
def forward(self, hist_item_seq, hist_cat_seq, target_item, target_cat,
            label, mask, target_item_seq, target_cat_seq):
    hist_item_emb = self.hist_item_emb_attr(hist_item_seq)
    hist_cat_emb = self.hist_cat_emb_attr(hist_cat_seq)
    target_item_emb = self.target_item_emb_attr(target_item)
    target_cat_emb = self.target_cat_emb_attr(target_cat)
    target_item_seq_emb = self.target_item_seq_emb_attr(target_item_seq)
    target_cat_seq_emb = self.target_cat_seq_emb_attr(target_cat_seq)
    item_b = self.item_b_attr(target_item)

    hist_seq_concat = paddle.concat([hist_item_emb, hist_cat_emb], axis=2)
    target_seq_concat = paddle.concat(
        [target_item_seq_emb, target_cat_seq_emb], axis=2)
    target_concat = paddle.concat([target_item_emb, target_cat_emb], axis=1)

    concat = paddle.concat([
        hist_seq_concat, target_seq_concat,
        hist_seq_concat - target_seq_concat,
        hist_seq_concat * target_seq_concat
    ], axis=2)

    for attlayer in self.attention_layer:
        concat = attlayer(concat)

    atten_fc3 = concat + mask
    atten_fc3 = paddle.transpose(atten_fc3, perm=[0, 2, 1])
    atten_fc3 = paddle.scale(atten_fc3, scale=self.firInDim**-0.5)
    weight = paddle.nn.functional.softmax(atten_fc3)
    output = paddle.matmul(weight, hist_seq_concat)
    output = paddle.reshape(output, shape=[0, self.firInDim])

    for firLayer in self.con_layer[:1]:
        concat = firLayer(output)

    embedding_concat = paddle.concat([concat, target_concat], axis=1)

    for colayer in self.con_layer[1:]:
        embedding_concat = colayer(embedding_concat)

    logit = embedding_concat + item_b
    return logit
def forward(self, x, mask=None):
    """
    Args:
        x: input features with shape of (num_windows*B, N, C)
        mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
    """
    B_, N, C = x.shape
    qkv = self.qkv(x).reshape(
        (B_, N, 3, self.num_heads, C // self.num_heads)).transpose(
            (2, 0, 3, 1, 4))
    q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

    attn = paddle.scale(paddle.matmul(q, k, transpose_y=True),
                        scale=self.scale)

    relative_position_bias = self.relative_position_bias_table.index_select(
        self.relative_position_index.flatten()).reshape(
            (self.window_size[0] * self.window_size[1],
             self.window_size[0] * self.window_size[1], -1))  # Wh*Ww, Wh*Ww, nH
    relative_position_bias = relative_position_bias.transpose(
        (2, 0, 1))  # nH, Wh*Ww, Wh*Ww
    attn = attn + relative_position_bias.unsqueeze(0)

    if mask is not None:
        nW = mask.shape[0]
        attn = attn.reshape(
            (B_ // nW, nW, self.num_heads, N, N)) + mask.unsqueeze(1).unsqueeze(0)
        attn = attn.reshape((-1, self.num_heads, N, N))
        attn = self.softmax(attn)
    else:
        attn = self.softmax(attn)

    attn = self.attn_drop(attn)

    x = paddle.matmul(attn, v)
    x = x.transpose((0, 2, 1, 3))
    x = x.reshape((B_, N, C))
    x = self.proj(x)
    x = self.proj_drop(x)
    return x
def forward(self, x, condition=None):
    """Forward pass of the ResidualBlock.

    Parameters
    ----------
    x : Tensor [shape=(B, C, T)]
        The input tensor.
    condition : Tensor, optional [shape=(B, C_cond, T)]
        The condition. It has been upsampled in time steps, so it has the
        same time steps as the input does. (C_cond stands for the
        condition's channels.) Defaults to None.

    Returns
    -------
    residual : Tensor [shape=(B, C, T)]
        The residual, which is used as the input to the next ResidualBlock.
    skip_connection : Tensor [shape=(B, C, T)]
        The skip connection. This output is accumulated with that of other
        ResidualBlocks.
    """
    h = x

    # dilated conv
    h = self.conv(h)

    # condition
    if condition is not None:
        h += self.condition_proj(condition)

    # gated tanh
    content, gate = paddle.split(h, 2, axis=1)
    z = F.sigmoid(gate) * paddle.tanh(content)

    # projection
    residual = paddle.scale(z + x, math.sqrt(0.5))
    skip_connection = z
    return residual, skip_connection
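# The residual path above rescales (z + x) by sqrt(0.5) so that the variance
# of the sum stays roughly constant as blocks are stacked. A minimal check of
# the paddle.scale call used there; the tensor shapes are illustrative only.
import math
import paddle

z = paddle.randn([4, 64, 100])
x = paddle.randn([4, 64, 100])
residual = paddle.scale(z + x, math.sqrt(0.5))
# equivalent to an explicit elementwise multiply
print(bool(paddle.allclose(residual, (z + x) * math.sqrt(0.5))))  # True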
def add_input(self, x, condition=None):
    """Take a step input and return a step output.

    This method works similarly with ``forward`` but in a
    ``step-in-step-out`` fashion.

    Parameters
    ----------
    x : Tensor [shape=(B, C)]
        Input for a step.
    condition : Tensor, optional [shape=(B, C_cond)]
        Condition for a step. Defaults to None.

    Returns
    -------
    residual : Tensor [shape=(B, C)]
        The residual for a step, which is used as the input to the next
        layer of ResidualBlock.
    skip_connection : Tensor [shape=(B, C)]
        The skip connection for a step. This output is accumulated with
        that of other ResidualBlocks.
    """
    h = x

    # dilated conv
    h = self.conv.add_input(h)

    # condition
    if condition is not None:
        h += self.condition_proj.add_input(condition)

    # gated tanh
    content, gate = paddle.split(h, 2, axis=1)
    z = F.sigmoid(gate) * paddle.tanh(content)

    # projection
    residual = paddle.scale(z + x, math.sqrt(0.5))
    skip_connection = z
    return residual, skip_connection
def forward(self, src_ids, position_ids, sent_ids, input_mask, mask_pos):
    emb_out = self.word_emb(src_ids)
    position_embs_out = self.position_emb(position_ids)
    emb_out = emb_out + position_embs_out
    sent_emb_out = self.sent_emb(sent_ids)
    emb_out = emb_out + sent_emb_out
    emb_out = self.enc_pre_process_layer(emb_out)

    if self._dtype == "float16":
        input_mask = paddle.cast(x=input_mask, dtype=self._dtype)
    else:
        input_mask = paddle.cast(x=input_mask, dtype='float32')

    self_attn_mask = paddle.matmul(x=input_mask, y=input_mask,
                                   transpose_y=True)
    self_attn_mask = paddle.scale(x=self_attn_mask, scale=10000.0,
                                  bias=-1.0, bias_after_scale=False)
    n_head_self_attn_mask = paddle.stack(x=[self_attn_mask] * self._n_head,
                                         axis=1)
    n_head_self_attn_mask.stop_gradient = True

    self._enc_out = self._enc_out_layer(enc_input=emb_out,
                                        attn_bias=n_head_self_attn_mask)

    mask_pos = paddle.cast(x=mask_pos, dtype='int32')
    reshaped_emb_out = paddle.reshape(x=self._enc_out,
                                      shape=[-1, self._emb_size])
    mask_feat = paddle.gather(x=reshaped_emb_out, index=mask_pos, axis=0)

    mask_trans_feat_out = self.mask_trans_feat(mask_feat)
    mask_trans_feat_out = self.mask_trans_act(mask_trans_feat_out)
    mask_trans_feat_out = self.mask_post_process_layer(
        out=mask_trans_feat_out)

    for name, param in self.named_parameters():
        if name == "word_emb.weight":
            y_tensor = param
            break

    fc_out = paddle.matmul(x=mask_trans_feat_out, y=y_tensor,
                           transpose_y=True)
    fc_out += self.mask_lm_out_bias
    return fc_out
def gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None):
    token_emb_out = self.word_embedding_layer(token_ids)
    type_emb_out = self.sent_embedding_layer(type_ids)
    pos_emb_out = self.pos_embedding_layer(pos_ids)
    emb_out = token_emb_out + type_emb_out + pos_emb_out

    # auxiliary memory embeddings
    if aux_emb is not None:
        emb_out = paddle.concat([aux_emb, emb_out], axis=1)

    emb_out = self.dropout_layer(emb_out)

    # generate n-head self-attention mask
    self_attn_mask = input_mask
    self_attn_mask = paddle.scale(
        x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False)
    n_head_self_attn_mask = paddle.stack(
        x=[self_attn_mask] * self.n_head, axis=1)
    n_head_self_attn_mask.stop_gradient = True

    return emb_out, n_head_self_attn_mask
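# With bias_after_scale=False, paddle.scale computes scale * (x + bias), so
# the call above maps a {0, 1} padding mask to additive attention biases
# {-10000, 0}. A small standalone check; the mask values are illustrative only.
import paddle

input_mask = paddle.to_tensor([[1., 1., 0., 0.]])
attn_bias = paddle.scale(x=input_mask, scale=1e4, bias=-1.0,
                         bias_after_scale=False)
print(attn_bias.numpy())  # [[ 0. 0. -10000. -10000.]]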
def local_response_norm(x, size, alpha=1e-4, beta=0.75, k=1.,
                        data_format="NCHW", name=None):
    r"""
    Local Response Normalization performs a type of "lateral inhibition" by
    normalizing over local input regions. For more information, please refer
    to `ImageNet Classification with Deep Convolutional Neural Networks
    <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_

    The formula is as follows:

    .. math::

        Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta}

    In the above equation:

    - :math:`size` : The number of channels to sum over.
    - :math:`k` : The offset (avoid being divided by 0).
    - :math:`\alpha` : The scaling parameter.
    - :math:`\beta` : The exponent parameter.

    Args:
        x (Tensor): The input 3-D/4-D/5-D tensor. The data type is float32.
        size (int): The number of channels to sum over.
        alpha (float, optional): The scaling parameter, positive. Default: 1e-4.
        beta (float, optional): The exponent, positive. Default: 0.75.
        k (float, optional): An offset, positive. Default: 1.0.
        data_format (str, optional): Specify the data format of the input, and the
            data format of the output will be consistent with that of the input.
            An optional string from:
            If x is a 3-D Tensor, the string could be ``"NCL"`` or ``"NLC"``. When
            it is ``"NCL"``, the data is stored in the order of
            ``[batch_size, input_channels, feature_length]``.
            If x is a 4-D Tensor, the string could be ``"NCHW"`` or ``"NHWC"``. When
            it is ``"NCHW"``, the data is stored in the order of
            ``[batch_size, input_channels, input_height, input_width]``.
            If x is a 5-D Tensor, the string could be ``"NCDHW"`` or ``"NDHWC"``.
            When it is ``"NCDHW"``, the data is stored in the order of
            ``[batch_size, input_channels, input_depth, input_height, input_width]``.
        name (str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        A tensor storing the transformation result with the same shape and data
        type as input.

    Examples:

        .. code-block:: python

            import paddle

            x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32")
            y = paddle.nn.functional.local_response_norm(x, size=5)
            print(y.shape)  # [3, 3, 112, 112]
    """
    if not in_dygraph_mode():
        check_variable_and_dtype(x, 'x', ['float32'], 'local_response_norm')
    if data_format not in ['NCL', 'NLC', 'NCHW', 'NHWC', 'NCDHW', 'NDHWC']:
        raise ValueError(
            "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], "
            "but got {}".format(data_format))

    sizes = x.shape
    dim = len(sizes)
    if dim < 3:
        raise ValueError(
            'Expected 3D or higher dimensionality input, but got {} dimensions'
            .format(dim))

    for i, sz in enumerate(sizes):
        if not sz > 0:
            raise ValueError("Expected every dim's size to be larger than 0, "
                             "but the size of the {}-th dim is {}".format(i, sz))

    channel_last = True if data_format[-1] == "C" else False

    from functools import reduce
    sum_sizes = reduce(lambda x, y: x * y, sizes[1:])

    div = paddle.unsqueeze(paddle.multiply(x, x), axis=1)
    if not channel_last:
        pad4d_shape = [0, 0, size // 2, (size - 1) // 2]
        pool2d_shape = (size, 1)
        reshape_shape = [
            sizes[0], 1, sizes[1], sizes[2],
            int(sum_sizes / (sizes[1] * sizes[2]))
        ]
        pad5d_shape = [0, 0, 0, 0, size // 2, (size - 1) // 2]
        pool3d_shape = (size, 1, 1)
    else:
        pad4d_shape = [size // 2, (size - 1) // 2, 0, 0]
        pool2d_shape = (1, size)
        reshape_shape = [
            sizes[0], 1, sizes[1],
            int(sum_sizes / (sizes[1] * sizes[-1])), sizes[-1]
        ]
        pad5d_shape = [size // 2, (size - 1) // 2, 0, 0, 0, 0]
        pool3d_shape = (1, 1, size)

    if dim == 3:
        div = paddle.nn.functional.pad(div, pad=pad4d_shape)
        div = paddle.nn.functional.avg_pool2d(div, kernel_size=pool2d_shape,
                                              stride=1)
        div = paddle.squeeze(div, axis=1)
    else:
        div = paddle.reshape(div, shape=reshape_shape)
        div = paddle.nn.functional.pad(div, pad=pad5d_shape,
                                       data_format='NCDHW')
        div = paddle.nn.functional.avg_pool3d(div, kernel_size=pool3d_shape,
                                              stride=1)
        div = paddle.reshape(paddle.squeeze(div, axis=1), sizes)

    div = paddle.scale(div, scale=alpha, bias=k)
    div = paddle.pow(div, beta)
    res = paddle.divide(x, div, name=name)
    return res
def margin_softmax_classify(self,
                            x,
                            label,
                            margin1=1.0,
                            margin2=0.5,
                            margin3=0.0,
                            logit_scale=64,
                            param_attr=None):
    '''
    reference: ArcFace. https://arxiv.org/abs/1801.07698
    '''
    flatten_dim = reduce(lambda a, b: a * b, x.shape[1:], 1)
    weight, bias = self.create_parameter(dtype=x.dtype,
                                         in_dim=flatten_dim,
                                         param_attr=param_attr,
                                         use_bias=False)

    # normalize x
    x_l2 = paddle.sqrt(paddle.sum(paddle.square(x), axis=1, keepdim=True))
    norm_x = paddle.divide(x, x_l2)

    norm_x_list = []
    paddle.distributed.all_gather(norm_x_list, norm_x)
    norm_x_all = paddle.concat(norm_x_list, axis=0)

    label_list = []
    paddle.distributed.all_gather(label_list, label)
    label_all = paddle.concat(label_list, axis=0)
    label_all.stop_gradient = True

    label_all = paddle.reshape(label_all, (-1, 1))
    shard_label = paddle.shard_index(label_all,
                                     index_num=self.nclasses,
                                     nshards=self.nranks,
                                     shard_id=self.rank_id,
                                     ignore_value=-1)
    shard_label = paddle.reshape(shard_label, (-1, ))
    # TODO check necessary
    shard_label.stop_gradient = True

    if self.sample_ratio < 1.0:
        # partial fc sample process
        shard_label, sampled_class_index = class_center_sample(
            shard_label, self.shard_dim, ratio=self.sample_ratio,
            ignore_label=-1)
        sampled_class_index.stop_gradient = True
        weight = paddle.gather(weight, sampled_class_index, axis=1)
        shard_dim = paddle.shape(sampled_class_index)
    else:
        shard_dim = self.shard_dim

    # normalize weight
    weight_l2 = paddle.sqrt(
        paddle.sum(paddle.square(weight), axis=0, keepdim=True))
    norm_weight = paddle.divide(weight, weight_l2)

    shard_cos = paddle.matmul(norm_x_all, norm_weight)

    theta = paddle.acos(shard_cos)
    if margin1 != 1.0:
        theta = margin1 * theta
    if margin2 != 0.0:
        theta = theta + margin2
    margin_cos = paddle.cos(theta)
    if margin3 != 0.0:
        margin_cos = margin_cos - margin3

    shard_one_hot = paddle.nn.functional.one_hot(shard_label,
                                                 num_classes=shard_dim)
    # TODO check necessary
    shard_one_hot.stop_gradient = True

    diff = paddle.multiply(paddle.subtract(margin_cos, shard_cos),
                           shard_one_hot)
    shard_target_cos = paddle.add(shard_cos, diff)
    shard_logit = paddle.scale(shard_target_cos, scale=logit_scale)

    global_loss, shard_prob = self.softmax_with_cross_entropy(
        shard_logit, shard_one_hot)
    avg_loss = paddle.mean(global_loss)

    avg_loss._set_info('shard_logit', shard_logit)
    avg_loss._set_info('shard_prob', shard_prob)
    avg_loss._set_info('shard_label', shard_label)
    avg_loss._set_info('shard_dim', shard_dim)

    return avg_loss
def _executed_api(self, x, scale=1.0, bias=0.0):
    return paddle.scale(x, scale, bias)
def scale2(inputs):
    return paddle.scale(inputs, scale=3, bias=2.1, act="gelu")
def scale1(inputs):
    return paddle.scale(inputs, scale=2.0, bias=1.0)
def parallel_self_attention(input,
                            hidden_size,
                            num_attention_heads,
                            mp_rank,
                            mp_nranks,
                            dtype="float32",
                            ring_id=0):
    assert hidden_size % mp_nranks == 0
    hidden_size_per_part = hidden_size // mp_nranks
    assert hidden_size % num_attention_heads == 0
    hidden_size_per_head = hidden_size // num_attention_heads
    assert num_attention_heads % mp_nranks == 0
    num_attention_head_per_part = num_attention_heads // mp_nranks

    query_key_value = column_parallel_linear(input,
                                             hidden_size,
                                             hidden_size * 3,
                                             use_bias=False,
                                             gather_out=False,
                                             mp_rank=mp_rank,
                                             mp_nranks=mp_nranks,
                                             dtype=dtype,
                                             ring_id=ring_id)

    # [sq, b, (np * hn * 3)] -> [sq, b, np, 3 * hn]
    new_shape = query_key_value.shape[:-1] + [
        num_attention_head_per_part, 3 * hidden_size_per_head
    ]
    query_key_value = paddle.reshape(query_key_value, new_shape)

    # [sq, b, np, 3 * hn] -> 3 x [sq, b, np, hn]
    (query, key, value) = paddle.split(query_key_value, 3, axis=-1)

    # [b, np, sq, sk]
    output_size = (query.shape[1], query.shape[2], query.shape[0],
                   key.shape[0])

    # [sq, b, np, hn] -> [sq, b * np, hn]
    query = paddle.reshape(
        query, (output_size[2], output_size[0] * output_size[1], -1))
    key = paddle.reshape(
        key, (output_size[3], output_size[0] * output_size[1], -1))

    # raw attention scores: [b * np, sq, sk], scaled by 1 / sqrt(hn)
    norm_factor = hidden_size_per_head**0.5
    result = paddle.bmm(paddle.transpose(query, [1, 0, 2]),
                        paddle.transpose(key, [1, 2, 0]))
    result = paddle.scale(result, 1.0 / norm_factor)
    scores = paddle.reshape(result, output_size)

    # attention probabilities: softmax over the key dimension
    attention_probs = paddle.nn.functional.softmax(scores, axis=-1)

    # [b, np, sq, hn]
    output_size = (value.shape[1], value.shape[2], query.shape[0],
                   value.shape[3])

    # [sk, b * np, hn]
    value = paddle.reshape(
        value, [value.shape[0], output_size[0] * output_size[1], -1])

    # [b * np, sq, sk]
    attention_probs = paddle.reshape(
        attention_probs,
        [output_size[0] * output_size[1], output_size[2], -1])

    # [b * np, sq, hn]
    context = paddle.bmm(attention_probs, paddle.transpose(value, [1, 0, 2]))

    # [b, np, sq, hn]
    context = paddle.reshape(context, output_size)

    # [b, np, sq, hn] -> [sq, b, np, hn]
    context = paddle.transpose(context, [2, 0, 1, 3])

    # [sq, b, np, hn] -> [sq, b, hp]
    new_shape = context.shape[:-2] + [hidden_size_per_part]
    context = paddle.reshape(context, new_shape)

    # Output: [sq, b, h]
    output, bias = row_parallel_linear()