Example #1
def sequence_mask(seq_hidden, mask, mode='zero'):
    """

    Args:
        seq_hidden (Tensor): NULL
        mask (Tensor): 1 for un-mask tokens, and 0 for mask tokens.
        mode (str): zero/-inf/+inf

    Returns: TODO

    Raises: NULL
    """
    dtype = seq_hidden.dtype

    while len(mask.shape) < len(seq_hidden.shape):
        mask = mask.unsqueeze([-1])

    mask = mask.cast(dtype=seq_hidden.dtype)
    masked = paddle.multiply(seq_hidden, mask)
    if mode == 'zero':
        return masked

    if mode == '-inf':
        scale_size = +1e5
    elif mode == '+inf':
        scale_size = -1e5
    else:
        raise ValueError(
            f'mask mode setting error. expect zero/-inf/+inf, but got {mode}')

    add_mask = paddle.scale(mask - 1, scale=scale_size)
    masked = paddle.add(masked, add_mask)
    return masked
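A quick way to see what the three modes do is to run the helper above on a small batch. This is only an illustrative sketch; it assumes a Paddle 2.x install and that sequence_mask is in scope:

import paddle

hidden = paddle.rand([2, 4, 8])                      # [batch, seq_len, hidden]
mask = paddle.to_tensor([[1, 1, 1, 0],
                         [1, 1, 0, 0]], dtype='int64')

zeroed = sequence_mask(hidden, mask, mode='zero')    # padded positions become 0
neg = sequence_mask(hidden, mask, mode='-inf')       # padded positions pushed to -1e5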
Example #2
    def entropy(self):
        """Shannon entropy in nats.

        Returns:
            Tensor: Shannon entropy of Categorical distribution. The data type is float32.

        Examples:
            .. code-block:: python

                import paddle
                from paddle.distribution import Categorical

                paddle.seed(100) # on CPU device
                x = paddle.rand([6])
                print(x)
                # [0.5535528  0.20714243 0.01162981
                #  0.51577556 0.36369765 0.2609165 ]

                cat = Categorical(x)

                cat.entropy()
                # [1.77528]

        """
        name = self.name + '_entropy'
        logits = self.logits - \
            paddle.max(self.logits, axis=-1, keepdim=True)
        e_logits = ops.exp(logits)
        z = paddle.sum(e_logits, axis=-1, keepdim=True)
        prob = e_logits / z

        neg_entropy = paddle.sum(prob * (logits - paddle.log(z)), axis=-1)
        entropy = paddle.scale(neg_entropy, scale=-1.0, name=name)
        return entropy
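The method above is a numerically stable log-sum-exp formulation; on a small tensor it can be cross-checked against a direct -sum(p * log p) over the softmax probabilities. A minimal sketch (Paddle 2.x assumed):

import paddle
import paddle.nn.functional as F

logits = paddle.to_tensor([0.5, 0.3, 0.2])
p = F.softmax(logits, axis=-1)
direct = -(p * paddle.log(p)).sum()   # should agree with Categorical(logits).entropy() as implemented above
print(direct)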
Example #3
def paddle_scale_tensor(name: str, x, scale, bias, attrs: dict, data_type):
    import paddle
    paddle.enable_static()

    with paddle.static.program_guard(paddle.static.Program(),
                                     paddle.static.Program()):
        node_x = paddle.static.data(name='x', shape=x.shape, dtype=data_type)
        node_scale = paddle.static.data(name='scale',
                                        shape=[1],
                                        dtype='float32')
        out = paddle.scale(x=node_x,
                           scale=node_scale,
                           bias=bias,
                           bias_after_scale=attrs['bias_after_scale'])
        # FuzzyTest only supports FP32 for now, so cast the result to fp32
        out = paddle.cast(out, "float32")
        cpu = paddle.static.cpu_places(1)
        exe = paddle.static.Executor(cpu[0])
        # startup program will call initializer to initialize the parameters.
        exe.run(paddle.static.default_startup_program())

        outs = exe.run(feed={'x': x, 'scale': scale}, fetch_list=[out])

        saveModel(name,
                  exe,
                  feedkeys=['x', 'scale'],
                  fetchlist=[out],
                  inputs=[x, np.array([scale]).astype('float32')],
                  outputs=[outs[0]],
                  target_dir=sys.argv[1])

    return outs[0]
Example #4
    def forward(self, batch_size, user_sparse_inputs, mov_sparse_inputs,
                label_input):

        user_sparse_embed_seq = []
        for s_input in user_sparse_inputs:
            emb = self.embedding(s_input)
            emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
            user_sparse_embed_seq.append(emb)

        mov_sparse_embed_seq = []
        for s_input in mov_sparse_inputs:
            s_input = paddle.reshape(s_input, shape=[batch_size, -1])
            emb = self.embedding(s_input)
            emb = paddle.sum(emb, axis=1)
            emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
            mov_sparse_embed_seq.append(emb)

        features = paddle.concat(user_sparse_embed_seq + mov_sparse_embed_seq,
                                 axis=1)

        for n_layer in self._layers:
            features = n_layer(features)

        predict = paddle.scale(features, scale=5)

        return predict
Example #5
    def forward(self, x, condition=None):
        """Forward pass of ``ResidualNet``.
        
        Parameters
        ----------
        x : Tensor [shape=(B, C, T)]
            The input. 
            
        condition : Tensor, optional [shape=(B, C_cond, T)]
            The condition, it has been upsampled in time steps, so it has the 
            same time steps as the input does. Defaults to None.

        Returns
        --------
        Tensor [shape=(B, C, T)]
            The output.
        """
        for i, func in enumerate(self):
            x, skip = func(x, condition)
            if i == 0:
                skip_connections = skip
            else:
                skip_connections = paddle.scale(skip_connections + skip,
                                                math.sqrt(0.5))
        return skip_connections
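The paddle.scale(..., math.sqrt(0.5)) call averages each new skip connection into the running sum while roughly preserving its variance (for independent, unit-variance inputs, Var((a + b) / sqrt(2)) = 1). A quick sanity check, purely illustrative and assuming Paddle 2.x:

import math
import paddle

a = paddle.randn([100000])
b = paddle.randn([100000])
s = paddle.scale(a + b, math.sqrt(0.5))
print(paddle.var(a).item(), paddle.var(s).item())   # both close to 1.0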
Example #6
    def add_input(self, x, condition=None):
        """Take a step input and return a step output. 
        
        This method works similarly to ``forward``, but in a
        ``step-in-step-out`` fashion.

        Parameters
        ----------
        x : Tensor [shape=(B, C)]
            Input for a step.
            
        condition : Tensor, optional [shape=(B, C_cond)]
            Condition for a step. Defaults to None.

        Returns
        ----------            
        Tensor [shape=(B, C)]
            The skip connection for a step. This output is accumulated with 
            that of other ResidualBlocks. 
        """
        for i, func in enumerate(self):
            x, skip = func.add_input(x, condition)
            if i == 0:
                skip_connections = skip
            else:
                skip_connections = paddle.scale(skip_connections + skip,
                                                math.sqrt(0.5))
        return skip_connections
Example #7
    def forward(self, words, feats=None):
        """Forward network"""
        # batch_size, seq_len = words.shape
        # get embedding
        words, x = self.embed(words, feats)
        mask = layers.logical_and(words != self.args.pad_index,
                                  words != self.args.eos_index)

        # apply MLPs to the BiLSTM output states
        arc_h = self.mlp_arc_h(x)
        arc_d = self.mlp_arc_d(x)
        rel_h = self.mlp_rel_h(x)
        rel_d = self.mlp_rel_d(x)

        # get arc and rel scores from the bilinear attention
        # [batch_size, seq_len, seq_len]
        s_arc = self.arc_attn(arc_d, arc_h)
        # [batch_size, seq_len, seq_len, n_rels]
        s_rel = layers.transpose(self.rel_attn(rel_d, rel_h),
                                 perm=(0, 2, 3, 1))
        # set the scores that exceed the length of each sentence to -1e5
        s_arc_mask = paddle.unsqueeze(mask, 1)
        s_arc = s_arc * s_arc_mask + paddle.scale(paddle.cast(
            s_arc_mask, 'int32'),
                                                  scale=1e5,
                                                  bias=-1,
                                                  bias_after_scale=False)

        return s_arc, s_rel, words
Example #8
    def forward(self, words, wp):

        words, x = self.embed(words, wp)
        mask = paddle.logical_and(words != self.pad_index, words != self.eos_index)

        arc_h = self.mlp_arc_h(x)
        arc_d = self.mlp_arc_d(x)
        rel_h = self.mlp_rel_h(x)
        rel_d = self.mlp_rel_d(x)

        # Get arc and rel scores from the bilinear attention
        # Shape: (batch_size, seq_len, seq_len)
        s_arc = self.arc_attn(arc_d, arc_h)
        # Shape: (batch_size, seq_len, seq_len, n_rels)
        s_rel = paddle.transpose(self.rel_attn(rel_d, rel_h), perm=[0, 2, 3, 1])
        # Set the scores that exceed the length of each sentence to -1e5
        s_arc_mask = paddle.unsqueeze(mask, 1)
        s_arc = s_arc * s_arc_mask + paddle.scale(
            paddle.cast(s_arc_mask, 'int32'), scale=1e5, bias=-1, bias_after_scale=False)

        mask = paddle.cast(paddle.logical_and(
            paddle.logical_and(words != self.pad_index, words != self.bos_index),
            words != self.eos_index,
            ), 'int32')
        arc_preds = paddle.argmax(s_arc, axis=-1)
        rel_preds = paddle.argmax(s_rel, axis=-1)    
        return arc_preds, rel_preds, s_arc, mask
Example #9
    def forward(self, batch_size, user_sparse_inputs, mov_sparse_inputs,
                label_input):

        user_sparse_embed_seq = []
        for s_input in user_sparse_inputs:
            emb = self.embedding(s_input)
            emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
            user_sparse_embed_seq.append(emb)

        mov_sparse_embed_seq = []
        for s_input in mov_sparse_inputs:
            s_input = paddle.reshape(s_input, shape=[batch_size, -1])
            emb = self.embedding(s_input)
            emb = paddle.sum(emb, axis=1)
            emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
            mov_sparse_embed_seq.append(emb)

        user_features = paddle.concat(user_sparse_embed_seq, axis=1)
        mov_features = paddle.concat(mov_sparse_embed_seq, axis=1)

        for n_layer in self._user_layers:
            user_features = n_layer(user_features)

        for n_layer in self._movie_layers:
            mov_features = n_layer(mov_features)

        sim = F.cosine_similarity(user_features, mov_features,
                                  axis=1).reshape([-1, 1])
        predict = paddle.scale(sim, scale=5)

        return predict
Example #10
    def softmax_with_cross_entropy(self, shard_logit, shard_one_hot):
        shard_max = paddle.max(shard_logit, axis=1, keepdim=True)
        global_max = shard_max
        paddle.distributed.all_reduce(global_max,
                                      op=paddle.distributed.ReduceOp.MAX)
        shard_logit_new = paddle.subtract(shard_logit, global_max)

        shard_exp = paddle.exp(shard_logit_new)
        shard_demon = paddle.sum(shard_exp, axis=1, keepdim=True)
        global_demon = shard_demon
        paddle.distributed.all_reduce(global_demon,
                                      op=paddle.distributed.ReduceOp.SUM)

        global_log_demon = paddle.log(global_demon)
        shard_log_prob = shard_logit_new - global_log_demon
        shard_prob = paddle.exp(shard_log_prob)

        target_log_prob = paddle.min(shard_log_prob * shard_one_hot,
                                     axis=1,
                                     keepdim=True)
        shard_loss = paddle.scale(target_log_prob, scale=-1.0)
        #TODO paddle.distributed.reducescatter not found
        global_loss = paddle.fluid.layers.collective._c_reducescatter(
            shard_loss, nranks=self.nranks, use_calc_stream=True)
        return global_loss, shard_prob
Example #11
def scaled_dot_product_attention(q,
                                 k,
                                 v,
                                 mask=None,
                                 dropout=0.0,
                                 training=True):
    r"""Scaled dot product attention with masking. 
    
    Assume that q, k, v all have the same leading dimensions (denoted as * in 
    descriptions below). Dropout is applied to attention weights before 
    weighted sum of values.

    Parameters
    -----------
    
    q : Tensor [shape=(\*, T_q, d)]
        the query tensor.
        
    k : Tensor [shape=(\*, T_k, d)]
        the key tensor.
        
    v : Tensor [shape=(\*, T_k, d_v)]
        the value tensor.
        
    mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
        the mask tensor, zeros correspond to paddings. Defaults to None.
    
    Returns
    ----------
    out : Tensor [shape=(\*, T_q, d_v)] 
        the context vector.

    attn_weights : Tensor [shape=(\*, T_q, T_k)]
        the attention weights.
    """
    d = q.shape[-1]  # we only support imperative execution
    qk = paddle.matmul(q, k, transpose_y=True)
    scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))

    if mask is not None:
        scaled_logit += paddle.scale((1.0 - mask), -1e9)  # hard coded here

    attn_weights = F.softmax(scaled_logit, axis=-1)
    attn_weights = F.dropout(attn_weights, dropout, training=training)
    out = paddle.matmul(attn_weights, v)
    return out, attn_weights
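An illustrative call of the function above, assuming the module-level imports it relies on (paddle, math, and F = paddle.nn.functional) are available:

import paddle

q = paddle.rand([2, 5, 16])     # (batch, T_q, d)
k = paddle.rand([2, 7, 16])     # (batch, T_k, d)
v = paddle.rand([2, 7, 32])     # (batch, T_k, d_v)
mask = paddle.ones([2, 5, 7])   # 1 = attend, 0 = padding

out, attn = scaled_dot_product_attention(q, k, v, mask=mask, dropout=0.1, training=False)
print(out.shape, attn.shape)    # [2, 5, 32] [2, 5, 7]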
Example #12
 def forward(self, query, key):
     q = self.q_proj(query)
     k = self.k_proj(key)
     q = paddle.reshape(q, shape=[0, 0, self.num_heads, self.embed_dim])
     k = paddle.reshape(k, shape=[0, 0, self.num_heads, self.embed_dim])
     q = paddle.transpose(q, perm=[0, 2, 1, 3])
     k = paddle.transpose(k, perm=[0, 2, 1, 3])
     scores = paddle.matmul(q, k, transpose_y=True)
     scores = paddle.scale(scores, scale=self.scale_value)
     return scores
Example #13
    def forward(self, usr_var, mov_var):
        # Compute the user features and the movie features
        user_features = self.get_usr_feat(usr_var)
        mov_features = self.get_mov_feat(mov_var)

        # Use the cosine similarity operator to measure how similar the user and the movie are
        sim = F.cosine_similarity(user_features, mov_features,
                                  axis=1).reshape([-1, 1])
        # Scale the similarity up to the same range as the movie ratings
        res = paddle.scale(sim, scale=5)
        return user_features, mov_features, res
Example #14
 def forward(self, x):
     """
     forward
     """
     scale = self.config["scale"]
     if self.config['isTensor']:
         scale = paddle.to_tensor(scale)
     x = paddle.scale(x,
                      scale=scale,
                      bias=self.config["bias"],
                      bias_after_scale=self.config["bias_after_scale"])
     return x
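What this wrapper exercises is paddle.scale's bias_after_scale switch: out = scale * x + bias when it is True (the default), and out = scale * (x + bias) when it is False. A small sketch for intuition (Paddle 2.x assumed):

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])
print(paddle.scale(x, scale=2.0, bias=1.0, bias_after_scale=True))   # 2*x + 1  -> [3., 5., 7.]
print(paddle.scale(x, scale=2.0, bias=1.0, bias_after_scale=False))  # 2*(x+1)  -> [4., 6., 8.]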
Example #15
    def _margin_softmax(input, label, out_dim, param_attr, margin1, margin2,
                        margin3, scale, sample_ratio):
        input_norm = paddle.sqrt(
            paddle.sum(paddle.square(input), axis=1, keepdim=True))
        input = paddle.divide(input, input_norm)

        if param_attr is None:
            param_attr = paddle.ParamAttr(
                initializer=paddle.nn.initializer.XavierNormal(fan_in=0.0))
        weight = paddle.static.create_parameter(
            shape=[input.shape[1], out_dim],
            dtype='float32',
            name=unique_name.generate('final_fc_w'),
            attr=param_attr)

        if sample_ratio < 1.0:
            # partial fc sample process
            label, sampled_class_index = class_center_sample(
                label, out_dim, ratio=sample_ratio, ignore_label=-1)
            sampled_class_index.stop_gradient = True
            weight = paddle.gather(weight, sampled_class_index, axis=1)
            out_dim = paddle.shape(sampled_class_index)

        weight_norm = paddle.sqrt(
            paddle.sum(paddle.square(weight), axis=0, keepdim=True))
        weight = paddle.divide(weight, weight_norm)
        cos = paddle.matmul(input, weight)

        theta = paddle.acos(cos)
        if margin1 != 1.0:
            theta = margin1 * theta
        if margin2 != 0.0:
            theta = theta + margin2
        margin_cos = paddle.cos(theta)
        if margin3 != 0.0:
            margin_cos = margin_cos - margin3

        one_hot = paddle.nn.functional.one_hot(label, num_classes=out_dim)
        diff = paddle.multiply(paddle.subtract(margin_cos, cos), one_hot)
        target_cos = paddle.add(cos, diff)
        logit = paddle.scale(target_cos, scale=scale)

        loss, prob = paddle.nn.functional.softmax_with_cross_entropy(
            logits=logit,
            label=paddle.reshape(label, (-1, 1)),
            return_softmax=True)
        avg_loss = paddle.mean(x=loss)

        one_hot.stop_gradient = True

        return avg_loss, prob
Example #16
 def forward(self, src_word, src_pos):
     src_word_emb = src_word
     src_word_emb = fluid.layers.cast(src_word_emb, 'float32')
     src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
     src_pos = paddle.squeeze(src_pos, axis=-1)
     src_pos_enc = self.emb(src_pos)
     src_pos_enc.stop_gradient = True
     enc_input = src_word_emb + src_pos_enc
     if self.dropout_rate:
         out = F.dropout(
             x=enc_input, p=self.dropout_rate, mode="downscale_in_infer")
     else:
         out = enc_input
     return out
Example #17
    def forward(self, usr_var, mov_var):
        """定义个性化推荐算法的前向计算"""
        # 计算用户特征和电影特征
        user_features = self.get_usr_feature(usr_var)
        mov_features = self.get_movie_feature(mov_var)

        # Compute the similarity from the features; reshape for convenient loss computation
        sim = F.common.cosine_similarity(user_features,
                                         mov_features).reshape([-1, 1])
        # Use the cosine similarity operator to measure how similar the user and the movie are
        # sim = F.cosine_similarity(user_features, mov_features, axis=1).reshape([-1, 1])
        # Scale the similarity up to the same range as the movie ratings
        res = paddle.scale(sim, scale=5)

        return user_features, mov_features, res
Example #18
    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x)
        qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads))
        qkv = qkv.transpose((2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = paddle.scale(paddle.matmul(q, k, transpose_y=True), scale=self.scale)
        attn = F.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)

        x = paddle.matmul(attn, v)
        x = x.transpose((0, 2, 1, 3))
        x = x.reshape((B, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
Example #19
    def forward(self, hist_item_seq, hist_cat_seq, target_item, target_cat,
                label, mask, target_item_seq, target_cat_seq):
        hist_item_emb = self.hist_item_emb_attr(hist_item_seq)
        hist_cat_emb = self.hist_cat_emb_attr(hist_cat_seq)
        target_item_emb = self.target_item_emb_attr(target_item)
        target_cat_emb = self.target_cat_emb_attr(target_cat)
        target_item_seq_emb = self.target_item_seq_emb_attr(target_item_seq)
        target_cat_seq_emb = self.target_cat_seq_emb_attr(target_cat_seq)
        item_b = self.item_b_attr(target_item)

        hist_seq_concat = paddle.concat([hist_item_emb, hist_cat_emb], axis=2)
        target_seq_concat = paddle.concat(
            [target_item_seq_emb, target_cat_seq_emb], axis=2)
        target_concat = paddle.concat([target_item_emb, target_cat_emb],
                                      axis=1)

        concat = paddle.concat([
            hist_seq_concat, target_seq_concat, hist_seq_concat -
            target_seq_concat, hist_seq_concat * target_seq_concat
        ],
                               axis=2)

        for attlayer in self.attention_layer:
            concat = attlayer(concat)

        atten_fc3 = concat + mask
        atten_fc3 = paddle.transpose(atten_fc3, perm=[0, 2, 1])
        atten_fc3 = paddle.scale(atten_fc3, scale=self.firInDim**-0.5)
        weight = paddle.nn.functional.softmax(atten_fc3)

        output = paddle.matmul(weight, hist_seq_concat)

        output = paddle.reshape(output, shape=[0, self.firInDim])

        for firLayer in self.con_layer[:1]:
            concat = firLayer(output)

        embedding_concat = paddle.concat([concat, target_concat], axis=1)

        for colayer in self.con_layer[1:]:
            embedding_concat = colayer(embedding_concat)

        logit = embedding_concat + item_b
        return logit
Example #20
    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(
            (B_, N, 3, self.num_heads, C // self.num_heads)).transpose(
                (2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[
            2]  # make torchscript happy (cannot use tensor as tuple)

        attn = paddle.scale(paddle.matmul(q, k, transpose_y=True),
                            scale=self.scale)

        relative_position_bias = self.relative_position_bias_table.index_select(
            self.relative_position_index.flatten()).reshape(
                (self.window_size[0] * self.window_size[1],
                 self.window_size[0] * self.window_size[1],
                 -1))  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.transpose(
            (2, 0, 1))  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.reshape((B_ // nW, nW, self.num_heads, N,
                                 N)) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.reshape((-1, self.num_heads, N, N))
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = paddle.matmul(attn, v)
        x = x.transpose((0, 2, 1, 3))
        x = x.reshape((B_, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
Example #21
    def forward(self, x, condition=None):
        """Forward pass of the ResidualBlock.

        Parameters
        -----------
        x : Tensor [shape=(B, C, T)]
            The input tensor.
             
        condition : Tensor, optional [shape(B, C_cond, T)]
            The condition. 
            
            It has been upsampled in time steps, so it has the same time steps 
            as the input does.(C_cond stands for the condition's channels). 
            Defaults to None.

        Returns
        -----------
        residual : Tensor [shape=(B, C, T)]
            The residual, which is used as the input to the next ResidualBlock.
            
        skip_connection : Tensor [shape=(B, C, T)]
            The skip connection. This output is accumulated with that of
            other ResidualBlocks.
        """
        h = x

        # dilated conv
        h = self.conv(h)

        # condition
        if condition is not None:
            h += self.condition_proj(condition)

        # gated tanh
        content, gate = paddle.split(h, 2, axis=1)
        z = F.sigmoid(gate) * paddle.tanh(content)

        # projection
        residual = paddle.scale(z + x, math.sqrt(.5))
        skip_connection = z
        return residual, skip_connection
Example #22
    def add_input(self, x, condition=None):
        """Take a step input and return a step output. 
        
        This method works similarly to ``forward``, but in a
        ``step-in-step-out`` fashion.

        Parameters
        ----------
        x : Tensor [shape=(B, C)]
            Input for a step.
            
        condition : Tensor, optional [shape=(B, C_cond)]
            Condition for a step. Defaults to None.

        Returns
        ----------
        residual : Tensor [shape=(B, C)] 
            The residual for a step, which is used as the input to the next 
            layer of ResidualBlock.
            
        skip_connection : Tensor [shape=(B, C)]
            The skip connection for a step. This output is accumulated with
            that of other ResidualBlocks. 
        """
        h = x

        # dilated conv
        h = self.conv.add_input(h)

        # condition
        if condition is not None:
            h += self.condition_proj.add_input(condition)

        # gated tanh
        content, gate = paddle.split(h, 2, axis=1)
        z = F.sigmoid(gate) * paddle.tanh(content)

        # projection
        residual = paddle.scale(z + x, math.sqrt(0.5))
        skip_connection = z
        return residual, skip_connection
Example #23
    def forward(self, src_ids, position_ids, sent_ids, input_mask, mask_pos):
        emb_out = self.word_emb(src_ids)
        position_embs_out = self.position_emb(position_ids)
        emb_out = emb_out + position_embs_out
        sent_emb_out = self.sent_emb(sent_ids)
        emb_out = emb_out + sent_emb_out
        emb_out = self.enc_pre_process_layer(emb_out)
        if self._dtype == "float16":
            input_mask = paddle.cast(x=input_mask, dtype=self._dtype)
        else:
            input_mask = paddle.cast(x=input_mask, dtype='float32')
        self_attn_mask = paddle.matmul(x=input_mask,
                                       y=input_mask,
                                       transpose_y=True)

        self_attn_mask = paddle.scale(x=self_attn_mask,
                                      scale=10000.0,
                                      bias=-1.0,
                                      bias_after_scale=False)
        n_head_self_attn_mask = paddle.stack(x=[self_attn_mask] * self._n_head,
                                             axis=1)
        n_head_self_attn_mask.stop_gradient = True
        self._enc_out = self._enc_out_layer(enc_input=emb_out,
                                            attn_bias=n_head_self_attn_mask)
        mask_pos = paddle.cast(x=mask_pos, dtype='int32')
        reshaped_emb_out = paddle.reshape(x=self._enc_out,
                                          shape=[-1, self._emb_size])
        mask_feat = paddle.gather(x=reshaped_emb_out, index=mask_pos, axis=0)
        mask_trans_feat_out = self.mask_trans_feat(mask_feat)
        mask_trans_feat_out = self.mask_trans_act(mask_trans_feat_out)
        mask_trans_feat_out = self.mask_post_process_layer(
            out=mask_trans_feat_out)
        for name, param in self.named_parameters():
            if name == "word_emb.weight":
                y_tensor = param
                break
        fc_out = paddle.matmul(x=mask_trans_feat_out,
                               y=y_tensor,
                               transpose_y=True)
        fc_out += self.mask_lm_out_bias
        return fc_out
Example #24
    def gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None):
        token_emb_out = self.word_embedding_layer(token_ids)
        type_emb_out = self.sent_embedding_layer(type_ids)
        pos_emb_out = self.pos_embedding_layer(pos_ids)
        emb_out = token_emb_out + type_emb_out + pos_emb_out

        # auxiliary memory embeddings
        if aux_emb is not None:
            emb_out = paddle.concat([aux_emb, emb_out], axis=1)

        emb_out = self.dropout_layer(emb_out)

        # generate n-head self-attention mask
        self_attn_mask = input_mask
        self_attn_mask = paddle.scale(
            x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False)
        n_head_self_attn_mask = paddle.stack(
            x=[self_attn_mask] * self.n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True

        return emb_out, n_head_self_attn_mask
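This snippet and the previous one both use paddle.scale with bias_after_scale=False to turn a 0/1 padding mask into an additive attention bias: kept positions map to 0 and padded positions to a large negative value. A minimal sketch of the transformation (Paddle 2.x assumed):

import paddle

input_mask = paddle.to_tensor([[1.0], [1.0], [0.0]])   # 1 = real token, 0 = padding
attn_bias = paddle.scale(input_mask, scale=1e4, bias=-1.0, bias_after_scale=False)
print(attn_bias)   # 1e4 * (mask - 1) -> [[0.], [0.], [-10000.]]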
Example #25
def local_response_norm(x,
                        size,
                        alpha=1e-4,
                        beta=0.75,
                        k=1.,
                        data_format="NCHW",
                        name=None):
    r"""
        Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
        For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_

        The formula is as follows:

        .. math::

            Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta}

        In the above equation:

        - :math:`size` : The number of channels to sum over.
        - :math:`k` : The offset (avoid being divided by 0).
        - :math:`\\alpha` : The scaling parameter.
        - :math:`\\beta` : The exponent parameter.


        Args:
            x (Tensor): The input 3-D/4-D/5-D tensor. The data type is float32.
            size (int): The number of channels to sum over.
            alpha (float, optional): The scaling parameter, positive. Default:1e-4
            beta (float, optional): The exponent, positive. Default:0.75
            k (float, optional): An offset, positive. Default: 1.0
            data_format (str, optional): Specify the data format of the input, and the data format of the output
                will be consistent with that of the input. An optional string from:
                If x is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`,
                the data is stored in the order of: `[batch_size, input_channels, feature_length]`.
                If x is 4-D Tensor, the string could be  `"NCHW"`, `"NHWC"`. When it is `"NCHW"`,
                the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`.
                If x is 5-D Tensor, the string could be  `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`,
                the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
            name (str, optional): Name for the operation (optional, default is None). For more information,
                please refer to :ref:`api_guide_Name`.

        Returns:
            A tensor storing the transformation result with the same shape and data type as input.


        Examples:

        .. code-block:: python

            import paddle

            x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32")
            y = paddle.nn.functional.local_response_norm(x, size=5)
            print(y.shape)  # [3, 3, 112, 112]
        """
    if not in_dygraph_mode():
        check_variable_and_dtype(x, 'x', ['float32'], 'local_response_norm')
    if data_format not in ['NCL', 'NLC', 'NCHW', 'NHWC', 'NCDHW', 'NDHWC']:
        raise ValueError(
            "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \
            "but got {}".format(data_format))

    sizes = x.shape
    dim = len(sizes)
    if dim < 3:
        raise ValueError(
            'Expected 3D or higher dimensionality input, but got {} dimensions'
            .format(dim))

    for i, sz in enumerate(sizes):
        if not sz > 0:
            raise ValueError("Expected every dim's size to be larger than 0, "
                             "but the size of the {}-th dim is {}".format(
                                 i, sz))

    channel_last = True if data_format[-1] == "C" else False

    from functools import reduce
    sum_sizes = reduce(lambda x, y: x * y, sizes[1:])

    div = paddle.unsqueeze(paddle.multiply(x, x), axis=1)
    if not channel_last:
        pad4d_shape = [0, 0, size // 2, (size - 1) // 2]
        pool2d_shape = (size, 1)
        reshape_shape = [
            sizes[0], 1, sizes[1], sizes[2],
            int(sum_sizes / (sizes[1] * sizes[2]))
        ]
        pad5d_shape = [0, 0, 0, 0, size // 2, (size - 1) // 2]
        pool3d_shape = (size, 1, 1)
    else:
        pad4d_shape = [size // 2, (size - 1) // 2, 0, 0]
        pool2d_shape = (1, size)
        reshape_shape = [
            sizes[0], 1, sizes[1],
            int(sum_sizes / (sizes[1] * sizes[-1])), sizes[-1]
        ]
        pad5d_shape = [size // 2, (size - 1) // 2, 0, 0, 0, 0]
        pool3d_shape = (1, 1, size)

    if dim == 3:
        div = paddle.nn.functional.pad(div, pad=pad4d_shape)
        div = paddle.nn.functional.avg_pool2d(div,
                                              kernel_size=pool2d_shape,
                                              stride=1)
        div = paddle.squeeze(div, axis=1)
    else:
        div = paddle.reshape(div, shape=reshape_shape)
        div = paddle.nn.functional.pad(div,
                                       pad=pad5d_shape,
                                       data_format='NCDHW')
        div = paddle.nn.functional.avg_pool3d(div,
                                              kernel_size=pool3d_shape,
                                              stride=1)
        div = paddle.reshape(paddle.squeeze(div, axis=1), sizes)

    div = paddle.scale(div, scale=alpha, bias=k)
    div = paddle.pow(div, beta)
    res = paddle.divide(x, div, name=name)
    return res
Example #26
    def margin_softmax_classify(self,
                                x,
                                label,
                                margin1=1.0,
                                margin2=0.5,
                                margin3=0.0,
                                logit_scale=64,
                                param_attr=None):
        '''
        reference: ArcFace. https://arxiv.org/abs/1801.07698
        '''
        flatten_dim = reduce(lambda a, b: a * b, x.shape[1:], 1)
        weight, bias = self.create_parameter(dtype=x.dtype,
                                             in_dim=flatten_dim,
                                             param_attr=param_attr,
                                             use_bias=False)

        # normalize x
        x_l2 = paddle.sqrt(paddle.sum(paddle.square(x), axis=1, keepdim=True))
        norm_x = paddle.divide(x, x_l2)

        norm_x_list = []
        paddle.distributed.all_gather(norm_x_list, norm_x)
        norm_x_all = paddle.concat(norm_x_list, axis=0)

        label_list = []
        paddle.distributed.all_gather(label_list, label)
        label_all = paddle.concat(label_list, axis=0)
        label_all.stop_gradient = True

        label_all = paddle.reshape(label_all, (-1, 1))
        shard_label = paddle.shard_index(label_all,
                                         index_num=self.nclasses,
                                         nshards=self.nranks,
                                         shard_id=self.rank_id,
                                         ignore_value=-1)
        shard_label = paddle.reshape(shard_label, (-1, ))
        # TODO check necessary
        shard_label.stop_gradient = True

        if self.sample_ratio < 1.0:
            # partial fc sample process
            shard_label, sampled_class_index = class_center_sample(
                shard_label,
                self.shard_dim,
                ratio=self.sample_ratio,
                ignore_label=-1)
            sampled_class_index.stop_gradient = True
            weight = paddle.gather(weight, sampled_class_index, axis=1)
            shard_dim = paddle.shape(sampled_class_index)
        else:
            shard_dim = self.shard_dim

        # normalize weight
        weight_l2 = paddle.sqrt(
            paddle.sum(paddle.square(weight), axis=0, keepdim=True))
        norm_weight = paddle.divide(weight, weight_l2)

        shard_cos = paddle.matmul(norm_x_all, norm_weight)

        theta = paddle.acos(shard_cos)
        if margin1 != 1.0:
            theta = margin1 * theta
        if margin2 != 0.0:
            theta = theta + margin2
        margin_cos = paddle.cos(theta)
        if margin3 != 0.0:
            margin_cos = margin_cos - margin3

        shard_one_hot = paddle.nn.functional.one_hot(shard_label,
                                                     num_classes=shard_dim)
        # TODO check necessary
        shard_one_hot.stop_gradient = True

        diff = paddle.multiply(paddle.subtract(margin_cos, shard_cos),
                               shard_one_hot)
        shard_target_cos = paddle.add(shard_cos, diff)
        shard_logit = paddle.scale(shard_target_cos, scale=logit_scale)

        global_loss, shard_prob = self.softmax_with_cross_entropy(
            shard_logit, shard_one_hot)
        avg_loss = paddle.mean(global_loss)

        avg_loss._set_info('shard_logit', shard_logit)
        avg_loss._set_info('shard_prob', shard_prob)
        avg_loss._set_info('shard_label', shard_label)
        avg_loss._set_info('shard_dim', shard_dim)

        return avg_loss
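The margin block above implements the ArcFace-style transform cos(margin1 * theta + margin2) - margin3, applied only to the target class and then scaled by logit_scale. A tiny numeric sketch of that transform on a single cosine value (illustrative only, Paddle 2.x assumed):

import paddle

cos_t = paddle.to_tensor([0.8])
theta = paddle.acos(cos_t)
margin_cos = paddle.cos(1.0 * theta + 0.5) - 0.0   # margin1=1.0, margin2=0.5, margin3=0.0
logit = paddle.scale(margin_cos, scale=64.0)       # logit_scale=64, as in the defaults above
print(logit)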
Example #27
 def _executed_api(self, x, scale=1.0, bias=0.0):
     return paddle.scale(x, scale, bias)
Example #28
 def scale2(inputs):
     return paddle.scale(inputs, scale=3, bias=2.1, act="gelu")
Example #29
 def scale1(inputs):
     return paddle.scale(inputs, scale=2.0, bias=1.0)
Example #30
def parallel_self_attention(input,
                            hidden_size,
                            num_attention_heads,
                            mp_rank,
                            mp_nranks,
                            dtype="float32",
                            ring_id=0):
    assert hidden_size % mp_nranks == 0
    hidden_size_per_part = hidden_size // mp_nranks
    assert hidden_size % num_attention_heads == 0
    hidden_size_per_head = hidden_size // num_attention_heads
    assert num_attention_heads % mp_nranks == 0
    num_attention_head_per_part = num_attention_heads // mp_nranks

    query_key_value = column_parallel_linear(input,
                                             hidden_size,
                                             hidden_size * 3,
                                             use_bias=False,
                                             gather_out=False,
                                             mp_rank=mp_rank,
                                             mp_nranks=mp_nranks,
                                             dtype=dtype,
                                             ring_id=ring_id)
    # [sq, b, (np * hn * 3)] -> [sq, b, np, 3 * hn]
    new_shape = query_key_value.shape[:-1] + [num_attention_head_per_part,
                                              3 * hidden_size_per_head]
    query_key_value = paddle.reshape(query_key_value, new_shape)

    # [sq, b, np, 3*hn] -> 3 x [sq, b, np, hn]
    (query, key, value) = paddle.split(query_key_value, 3, axis=-1)

    # [b, np, sq, sk]
    output_size = (query.shape[1], query.shape[2], query.shape[0],
                   key.shape[0])

    # [sq, b, np, bn] -> [sq, b * np, hn]
    query = paddle.reshape(
        query, (output_size[2], output_size[0] * output_size[1], -1))
    key = paddle.reshape(key,
                         (output_size[3], output_size[0] * output_size[1], -1))
    # raw attention scores, scaled by 1 / sqrt(hn)
    norm_factor = hidden_size_per_head ** 0.5
    result = paddle.bmm(paddle.transpose(query, [1, 0, 2]),
                        paddle.transpose(key, [1, 2, 0]))
    result = paddle.scale(result, 1.0 / norm_factor)
    scores = paddle.reshape(result, output_size)

    # [b, np, sq, hn]
    output_size = (value.shape[1], value.shape[2], query.shape[0],
                   value.shape[3])

    # [sk, b * np, hn]
    value = paddle.reshape(
        value, [value.shape[0], output_size[0] * output_size[1], -1])

    # attention probabilities over the key dimension
    attention_probs = paddle.nn.functional.softmax(scores, axis=-1)

    # [b * np, sq, sk]
    attention_probs = paddle.reshape(
        attention_probs, [output_size[0] * output_size[1], output_size[2], -1])

    # [b * np, sq, hn]
    context = paddle.bmm(attention_probs, paddle.transpose(value, [1, 0, 2]))

    # [b, np, sq, hn]
    context = paddle.reshape(context, output_size)

    #[b, np, sq, hn] -> [sq, b, np, hn]
    context = paddle.transpose(context, [2, 0, 1, 3])

    # [sq, b, np, hn] -> [sq, b, hp]
    new_shape = context.shape[:-2] + [hidden_size_per_part]
    context = paddle.reshape(context, new_shape)

    # Output: [sq, b, h]
    output, bias = row_parallel_linear()