Example #1
    def softmax_sampling(self, scores, mask, eta):
        scores = scores * mask
        scores_padded = layers.squeeze(
            fluid_sequence_pad(scores, 0, maxlen=128),
            [2])  # (b*s, 1) -> (b, s, 1) -> (b, s)
        mask_padded = layers.squeeze(fluid_sequence_pad(mask, 0, maxlen=128),
                                     [2])
        seq_lens = fluid_sequence_get_seq_len(scores)

        def normalize(scores_padded, mask_padded):
            mean_S = layers.reduce_sum(scores_padded, dim=1,
                                       keep_dim=True) / layers.reduce_sum(
                                           mask_padded, dim=1, keep_dim=True)
            S = scores_padded - mean_S
            std_S = layers.sqrt(
                layers.reduce_sum(layers.square(S * mask_padded),
                                  dim=1,
                                  keep_dim=True))
            return S / (std_S + self.SAFE_EPS)

        norm_S = normalize(scores_padded, mask_padded)
        # push padded positions (mask == 0) to a large negative value so the
        # softmax assigns them (near-)zero probability
        norm_S = norm_S * mask_padded - (mask_padded *
                                         (-1) + 1) * self.BIG_VALUE
        soft_prob = layers.softmax(norm_S / eta) * mask_padded
        sampled_id = layers.reshape(layers.sampling_id(soft_prob), [-1, 1])
        # clip the sampled index to the last valid position of each sequence
        max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
        sampled_id = layers.elementwise_min(sampled_id, max_id)
        return layers.cast(sampled_id, 'int64')
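
The method above is written against the old fluid layers API and the project's fluid_sequence_* helpers. As a plain-Python reference, here is a minimal NumPy sketch of the same masked, normalized, temperature-softmax sampling; the helper name, the dense (batch, seq_len) layout and the constant values are assumptions, since the original works on 1-level lod tensors.

import numpy as np

def softmax_sampling_np(scores, mask, eta, safe_eps=1e-8, big_value=1e8):
    # scores, mask: (batch, seq_len); mask is 1.0 on valid slots, 0.0 on padding
    valid = mask.sum(axis=1, keepdims=True)
    mean = (scores * mask).sum(axis=1, keepdims=True) / valid
    centered = scores - mean
    scale = np.sqrt(((centered * mask) ** 2).sum(axis=1, keepdims=True))
    norm = centered / (scale + safe_eps)
    # push padded slots to a large negative value so the softmax ignores them
    logits = (norm * mask - (1.0 - mask) * big_value) / eta
    prob = np.exp(logits - logits.max(axis=1, keepdims=True)) * mask
    prob /= prob.sum(axis=1, keepdims=True)
    # one sampled index per row; padded slots carry zero probability
    return np.array([np.random.choice(prob.shape[1], p=p) for p in prob])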
Example #2
    def eps_greedy_sampling(self, scores, mask, eps):
        scores = scores * mask
        scores_padded = layers.squeeze(
            fluid_sequence_pad(scores, 0, maxlen=128),
            [2])  # (b*s, 1) -> (b, s, 1) -> (b, s)
        mask_padded = layers.squeeze(fluid_sequence_pad(mask, 0, maxlen=128),
                                     [2])
        seq_lens = fluid_sequence_get_seq_len(scores)

        def get_greedy_prob(scores_padded, mask_padded):
            # mask out padded positions, then put all greedy probability mass
            # on the row-wise maximum score
            s = scores_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE
            max_value = layers.reduce_max(s, dim=1, keep_dim=True)
            greedy_prob = layers.cast(s >= max_value, 'float32')
            return greedy_prob

        greedy_prob = get_greedy_prob(scores_padded, mask_padded)
        eps_prob = mask_padded * eps / layers.reduce_sum(
            mask_padded, dim=1, keep_dim=True)

        final_prob = (greedy_prob + eps_prob) * mask_padded
        final_prob = final_prob / layers.reduce_sum(
            final_prob, dim=1, keep_dim=True)

        sampled_id = layers.reshape(layers.sampling_id(final_prob), [-1, 1])
        max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
        sampled_id = layers.elementwise_min(sampled_id, max_id)
        return layers.cast(sampled_id, 'int64')
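
The same eps-greedy distribution in plain NumPy, for reference: the greedy mass goes to the row-wise argmax, and eps is spread uniformly over the valid positions before renormalizing. The dense layout and the helper name are again assumptions.

import numpy as np

def eps_greedy_sampling_np(scores, mask, eps, big_value=1e8):
    # scores, mask: (batch, seq_len); mask is 1.0 on valid slots, 0.0 on padding
    masked = scores - (1.0 - mask) * big_value      # padded slots can never win
    greedy = (masked >= masked.max(axis=1, keepdims=True)).astype('float32')
    uniform = mask * eps / mask.sum(axis=1, keepdims=True)
    prob = (greedy + uniform) * mask
    prob /= prob.sum(axis=1, keepdims=True)
    return np.array([np.random.choice(prob.shape[1], p=p) for p in prob])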
Example #3
def get_seq_len_mask(input, atten_input, max_seq_len,
                     max_atten_seq_len):
    ones = layers.reduce_sum(
        input, dim=1, keep_dim=True) * 0 + 1  # (batch*seq_len, 1)
    atten_ones = layers.reduce_sum(atten_input, dim=1,
                                   keep_dim=True) * 0 + 1
    ones_padded = fluid_sequence_pad(
        ones, 0, max_seq_len)  # (batch, seq_len, 1)
    atten_ones_padded = fluid_sequence_pad(atten_ones, 0,
                                           max_atten_seq_len)
    seq_len_mask = layers.matmul(
        ones_padded, layers.transpose(atten_ones_padded,
                                      perm=[0, 2, 1]))
    seq_len_mask.stop_gradient = True
    return seq_len_mask  # (batch, seq_len, atten_seq_len)
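
The matmul here is a batched outer product of two all-ones column vectors, so the result is 1 exactly where both the query position and the attended position fall inside their respective sequences. A small NumPy check of that construction (the concrete batch and lengths are made up for illustration):

import numpy as np

# batch of 2, padded to seq_len=4 and atten_seq_len=3
ones_padded = np.array([[[1.], [1.], [0.], [0.]],     # sequence lengths 2
                        [[1.], [1.], [1.], [0.]]])    # and 3
atten_ones_padded = np.array([[[1.], [0.], [0.]],     # attended lengths 1
                              [[1.], [1.], [0.]]])    # and 2

# (batch, seq_len, 1) @ (batch, 1, atten_seq_len) -> (batch, seq_len, atten_seq_len)
seq_len_mask = np.matmul(ones_padded, atten_ones_padded.transpose(0, 2, 1))
print(seq_len_mask[0])  # ones only in the top-left 2x1 block
print(seq_len_mask[1])  # ones only in the top-left 3x2 block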
Example #4
    def _attention_norm(self, is_test, atten_op, norm_op, input, atten_input,
                        max_seq_len, max_atten_seq_len, num_head):
        """
        If QK_type is relu or softsign, there is no need to pass maxlen to sequence_pad.
        args:
            input: (batch*seq_len, dim), 1-level lod tensor
            atten_input: (batch*atten_seq_len, dim), 1-level lod tensor
        returns:
            output: (batch*seq_len, dim), 1-level lod tensor
        """
        def get_seq_len_mask(input, atten_input, max_seq_len,
                             max_atten_seq_len):
            ones = layers.reduce_sum(
                input, dim=1, keep_dim=True) * 0 + 1  # (batch*seq_len, 1)
            atten_ones = layers.reduce_sum(atten_input, dim=1,
                                           keep_dim=True) * 0 + 1
            ones_padded = fluid_sequence_pad(
                ones, 0, max_seq_len)  # (batch, seq_len, 1)
            atten_ones_padded = fluid_sequence_pad(atten_ones, 0,
                                                   max_atten_seq_len)
            seq_len_mask = layers.matmul(
                ones_padded, layers.transpose(atten_ones_padded,
                                              perm=[0, 2, 1]))
            seq_len_mask.stop_gradient = True
            return seq_len_mask  # (batch, seq_len, atten_seq_len)

        seq_lens = fluid_sequence_get_seq_len(input)
        ### padding
        input_padded = fluid_sequence_pad(
            input, 0, max_seq_len)  # (batch, max_seq_len, dim)
        atten_input_padded = fluid_sequence_pad(
            atten_input, 0,
            max_atten_seq_len)  # (batch, max_recent_seq_len, dim)
        mask = get_seq_len_mask(input, atten_input, max_seq_len,
                                max_atten_seq_len)
        atten_out = atten_op(input_padded, atten_input_padded,
                             atten_input_padded, num_head, mask)
        ### flatten and unpad
        output = layers.sequence_unpad(atten_out, seq_lens)
        ### residual and normalize
        output = norm_op(input + output, is_test)
        output = layers.dropout(output,
                                dropout_prob=self._dropout_prob,
                                is_test=is_test)
        return output
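
atten_op and norm_op are supplied by the caller, so their internals are not part of this example. Assuming atten_op is a (multi-head) scaled dot-product attention that consumes the (batch, seq_len, atten_seq_len) mask built above, a single-head NumPy sketch of that contract could look like the following; the function name and signature are assumptions, not the project's API.

import numpy as np

def masked_attention(q, k, v, mask, big_value=1e8):
    # q: (batch, seq_len, dim); k, v: (batch, atten_seq_len, dim)
    # mask: (batch, seq_len, atten_seq_len), 1.0 where both positions are valid
    scores = np.matmul(q, k.transpose(0, 2, 1)) / np.sqrt(q.shape[-1])
    scores = scores - (1.0 - mask) * big_value      # block padded key positions
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True)) * mask
    weights = weights / (weights.sum(axis=-1, keepdims=True) + 1e-8)
    return np.matmul(weights, v)                    # (batch, seq_len, dim)

The wrapper then unpads the attention output, adds the residual, and applies norm_op and dropout, mirroring a standard post-norm Transformer block.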