import paddle.fluid.layers as layers

# `fluid_sequence_pad` and `fluid_sequence_get_seq_len` are helpers defined
# elsewhere in this repo; the import path below is assumed, adjust as needed.
from fluid_utils import fluid_sequence_pad, fluid_sequence_get_seq_len


def softmax_sampling(self, scores, mask, eta):
    """Sample one id per sequence from a temperature-softmax over the masked, normalized scores."""
    scores = scores * mask
    # (b*s, 1) -> (b, s, 1) -> (b, s)
    scores_padded = layers.squeeze(
        fluid_sequence_pad(scores, 0, maxlen=128), [2])
    mask_padded = layers.squeeze(
        fluid_sequence_pad(mask, 0, maxlen=128), [2])
    seq_lens = fluid_sequence_get_seq_len(scores)

    def normalize(scores_padded, mask_padded):
        # mean over the valid (unmasked) positions only
        mean_S = layers.reduce_sum(
            scores_padded, dim=1, keep_dim=True) / layers.reduce_sum(
                mask_padded, dim=1, keep_dim=True)
        S = scores_padded - mean_S
        std_S = layers.sqrt(
            layers.reduce_sum(
                layers.square(S * mask_padded), dim=1, keep_dim=True))
        return S / (std_S + self.SAFE_EPS)

    norm_S = normalize(scores_padded, mask_padded)
    # push masked-out (padded) positions to large negative values
    norm_S = norm_S * mask_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE

    soft_prob = layers.softmax(norm_S / eta) * mask_padded
    sampled_id = layers.reshape(layers.sampling_id(soft_prob), [-1, 1])
    # clip the sampled id so it never exceeds the true (unpadded) sequence length
    max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
    sampled_id = layers.elementwise_min(sampled_id, max_id)
    return layers.cast(sampled_id, 'int64')
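# A minimal NumPy sketch of the per-row computation above, for reference only:
# the fluid version runs batched inside a compiled graph, while this is an
# eager re-implementation of a single row under the same masking convention.
# The SAFE_EPS / BIG_VALUE defaults here are assumptions; use whatever values
# the class defines.
import numpy as np


def softmax_sampling_row(scores, mask, eta, safe_eps=1e-5, big_value=1e6,
                         rng=None):
    """scores, mask: 1-D float arrays; mask is 1 for valid slots, 0 for padding."""
    rng = rng or np.random.default_rng()
    scores = scores * mask
    mean = scores.sum() / mask.sum()                  # mean over valid positions
    centered = scores - mean
    scale = np.sqrt(np.sum((centered * mask) ** 2))   # same scale as std_S above
    norm = centered / (scale + safe_eps)
    norm = norm * mask - (1.0 - mask) * big_value     # masked slots -> very negative
    logits = norm / eta
    p = np.exp(logits - logits.max())                 # numerically stable softmax
    p = p * mask
    p = p / p.sum()
    return int(rng.choice(len(p), p=p))

# e.g. softmax_sampling_row(np.array([0.2, 1.5, 0.7, 0.0]),
#                           np.array([1.0, 1.0, 1.0, 0.0]), eta=0.5)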
def eps_greedy_sampling(self, scores, mask, eps):
    """Sample one id per sequence: mostly the greedy argmax, with eps probability mass spread uniformly over valid positions."""
    scores = scores * mask
    # (b*s, 1) -> (b, s, 1) -> (b, s)
    scores_padded = layers.squeeze(
        fluid_sequence_pad(scores, 0, maxlen=128), [2])
    mask_padded = layers.squeeze(
        fluid_sequence_pad(mask, 0, maxlen=128), [2])
    seq_lens = fluid_sequence_get_seq_len(scores)

    def get_greedy_prob(scores_padded, mask_padded):
        # mask out padded positions with large negative values, then mark
        # every position that attains the row maximum (ties share the mass)
        s = scores_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE
        max_value = layers.reduce_max(s, dim=1, keep_dim=True)
        greedy_prob = layers.cast(s >= max_value, 'float32')
        return greedy_prob

    greedy_prob = get_greedy_prob(scores_padded, mask_padded)
    # spread eps probability mass uniformly over the valid positions
    eps_prob = mask_padded * eps / layers.reduce_sum(
        mask_padded, dim=1, keep_dim=True)
    final_prob = (greedy_prob + eps_prob) * mask_padded
    final_prob = final_prob / layers.reduce_sum(
        final_prob, dim=1, keep_dim=True)

    sampled_id = layers.reshape(layers.sampling_id(final_prob), [-1, 1])
    # clip the sampled id so it never exceeds the true (unpadded) sequence length
    max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
    sampled_id = layers.elementwise_min(sampled_id, max_id)
    return layers.cast(sampled_id, 'int64')
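# The same epsilon-greedy distribution sketched in eager NumPy for one row,
# mirroring the construction above (greedy one-hot plus a uniform eps share,
# renormalized). BIG_VALUE default is an assumption.
def eps_greedy_row(scores, mask, eps, big_value=1e6, rng=None):
    """scores, mask: 1-D float arrays; mask is 1 for valid slots, 0 for padding."""
    rng = rng or np.random.default_rng()
    s = scores * mask - (1.0 - mask) * big_value
    greedy = (s >= s.max()).astype(float)     # one-hot; ties share the mass
    eps_prob = mask * eps / mask.sum()        # uniform eps mass over valid slots
    p = (greedy + eps_prob) * mask
    p = p / p.sum()
    return int(rng.choice(len(p), p=p))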
def get_seq_len_mask(input, atten_input, max_seq_len, max_atten_seq_len):
    """Build a (batch, seq_len, atten_seq_len) 0/1 validity mask from two 1-level lod tensors."""
    # (batch*seq_len, 1) tensor of ones, carrying the lod of `input`
    ones = layers.reduce_sum(input, dim=1, keep_dim=True) * 0 + 1
    atten_ones = layers.reduce_sum(atten_input, dim=1, keep_dim=True) * 0 + 1
    ones_padded = fluid_sequence_pad(ones, 0, max_seq_len)  # (batch, seq_len, 1)
    atten_ones_padded = fluid_sequence_pad(atten_ones, 0, max_atten_seq_len)
    # batched outer product of the two padded ones vectors: 1 where both
    # the query position and the attended position are valid
    seq_len_mask = layers.matmul(
        ones_padded, layers.transpose(atten_ones_padded, perm=[0, 2, 1]))
    seq_len_mask.stop_gradient = True
    return seq_len_mask  # (batch, seq_len, atten_seq_len)
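# Sketch of what get_seq_len_mask produces, using plain NumPy on explicit
# lengths instead of lod tensors: for each batch item the mask is the outer
# product of two padded "ones" vectors.
def seq_len_mask_np(seq_lens, atten_seq_lens, max_seq_len, max_atten_seq_len):
    batch = len(seq_lens)
    mask = np.zeros((batch, max_seq_len, max_atten_seq_len))
    for b, (m, n) in enumerate(zip(seq_lens, atten_seq_lens)):
        ones = np.zeros(max_seq_len)
        ones[:m] = 1.0
        atten_ones = np.zeros(max_atten_seq_len)
        atten_ones[:n] = 1.0
        mask[b] = np.outer(ones, atten_ones)  # 1 where both positions are valid
    return mask

# seq_len_mask_np([2, 3], [3, 1], 4, 4)[0] has a 2x3 block of ones, zeros elsewhere.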
def _attention_norm(self, is_test, atten_op, norm_op, input, atten_input,
                    max_seq_len, max_atten_seq_len, num_head):
    """
    If QK_type is relu or softsign, maxlen is not required for sequence_pad.
    args:
        input: (batch*seq_len, dim), 1-level lod tensor
        atten_input: (batch*atten_seq_len, dim), 1-level lod tensor
    returns:
        output: (batch*seq_len, dim), 1-level lod tensor
    """
    seq_lens = fluid_sequence_get_seq_len(input)

    ### padding
    input_padded = fluid_sequence_pad(
        input, 0, max_seq_len)  # (batch, max_seq_len, dim)
    atten_input_padded = fluid_sequence_pad(
        atten_input, 0, max_atten_seq_len)  # (batch, max_atten_seq_len, dim)
    # validity mask from the module-level helper above
    mask = get_seq_len_mask(input, atten_input, max_seq_len,
                            max_atten_seq_len)
    atten_out = atten_op(input_padded, atten_input_padded, atten_input_padded,
                         num_head, mask)

    ### flatten and unpad
    output = layers.sequence_unpad(atten_out, seq_lens)

    ### residual and normalize
    output = norm_op(input + output, is_test)
    output = layers.dropout(
        output, dropout_prob=self._dropout_prob, is_test=is_test)
    return output
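# A hypothetical call site for _attention_norm, to show how the pieces fit.
# The atten_op / norm_op names, their signatures, and the head/length values
# below are assumptions for illustration, not the repo's actual API:
#
#   output = self._attention_norm(
#       is_test=False,
#       atten_op=self._dot_product_attention,  # (q, k, v, num_head, mask) -> out
#       norm_op=self._layer_norm,              # (x, is_test) -> normalized x
#       input=item_repr,                       # (batch*seq_len, dim) lod tensor
#       atten_input=item_repr,                 # self-attention over the same seq
#       max_seq_len=128,
#       max_atten_seq_len=128,
#       num_head=4)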