import tensorflow as tf  # TF 1.x API

# `op` is the project's operations module, assumed to provide dot_sim,
# bilinear_sim, mask and weighted_sum (not redefined here).


def attention(Q, K, V,
              Q_lengths, K_lengths,
              attention_type='dot',
              is_mask=True, mask_value=-2**32 + 1,
              drop_prob=None):
    '''Add attention layer.

    Args:
        Q: a tensor with shape [batch, Q_time, Q_dimension]
        K: a tensor with shape [batch, K_time, K_dimension]
        V: a tensor with shape [batch, K_time, V_dimension]
        Q_lengths: a tensor with shape [batch]
        K_lengths: a tensor with shape [batch]

    Returns:
        a tensor with shape [batch, Q_time, V_dimension]

    Raises:
        AssertionError: if Q_dimension is not equal to K_dimension when
            attention_type is 'dot'.
    '''
    assert attention_type in ('dot', 'bilinear')
    if attention_type == 'dot':
        assert Q.shape[-1] == K.shape[-1]

    Q_time = Q.shape[1]
    K_time = K.shape[1]

    if attention_type == 'dot':
        logits = op.dot_sim(Q, K)       # [batch, Q_time, K_time]
    if attention_type == 'bilinear':
        logits = op.bilinear_sim(Q, K)  # [batch, Q_time, K_time]

    if is_mask:
        mask = op.mask(Q_lengths, K_lengths, Q_time, K_time)  # [batch, Q_time, K_time]
        # mask has the same shape as logits: real token positions are 1,
        # padded positions are 0. mask * logits keeps the logits of real
        # positions (element-wise product), and (1 - mask) * mask_value adds a
        # very large negative value at padded positions, so their weights
        # become ~0 after softmax. The same trick is used in BERT.
        logits = mask * logits + (1 - mask) * mask_value

    attention = tf.nn.softmax(logits)

    if drop_prob is not None:
        print('use attention drop')
        # Note: in TF 1.x, the second positional argument of tf.nn.dropout is
        # keep_prob, i.e. the probability of keeping a unit.
        attention = tf.nn.dropout(attention, drop_prob)

    return op.weighted_sum(attention, V)
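# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): how the masking trick
# above behaves numerically. Plain NumPy is used here, and the names
# toy_logits and toy_mask are made up for this example.
#
#     masked = mask * logits + (1 - mask) * mask_value
#
# gives padded key positions a huge negative logit, so softmax assigns them
# (numerically) zero attention weight.

import numpy as np

def _softmax(x, axis=-1):
    # numerically stable softmax
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

toy_logits = np.array([[2.0, 1.0, 0.5, 0.1]])  # one query row, K_time = 4
toy_mask = np.array([[1.0, 1.0, 0.0, 0.0]])    # last two key positions are padding
masked = toy_mask * toy_logits + (1 - toy_mask) * (-2.0**32 + 1)
print(_softmax(masked))  # ~[[0.73, 0.27, 0.0, 0.0]]: padding gets zero weight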
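# ---------------------------------------------------------------------------
# Minimal NumPy reference of the same computation for attention_type='dot'
# with masking, useful as a shape/behaviour check. This is a sketch under
# assumptions about the op module (op.dot_sim ~ batched Q @ K^T, op.mask ~
# outer product of the two sequence-length masks, op.weighted_sum ~ batched
# matmul of the attention weights with V); it is not the project's
# implementation, and attention_reference is a name introduced only here.

import numpy as np

def attention_reference(Q, K, V, Q_lengths, K_lengths, mask_value=-2.0**32 + 1):
    '''Q: [batch, Q_time, d], K: [batch, K_time, d], V: [batch, K_time, d_v],
    Q_lengths/K_lengths: [batch]. Returns [batch, Q_time, d_v].'''
    Q_time, K_time = Q.shape[1], K.shape[1]

    logits = np.einsum('bid,bjd->bij', Q, K)              # [batch, Q_time, K_time]

    # 1 at real token positions, 0 at padded positions.
    q_mask = (np.arange(Q_time)[None, :] < Q_lengths[:, None]).astype(Q.dtype)
    k_mask = (np.arange(K_time)[None, :] < K_lengths[:, None]).astype(Q.dtype)
    mask = np.einsum('bi,bj->bij', q_mask, k_mask)        # [batch, Q_time, K_time]

    logits = mask * logits + (1 - mask) * mask_value
    logits = logits - logits.max(axis=-1, keepdims=True)  # stable softmax
    weights = np.exp(logits)
    weights = weights / weights.sum(axis=-1, keepdims=True)

    return np.einsum('bij,bjd->bid', weights, V)          # [batch, Q_time, d_v]

if __name__ == '__main__':
    # Shape check with random inputs.
    Q = np.random.rand(2, 5, 8).astype('float32')
    K = np.random.rand(2, 7, 8).astype('float32')
    V = np.random.rand(2, 7, 8).astype('float32')
    out = attention_reference(Q, K, V, np.array([5, 3]), np.array([7, 4]))
    print(out.shape)  # (2, 5, 8)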