import tensorflow as tf
# `op` is the project's operations module (providing dot_sim, bilinear_sim,
# mask and weighted_sum); its import path depends on the project layout.

def attention(Q,
              K,
              V,
              Q_lengths,
              K_lengths,
              attention_type='dot',
              is_mask=True,
              mask_value=-2**32 + 1,
              drop_prob=None):
    '''Add attention layer.
    Args:
        Q: a tensor with shape [batch, Q_time, Q_dimension]
        K: a tensor with shape [batch, K_time, K_dimension]
        V: a tensor with shape [batch, K_time, V_dimension]

        Q_lengths: a tensor with shape [batch]
        K_lengths: a tensor with shape [batch]

    Returns:
        a tensor with shape [batch, Q_time, V_dimension]

    Raises:
        AssertionError: if Q_dimension is not equal to K_dimension
            when attention_type is 'dot'.
    '''
    assert attention_type in ('dot', 'bilinear')
    if attention_type == 'dot':
        assert Q.shape[-1] == K.shape[-1]

    Q_time = Q.shape[1]
    K_time = K.shape[1]

    if attention_type == 'dot':
        logits = op.dot_sim(Q, K)  # [batch, Q_time, K_time]
    elif attention_type == 'bilinear':
        logits = op.bilinear_sim(Q, K)  # [batch, Q_time, K_time]

    if is_mask:
        mask = op.mask(Q_lengths, K_lengths, Q_time,
                       K_time)  # [batch, Q_time, K_time]
        # mask has the same shape as logits: real token positions are 1 and
        # padded positions are 0. mask * logits is an element-wise product;
        # (1 - mask) * mask_value puts a very large negative value at the
        # padded positions, so they receive (near) zero weight after softmax.
        # A similar trick is used in BERT.
        logits = mask * logits + (1 - mask) * mask_value

    attention = tf.nn.softmax(logits)

    if drop_prob is not None:
        print('use attention drop')
        # note: in TF 1.x the second positional argument of tf.nn.dropout is
        # keep_prob (in TF 2.x it is rate), so check which one drop_prob means here.
        attention = tf.nn.dropout(attention, drop_prob)

    return op.weighted_sum(attention, V)
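The helpers op.dot_sim, op.bilinear_sim, op.mask and op.weighted_sum come from the project's own operations module, which is not shown here. The sketch below is a minimal, assumed reconstruction of what they compute for the shapes given in the docstring, written in plain TensorFlow; the project's real implementations may differ (e.g. in scaling or variable creation).

def dot_sim_sketch(Q, K):
    # [batch, Q_time, dim] x [batch, K_time, dim] -> [batch, Q_time, K_time]
    return tf.matmul(Q, K, transpose_b=True)

def bilinear_sim_sketch(Q, K, W):
    # W is assumed to be a trainable [Q_dim, K_dim] matrix: Q W K^T
    QW = tf.tensordot(Q, W, axes=[[2], [0]])   # [batch, Q_time, K_dim]
    return tf.matmul(QW, K, transpose_b=True)  # [batch, Q_time, K_time]

def mask_sketch(Q_lengths, K_lengths, Q_time, K_time):
    # 1.0 where both the query and the key position hold real tokens, 0.0 at padding.
    q_mask = tf.cast(tf.sequence_mask(Q_lengths, Q_time), tf.float32)  # [batch, Q_time]
    k_mask = tf.cast(tf.sequence_mask(K_lengths, K_time), tf.float32)  # [batch, K_time]
    return tf.expand_dims(q_mask, 2) * tf.expand_dims(k_mask, 1)       # [batch, Q_time, K_time]

def weighted_sum_sketch(attention, V):
    # [batch, Q_time, K_time] x [batch, K_time, V_dim] -> [batch, Q_time, V_dim]
    return tf.matmul(attention, V)

With these stand-ins, mask * logits + (1 - mask) * mask_value is exactly the masked-softmax trick described in the comments above.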
Example #2
def attention(Q, K, V, attention_type='dot', drop_prob=None):
    '''Add an attention layer without length masking; argument shapes as in the example above.'''
    assert attention_type in ('dot', 'bilinear')
    if attention_type == 'dot':
        assert Q.shape[-1] == K.shape[-1]

    Q_time = Q.shape[1]
    K_time = K.shape[1]

    if attention_type == 'dot':
        logits = op.dot_sim(Q, K)  # [batch, Q_time, K_time]
    elif attention_type == 'bilinear':
        logits = op.bilinear_sim(Q, K)  # [batch, Q_time, K_time]

    attention = tf.nn.softmax(logits)

    if drop_prob is not None:
        attention = tf.nn.dropout(attention, drop_prob)

    return op.weighted_sum(attention, V)
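A small, hypothetical shape check for the simplified variant above; the tensors and sizes are made up for illustration, and op must still provide the similarity and weighted-sum helpers.

# Hypothetical shapes: batch=2, Q_time=5, K_time=7, dimension=16.
Q = tf.ones([2, 5, 16])
K = tf.ones([2, 7, 16])
V = tf.ones([2, 7, 16])

context = attention(Q, K, V, attention_type='dot')
# context has shape [2, 5, 16]; under TF 1.x graph mode, evaluate it in a tf.Session.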