Example #1
def argmax_attentive_matching(a,
                              b,
                              a_lengths,
                              b_lengths,
                              max_seq_len,
                              attention_func=dot_attention,
                              attention_func_kwargs={}):
    """
    Matches each vector in a with the vector in b that receives the largest attention
    weight (a hard, argmax version of attentive matching).  The attention matrix is
    computed using attention_func.
    Args:
        a: Input sequence a.  Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b.  Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a.  Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b.  Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b.  Integer.
        attention_func: Function used to calculate attention matrix.  Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.
    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.
    """
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len,
                          **attention_func_kwargs)
    b_match_idx = tf.argmax(attn, axis=2)
    batch_index = tf.tile(
        tf.expand_dims(tf.range(shape(b, 0), dtype=tf.int64), 1),
        (1, max_seq_len))
    b_idx = tf.stack([batch_index, b_match_idx], axis=2)
    return tf.gather_nd(b, b_idx)
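The batch-wise argmax gather above is the only non-obvious step. Below is a minimal NumPy sketch of the same indexing, with shapes and values invented purely for illustration:

import numpy as np

batch_size, max_seq_len, input_size = 2, 3, 4
b = np.random.rand(batch_size, max_seq_len, input_size)
attn = np.random.rand(batch_size, max_seq_len, max_seq_len)

b_match_idx = attn.argmax(axis=2)                      # [batch_size, max_seq_len]
batch_index = np.tile(np.arange(batch_size)[:, None],  # [batch_size, max_seq_len]
                      (1, max_seq_len))
# Equivalent to tf.gather_nd(b, tf.stack([batch_index, b_match_idx], axis=2))
matched = b[batch_index, b_match_idx]
assert matched.shape == (batch_size, max_seq_len, input_size)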
Example #2
def dense_layer(inputs,
                output_units,
                bias=True,
                activation=None,
                dropout=None,
                scope='dense-layer',
                reuse=False):
    """
    Applies a dense layer to a 2D tensor of shape [batch_size, input_units]
    to produce a tensor of shape [batch_size, output_units].
    Args:
        inputs: Tensor of shape [batch_size, input_units].
        output_units: Number of output units.  Integer.
        activation: Activation function to apply (None for a linear layer).
        dropout: Dropout keep probability (None to disable dropout).
    Returns:
        Tensor of shape [batch_size, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units])
        z = tf.matmul(inputs, W)
        if bias:
            b = tf.get_variable(name='biases',
                                initializer=tf.constant_initializer(),
                                shape=[output_units])
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z
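dense_layer relies on a shape helper that is defined elsewhere in the source module and not shown in this excerpt. A minimal sketch of how it could look (static shapes only, TensorFlow 1.x assumed), together with a hypothetical usage example:

import tensorflow as tf  # TensorFlow 1.x API (tf.contrib, tf.get_variable)

def shape(tensor, dim=None):
    # Assumed helper: returns the static shape as a list, or a single dimension of it.
    s = tensor.get_shape().as_list()
    return s if dim is None else s[dim]

x = tf.placeholder(tf.float32, [None, 128])
h = dense_layer(x, output_units=64, activation=tf.nn.relu, dropout=0.8)
# h has shape [None, 64]; note that dropout is a keep probability, not a drop rate.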
Example #3
def time_distributed_dense_layer(inputs,
                                 output_units,
                                 bias=True,
                                 activation=None,
                                 dropout=None,
                                 scope='time-distributed-dense-layer',
                                 reuse=False):
    """
    Applies a shared dense layer to each timestep of a tensor of shape [batch_size, max_seq_len, input_units]
    to produce a tensor of shape [batch_size, max_seq_len, output_units].
    Args:
        inputs: Tensor of shape [batch_size, max_seq_len, input_units].
        output_units: Number of output units.  Integer.
        activation: Activation function to apply (None for a linear layer).
        dropout: Dropout keep probability (None to disable dropout).
    Returns:
        Tensor of shape [batch_size, max_seq_len, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units])
        z = tf.einsum('ijk,kl->ijl', inputs, W)
        if bias:
            b = tf.get_variable(name='biases',
                                initializer=tf.constant_initializer(),
                                shape=[output_units])
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z
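The einsum 'ijk,kl->ijl' applies the same weight matrix to every timestep. As a quick illustration (NumPy, invented shapes), it is equivalent to flattening the batch and time axes, doing a plain matmul, and reshaping back:

import numpy as np

batch_size, max_seq_len, input_units, output_units = 2, 5, 3, 4
x = np.random.rand(batch_size, max_seq_len, input_units)
W = np.random.rand(input_units, output_units)

via_einsum = np.einsum('ijk,kl->ijl', x, W)
via_matmul = (x.reshape(-1, input_units) @ W).reshape(batch_size, max_seq_len, output_units)
assert np.allclose(via_einsum, via_matmul)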
Example #4
def temporal_convolution_layer(inputs,
                               output_units,
                               convolution_width,
                               bias=True,
                               activation=None,
                               dropout=None,
                               scope='time-distributed-conv-layer',
                               reuse=False):
    """
    Convolution over the temporal axis of sequence data.
    Args:
        inputs: Tensor of shape [batch_size, max_seq_len, input_units].
        output_units: Number of output channels for the convolution.  Integer.
        convolution_width: Number of timesteps (words) covered by each filter.  Integer.
    Returns:
        Tensor of shape [batch_size, max_seq_len, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[convolution_width,
                   shape(inputs, 2), output_units])

        z = tf.nn.convolution(inputs, W, padding='SAME', strides=[1])
        if bias:
            b = tf.get_variable(name='biases',
                                initializer=tf.constant_initializer(),
                                shape=[output_units])
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z
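With padding='SAME' and stride 1, the convolution keeps the time dimension at max_seq_len. Here is a NumPy-only sketch of the same computation for a single sequence, with shapes invented for illustration (TensorFlow pads (width - 1) // 2 zeros at the start and the remainder at the end):

import numpy as np

max_seq_len, input_units, output_units, width = 6, 3, 2, 3
x = np.random.rand(max_seq_len, input_units)           # one sequence, no batch axis
W = np.random.rand(width, input_units, output_units)   # same layout as the variable above

pad = width - 1
x_padded = np.pad(x, ((pad // 2, pad - pad // 2), (0, 0)))
out = np.stack([np.einsum('wi,wio->o', x_padded[t:t + width], W)
                for t in range(max_seq_len)])
assert out.shape == (max_seq_len, output_units)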
Example #5
def concat_attention(a,
                     b,
                     a_lengths,
                     b_lengths,
                     max_seq_len,
                     hidden_units=150,
                     scope='concat-attention',
                     reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*[a_i; b_j])).  v is a learnable vector and W is a learnable
    matrix.  The rows of attn are softmax normalized.
    Args:
        a: Input sequence a.  Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b.  Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a.  Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b.  Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b.  Integer.
        hidden_units: Number of hidden units.  Integer.
    Returns:
        Attention matrix.  Tensor of shape [batch_size, max_seq_len, max_seq_len].
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Pair every timestep of a with every timestep of b by tiling both to
        # [batch_size, max_seq_len, max_seq_len, input_size] before concatenation.
        seq_len = a.shape[1]
        a = tf.tile(tf.expand_dims(a, 2), [1, 1, seq_len, 1])
        b = tf.tile(tf.expand_dims(b, 1), [1, seq_len, 1, 1])
        c = tf.concat([a, b], axis=3)
        W = tf.get_variable(
            name='matmul_weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(c, -1), hidden_units])
        cW = tf.einsum('ijkl,lm->ijkm', c, W)
        v = tf.get_variable(name='dot_weights',
                            initializer=tf.ones_initializer(),
                            shape=[hidden_units])
        logits = tf.einsum('ijkl,l->ijk', tf.nn.tanh(cW), v)
        # Masked softmax over the b axis: subtract the row max for numerical
        # stability, zero out padded positions, then renormalize.
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)
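mask_attention_weights is referenced above but not included in this excerpt. A plausible sketch, assuming it zeros out attention weights that fall outside the true lengths of either sequence (TensorFlow 1.x):

import tensorflow as tf

def mask_attention_weights(weights, a_lengths, b_lengths, max_seq_len):
    # Assumed implementation: build a [batch_size, max_seq_len, max_seq_len] mask
    # that is 1 where both the a position and the b position lie within the true
    # sequence lengths, and 0 elsewhere.
    a_mask = tf.expand_dims(tf.sequence_mask(a_lengths, maxlen=max_seq_len), 2)
    b_mask = tf.expand_dims(tf.sequence_mask(b_lengths, maxlen=max_seq_len), 1)
    seq_mask = tf.cast(tf.logical_and(a_mask, b_mask), tf.float32)
    return weights * seq_mask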