def argmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                              attention_func_kwargs={}):
    """
    Matches each timestep of a with the single vector in b that receives the highest attention
    weight, where the attention weights are computed using attention_func.

    Args:
        a: Input sequence a.  Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b.  Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a.  Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b.  Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b.  Integer.
        attention_func: Function used to calculate the attention matrix.  Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention, or
            cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vector for
        each timestep in a.

    """
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **attention_func_kwargs)
    # For each timestep of a, take the index of the timestep of b with the largest attention weight.
    b_match_idx = tf.argmax(attn, axis=2)
    # Pair each argmax index with its batch index so gather_nd can pull the matching vectors from b.
    batch_index = tf.tile(tf.expand_dims(tf.range(shape(b, 0), dtype=tf.int64), 1), (1, max_seq_len))
    b_idx = tf.stack([batch_index, b_match_idx], axis=2)
    return tf.gather_nd(b, b_idx)
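
# Usage sketch (illustrative, not part of the original module): assumes TensorFlow 1.x, the
# shape() helper and dot_attention defined elsewhere in this module, and example dimensions
# batch_size=None (dynamic), max_seq_len=40, input_size=300.
#
#   a = tf.placeholder(tf.float32, [None, 40, 300])
#   b = tf.placeholder(tf.float32, [None, 40, 300])
#   a_lengths = tf.placeholder(tf.int32, [None])
#   b_lengths = tf.placeholder(tf.int32, [None])
#   matched = argmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len=40)
#   # matched has shape [batch_size, 40, 300]: for each timestep of a, the single vector
#   # of b with the highest attention weight under dot_attention.
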
def dense_layer(inputs, output_units, bias=True, activation=None, dropout=None, scope='dense-layer',
                reuse=False):
    """
    Applies a dense layer to a 2D tensor of shape [batch_size, input_units]
    to produce a tensor of shape [batch_size, output_units].

    Args:
        inputs: Tensor of shape [batch size, input_units].
        output_units: Number of output units.
        activation: Activation function.
        dropout: Dropout keep prob.

    Returns:
        Tensor of shape [batch size, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units]
        )
        z = tf.matmul(inputs, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z
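
# Usage sketch (illustrative, assumed shapes): a single projection from 300 to 150 units with
# ReLU and dropout keep probability 0.8.
#
#   x = tf.placeholder(tf.float32, [None, 300])
#   h = dense_layer(x, 150, activation=tf.nn.relu, dropout=0.8, scope='fc1')
#   # h has shape [batch_size, 150]; pass reuse=True to share the 'fc1' weights elsewhere.
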
def time_distributed_dense_layer(inputs, output_units, bias=True, activation=None, dropout=None,
                                 scope='time-distributed-dense-layer', reuse=False):
    """
    Applies a shared dense layer to each timestep of a tensor of shape
    [batch_size, max_seq_len, input_units] to produce a tensor of shape
    [batch_size, max_seq_len, output_units].

    Args:
        inputs: Tensor of shape [batch size, max sequence length, ...].
        output_units: Number of output units.
        activation: Activation function.
        dropout: Dropout keep prob.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units]
        )
        z = tf.einsum('ijk,kl->ijl', inputs, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z
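
# Usage sketch (illustrative, assumed shapes): the same projection applied independently at
# every timestep of a padded sequence batch.
#
#   seqs = tf.placeholder(tf.float32, [None, 40, 300])
#   h = time_distributed_dense_layer(seqs, 150, activation=tf.nn.tanh, scope='td-fc1')
#   # h has shape [batch_size, 40, 150]; the weights are shared across timesteps via the
#   # 'ijk,kl->ijl' einsum contraction.
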
def temporal_convolution_layer(inputs, output_units, convolution_width, bias=True, activation=None,
                               dropout=None, scope='time-distributed-conv-layer', reuse=False):
    """
    Convolution over the temporal axis of sequence data.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, input_units].
        output_units: Output channels for convolution.
        convolution_width: Number of timesteps (words) to use in convolution.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[convolution_width, shape(inputs, 2), output_units]
        )
        z = tf.nn.convolution(inputs, W, padding='SAME', strides=[1])
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z
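
# Usage sketch (illustrative, assumed shapes): a width-3 convolution over the time axis, e.g.
# to build trigram features over word embeddings.
#
#   embeddings = tf.placeholder(tf.float32, [None, 40, 300])
#   conv = temporal_convolution_layer(embeddings, 64, convolution_width=3,
#                                     activation=tf.nn.relu, scope='conv-w3')
#   # conv has shape [batch_size, 40, 64]; 'SAME' padding preserves the sequence length.
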
def concat_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150,
                     scope='concat-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*[a_i; b_j])).  v is a learnable vector and W is a learnable
    matrix.  The rows of attn are softmax normalized.

    Args:
        a: Input sequence a.  Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b.  Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a.  Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b.  Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b.  Integer.
        hidden_units: Number of hidden units.  Integer.

    Returns:
        Attention matrix.  Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        seq_len = a.shape[1]
        # Form every pairwise concatenation [a_i; b_j] by tiling a along axis 2 and b along axis 1.
        a = tf.tile(tf.expand_dims(a, 2), [1, 1, seq_len, 1])
        b = tf.tile(tf.expand_dims(b, 1), [1, seq_len, 1, 1])
        c = tf.concat([a, b], axis=3)
        W = tf.get_variable(
            name='matmul_weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(c, -1), hidden_units]
        )
        cW = tf.einsum('ijkl,lm->ijkm', c, W)
        v = tf.get_variable(
            name='dot_weights',
            initializer=tf.ones_initializer(),
            shape=[hidden_units]
        )
        logits = tf.einsum('ijkl,l->ijk', tf.nn.tanh(cW), v)
        # Numerically stable masked softmax over each row of the attention matrix.
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)
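
# Usage sketch (illustrative, assumed shapes): computing a concat-attention matrix between two
# padded sequence batches, and plugging concat_attention into argmax_attentive_matching above.
#
#   a = tf.placeholder(tf.float32, [None, 40, 300])
#   b = tf.placeholder(tf.float32, [None, 40, 300])
#   a_lengths = tf.placeholder(tf.int32, [None])
#   b_lengths = tf.placeholder(tf.int32, [None])
#   attn = concat_attention(a, b, a_lengths, b_lengths, max_seq_len=40, hidden_units=150)
#   # attn has shape [batch_size, 40, 40]; row i holds the masked, softmax-normalized weights
#   # of each timestep of b with respect to timestep i of a.
#   matched = argmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len=40,
#                                       attention_func=concat_attention)
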