def test_ffnn(self):
  with tf.Graph().as_default():
    input_emb = tf.random_uniform([3, 5, 8])
    output_emb = common_layers.ffnn(
        input_emb=input_emb,
        hidden_sizes=[7, 9],
        dropout_ratio=0.2,
        mode=tf.estimator.ModeKeys.TRAIN)
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      actual_output_emb = sess.run(output_emb)
    self.assertAllEqual(actual_output_emb.shape, [3, 5, 9])
def decomposable_attention(emb1, len1, emb2, len2, hidden_size, hidden_layers,
                           dropout_ratio, mode, epsilon=1e-8):
  """Computes decomposable attention between two sequences.

  See https://arxiv.org/abs/1606.01933 (Parikh et al., 2016).

  Args:
    emb1: A Tensor with shape [batch_size, max_len1, emb_size] representing the
      first input sequence.
    len1: A Tensor with shape [batch_size], indicating the true sequence length
      of `emb1`. This is required due to padding.
    emb2: A Tensor with shape [batch_size, max_len2, emb_size] representing the
      second input sequence.
    len2: A Tensor with shape [batch_size], indicating the true sequence length
      of `emb2`. This is required due to padding.
    hidden_size: An integer indicating the size of each hidden layer in the
      feed-forward neural networks.
    hidden_layers: An integer indicating the number of hidden layers in the
      feed-forward neural networks.
    dropout_ratio: The probability of dropping out each unit in the activation.
      This can be None, and is only applied during training.
    mode: One of the keys from tf.estimator.ModeKeys.
    epsilon: A small positive constant to add to masks for numerical stability.

  Returns:
    final_emb: A Tensor with shape [batch_size, hidden_size].
  """
  # [batch_size, maxlen1]
  mask1 = tf.sequence_mask(len1, tensor_utils.shape(emb1, 1), dtype=tf.float32)

  # [batch_size, maxlen2]
  mask2 = tf.sequence_mask(len2, tensor_utils.shape(emb2, 1), dtype=tf.float32)

  # Project both sequences with the same FFNN (shared via variable reuse).
  with tf.variable_scope("attend"):
    projected_emb1 = common_layers.ffnn(
        emb1, [hidden_size] * hidden_layers, dropout_ratio, mode)
  with tf.variable_scope("attend", reuse=True):
    projected_emb2 = common_layers.ffnn(
        emb2, [hidden_size] * hidden_layers, dropout_ratio, mode)

  # [batch_size, maxlen1, maxlen2]
  attention_scores = tf.matmul(
      projected_emb1, projected_emb2, transpose_b=True)

  # Adding log(mask + epsilon) pushes the scores of padded positions toward
  # -inf, so they receive ~zero weight after the softmax.
  attention_weights1 = tf.nn.softmax(
      attention_scores + tf.log(tf.expand_dims(mask2, 1) + epsilon), 2)
  attention_weights2 = tf.nn.softmax(
      attention_scores + tf.log(tf.expand_dims(mask1, 2) + epsilon), 1)

  # [batch_size, maxlen1, emb_size]
  attended_emb1 = tf.matmul(attention_weights1, emb2)

  # [batch_size, maxlen2, emb_size]
  attended_emb2 = tf.matmul(attention_weights2, emb1, transpose_a=True)

  # Compare each position with its attended counterpart, again sharing the
  # FFNN weights between the two sequences.
  with tf.variable_scope("compare"):
    compared_emb1 = common_layers.ffnn(
        tf.concat([emb1, attended_emb1], -1),
        [hidden_size] * hidden_layers, dropout_ratio, mode)
  with tf.variable_scope("compare", reuse=True):
    compared_emb2 = common_layers.ffnn(
        tf.concat([emb2, attended_emb2], -1),
        [hidden_size] * hidden_layers, dropout_ratio, mode)

  # Zero out padded positions before summing over the sequence dimension.
  compared_emb1 *= tf.expand_dims(mask1, -1)
  compared_emb2 *= tf.expand_dims(mask2, -1)

  # [batch_size, hidden_size]
  aggregated_emb1 = tf.reduce_sum(compared_emb1, 1)
  aggregated_emb2 = tf.reduce_sum(compared_emb2, 1)

  with tf.variable_scope("aggregate"):
    final_emb = common_layers.ffnn(
        tf.concat([aggregated_emb1, aggregated_emb2], -1),
        [hidden_size] * hidden_layers, dropout_ratio, mode)
  return final_emb
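
# A minimal shape-check sketch for `decomposable_attention`, written in the
# same style as `test_ffnn` above. This is an illustrative usage example, not
# a test from the original suite; the batch size, sequence lengths, and layer
# sizes below are arbitrary assumed values.
def test_decomposable_attention(self):
  with tf.Graph().as_default():
    # Two padded sequences with different max lengths but the same emb_size.
    emb1 = tf.random_uniform([3, 5, 8])
    len1 = tf.constant([5, 4, 2])
    emb2 = tf.random_uniform([3, 7, 8])
    len2 = tf.constant([7, 1, 6])
    final_emb = decomposable_attention(
        emb1=emb1,
        len1=len1,
        emb2=emb2,
        len2=len2,
        hidden_size=9,
        hidden_layers=2,
        dropout_ratio=0.2,
        mode=tf.estimator.ModeKeys.TRAIN)
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      actual_final_emb = sess.run(final_emb)
    # The aggregate FFNN projects down to hidden_size regardless of the
    # input sequence lengths.
    self.assertAllEqual(actual_final_emb.shape, [3, 9])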