# Assumed module-level context for the functions below (import paths and the
# `initializer` / EPSILON definitions are repo-specific):
import tensorflow as tf
import numpy as np
from tensorflow.contrib import layers
# Project-local helpers referenced throughout (assumed importable in this repo):
# qanet_layers, common_utils, layer_utils, match_utils, slstm_utils,
# esim_utils, drcn_utils, self_attn, label_network_utils,
# plus a module-level `initializer` and an EPSILON constant.


def bilinear_attention(query, context, query_mask, context_mask,
                       dropout_ratio, scope, reuse=None):
    with tf.variable_scope(scope + "_Context_to_Query_Attention_Layer", reuse=reuse):
        context_ = tf.transpose(context, [0, 2, 1])
        hidden_dim = query.get_shape()[-1]
        attn_W = tf.get_variable("AttnW", dtype=tf.float32,
                                 shape=[hidden_dim, hidden_dim],
                                 initializer=initializer)
        weighted_query = tf.tensordot(query, attn_W, axes=[[2], [0]])
        # batch x q_len x c_len
        S = tf.matmul(weighted_query, context_)
        mask_q = tf.expand_dims(query_mask, 1)
        mask_c = tf.expand_dims(context_mask, 1)
        S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
        c2q = tf.matmul(S_, context)
        S_T = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        q2c = tf.matmul(S_T, query)
        return c2q, q2c

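# Minimal shape-check sketch for bilinear_attention (hypothetical sizes;
# assumes `initializer` and `qanet_layers` are available as noted above):
def _bilinear_attention_demo():
    batch, q_len, c_len, dim = 2, 5, 7, 16
    query = tf.random_normal([batch, q_len, dim])
    context = tf.random_normal([batch, c_len, dim])
    query_mask = tf.ones([batch, q_len], dtype=tf.float32)
    context_mask = tf.ones([batch, c_len], dtype=tf.float32)
    c2q, q2c = bilinear_attention(query, context, query_mask, context_mask,
                                  dropout_ratio=0.1, scope="demo")
    # c2q: batch x q_len x dim (context summarized per query position)
    # q2c: batch x c_len x dim (query summarized per context position)
    return c2q, q2c
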
def dot_attention(query, context, query_mask, context_mask,
                  dropout_ratio, scope, reuse=None):
    hidden_dim = query.get_shape()[-1]
    Wd = tf.get_variable("Wd", dtype=tf.float32,
                         shape=[hidden_dim, hidden_dim],
                         initializer=initializer)
    Vd = tf.get_variable("Vd", dtype=tf.float32,
                         shape=[hidden_dim, 1],
                         initializer=initializer)
    # batch x len_query x 1 x hidden_dim
    query_ = tf.expand_dims(query, 2)
    # batch x 1 x len_context x hidden_dim
    context_ = tf.expand_dims(context, 1)
    # batch x len_query x len_context x hidden_dim
    dot_attention = query_ * context_
    dot_attention = tf.einsum("abcd,de->abce", dot_attention, Wd)
    dot_attention = tf.einsum("abce,ef->abcf", dot_attention, Vd)
    # batch x len_query x len_context
    S = tf.squeeze(dot_attention, -1)
    mask_q = tf.expand_dims(query_mask, 1)  # batch x 1 x query_len
    mask_c = tf.expand_dims(context_mask, 1)  # batch x 1 x context_len
    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
    c2q = tf.matmul(S_, context)
    S_T = tf.nn.softmax(
        qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
    q2c = tf.matmul(S_T, query)
    return c2q, q2c

def self_attention(query, context, query_mask, context_mask,
                   dropout_ratio, scope, reuse=None):
    hidden_dim = query.get_shape()[-1]
    Wq_1 = tf.get_variable("Wq_1", dtype=tf.float32,
                           shape=[hidden_dim, hidden_dim],
                           initializer=initializer)
    Vq = tf.get_variable("Vq", dtype=tf.float32,
                         shape=[hidden_dim, 1],
                         initializer=initializer)
    Wp_1 = tf.get_variable("Wp_1", dtype=tf.float32,
                           shape=[hidden_dim, hidden_dim],
                           initializer=initializer)
    Wp_2 = tf.get_variable("Wp_2", dtype=tf.float32,
                           shape=[hidden_dim, 1],
                           initializer=initializer)
    # self-attentive pooling over the query:
    # S = Vq^T tanh(query * Wq_1)
    # S = tf.matmul(tf.nn.tanh(tf.matmul(query, Wq_1)), Vq_1)
    S = tf.nn.tanh(tf.einsum("abc,cd->abd", query, Wq_1))
    S = tf.einsum("abd,de->abe", S, Vq)
    S = tf.squeeze(S, -1)  # batch x query_len
    mask_q = query_mask  # batch x query_len
    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_q))
    S_ = tf.expand_dims(S_, axis=-1)  # batch x query_len x 1
    # batch x 1 x hidden_dim
    query_attn = tf.reduce_sum(S_ * query, axis=1, keepdims=True)
    # query-aware scoring of the context (a single tanh over the summed
    # projections, with Wp_2 as the output projection):
    # S = Wp_2^T tanh(context * Wp_1 + query_attn * Wp_1)
    S = tf.nn.tanh(tf.einsum("abc,cd->abd", context, Wp_1)
                   + tf.einsum("abc,cd->abd", query_attn, Wp_1))
    S = tf.einsum("abd,de->abe", S, Wp_2)  # batch x context_len x 1
    S = tf.squeeze(S, -1)  # batch x context_len
    mask_c = context_mask  # batch x context_len
    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
    S_ = tf.expand_dims(S_, axis=-1)  # batch x context_len x 1
    # batch x 1 x hidden_dim
    context_attn = tf.reduce_sum(S_ * context, axis=1, keepdims=True)
    context_attn = tf.squeeze(context_attn, axis=1)  # batch x hidden_dim
    return context_attn

def build_encoder(self, input_lengths, input_mask, *args, **kargs):
    reuse = kargs["reuse"]
    word_emb, word_drop_mask = self.build_emebdding(*args, **kargs)
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_drop_mask = tf.cast(word_drop_mask, tf.float32)
    word_drop_mask = tf.squeeze(word_drop_mask, axis=-1)
    input_mask = tf.cast(input_mask, tf.float32)
    input_mask *= word_drop_mask
    word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)
    # masked max-pooling over time
    H_enc_2 = tf.reduce_max(qanet_layers.mask_logits(
        word_emb, tf.expand_dims(input_mask, -1)), axis=1)
    # masked mean-pooling over time
    input_mask = tf.expand_dims(input_mask, -1)
    H_enc_1 = tf.reduce_sum(word_emb * input_mask, 1)
    H_enc_1 = tf.div(H_enc_1, tf.reduce_sum(input_mask, axis=1) + EPSILON)
    H_enc = tf.concat([H_enc_1, H_enc_2], 1)
    return H_enc

def query_context_attention(query, context, max_query_len, max_context_len,
                            query_mask, context_mask, dropout_ratio,
                            scope, reuse=None):
    with tf.variable_scope(scope + "_Context_to_Query_Attention_Layer", reuse=reuse):
        context_ = tf.transpose(context, [0, 2, 1])
        hidden_dim = query.get_shape()[-1]
        attn_W = tf.get_variable("AttnW", dtype=tf.float32,
                                 shape=[hidden_dim, hidden_dim],
                                 initializer=initializer)
        weighted_query = tf.tensordot(query, attn_W, axes=[[2], [0]])
        # batch x q_len x c_len
        S = tf.matmul(weighted_query, context_)
        mask_q = tf.expand_dims(query_mask, 1)
        mask_c = tf.expand_dims(context_mask, 1)
        S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
        c2q = tf.matmul(S_, context)
        S_T = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        q2c = tf.matmul(S_T, query)
        query_attention_outputs = tf.concat(
            [query, c2q, query - c2q, query * c2q], axis=-1)
        query_attention_outputs *= tf.expand_dims(
            tf.cast(query_mask, tf.float32), -1)
        context_attention_outputs = tf.concat(
            [context, q2c, context - q2c, context * q2c], axis=-1)
        context_attention_outputs *= tf.expand_dims(
            tf.cast(context_mask, tf.float32), -1)
        return query_attention_outputs, context_attention_outputs

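# Shape note for query_context_attention, as a sketch under the same module
# assumptions as above: the [x, att, x - att, x * att] concatenation
# quadruples the feature dimension, so downstream layers should expect
# 4 * hidden_dim features.
def _query_context_attention_demo():
    batch, q_len, c_len, dim = 2, 4, 6, 8
    query = tf.random_normal([batch, q_len, dim])
    context = tf.random_normal([batch, c_len, dim])
    q_mask = tf.ones([batch, q_len], dtype=tf.float32)
    c_mask = tf.ones([batch, c_len], dtype=tf.float32)
    q_out, c_out = query_context_attention(query, context, q_len, c_len,
                                           q_mask, c_mask, 0.1, "demo")
    # q_out: batch x q_len x 4*dim, c_out: batch x c_len x 4*dim
    return q_out, c_out
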
def concat_attention(query, context, query_mask, context_mask,
                     dropout_ratio, scope, reuse=None):
    hidden_dim = query.get_shape()[-1]
    Wc_1 = tf.get_variable("Wc_1", dtype=tf.float32,
                           shape=[hidden_dim, hidden_dim],
                           initializer=initializer)
    Wc_2 = tf.get_variable("Wc_2", dtype=tf.float32,
                           shape=[hidden_dim, hidden_dim],
                           initializer=initializer)
    Vc = tf.get_variable("Vc", dtype=tf.float32,
                         shape=[hidden_dim, 1],
                         initializer=initializer)
    # batch x len x hidden_dim
    attention_1 = tf.einsum("abc,cd->abd", query, Wc_1)
    attention_2 = tf.einsum("abc,cd->abd", context, Wc_2)
    # concat attention
    # batch x len_query x 1 x hidden_dim
    attention_1 = tf.expand_dims(attention_1, 2)
    # batch x 1 x len_context x hidden_dim
    attention_2 = tf.expand_dims(attention_2, 1)
    # batch x len_query x len_context x hidden_dim
    attention = tf.nn.tanh(attention_1 + attention_2)
    # batch x len_query x len_context x 1
    S = tf.einsum("abcd,de->abce", attention, Vc)
    S = tf.squeeze(S, -1)  # batch x len_query x len_context
    mask_q = tf.expand_dims(query_mask, 1)  # batch x 1 x query_len
    mask_c = tf.expand_dims(context_mask, 1)  # batch x 1 x context_len
    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
    c2q = tf.matmul(S_, context)
    S_T = tf.nn.softmax(
        qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
    q2c = tf.matmul(S_T, query)
    return c2q, q2c

def trilinear_attention(ques_emb, context_emb, ques_mask, context_mask,
                        dropout_keep_prob, config):
    attention_outputs = []
    # batch x c_len x q_len x dim views of context and question
    C = tf.tile(tf.expand_dims(context_emb, 2), [1, 1, config.max_q_len, 1])
    Q = tf.tile(tf.expand_dims(ques_emb, 1), [1, config.max_p_len, 1, 1])
    S = qanet_layers.trilinear([C, Q, C * Q],
                               input_keep_prob=1.0 - dropout_keep_prob)
    mask_q = tf.expand_dims(ques_mask, 1)
    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_q))
    mask_c = tf.expand_dims(context_mask, 2)
    S_T = tf.transpose(
        tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c), dim=1),
        (0, 2, 1))
    c2q = tf.matmul(S_, ques_emb)
    attention_outputs.extend([context_emb, c2q, context_emb * c2q])
    if config.q2c:
        q2c = tf.matmul(tf.matmul(S_, S_T), context_emb)
        attention_outputs.append(context_emb * q2c)
    return tf.concat(attention_outputs, axis=-1)

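# Note on the q2c branch above: tf.matmul(S_, S_T) composes the
# context-to-query and query-to-context weights (DCN-style coattention),
# so q2c comes out as batch x c_len x dim and lines up with context_emb
# for the elementwise product.
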
def task_specific_attention(inputs, output_size, input_mask,
                            initializer=layers.xavier_initializer(),
                            activation_fn=tf.tanh, scope=None, reuse=None):
    """Performs task-specific attention reduction using a learned attention
    context vector (constant within the task of interest), i.e. a
    self-attentive sentence embedding.

    Args:
        inputs: Tensor of shape [batch_size, units, input_size].
            `input_size` must be static (known), the `units` axis is
            attended over (reduced from the output), and `batch_size`
            is preserved.
        output_size: size of the output's inner (feature) dimension.

    Returns:
        outputs: Tensor of shape [batch_size, output_size].
    """
    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
    with tf.variable_scope(scope + '_attention', reuse=reuse) as scope:
        print("--------------using self attention----------------")
        attention_context_vector = tf.get_variable(
            name='attention_context_vector',
            shape=[output_size],
            initializer=initializer,
            dtype=tf.float32)
        # batch x max_len x output_size
        input_projection = layers.fully_connected(
            inputs, output_size, activation_fn=activation_fn, scope=scope)
        # batch x max_len
        vector_attn = tf.reduce_sum(
            tf.multiply(input_projection, attention_context_vector), axis=2)
        input_mask = tf.cast(input_mask, tf.float32)
        attention_weights = tf.nn.softmax(
            qanet_layers.mask_logits(vector_attn, mask=input_mask))
        # batch x max_len x 1
        attention_weights = tf.expand_dims(attention_weights, -1)
        # Manual masked-softmax alternative:
        # vector_attn_max = tf.reduce_max(qanet_layers.mask_logits(vector_attn, extend_mask), axis=1)
        # attention_weights = tf.exp(vector_attn - vector_attn_max) * tf.cast(extend_mask, tf.float32)
        # attention_weights = attention_weights / tf.reduce_sum(attention_weights, axis=1, keep_dims=True)
        weighted_projection = tf.multiply(input_projection, attention_weights)
        outputs = tf.reduce_sum(weighted_projection, axis=1)
        return outputs

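# Example use of task_specific_attention as a pooling layer (hypothetical
# shapes; `input_mask` is batch x max_len with 1s on real tokens):
def _task_attention_demo():
    batch, max_len, in_dim, out_dim = 2, 10, 32, 64
    inputs = tf.random_normal([batch, max_len, in_dim])
    mask = tf.ones([batch, max_len], dtype=tf.float32)
    pooled = task_specific_attention(inputs, out_dim, mask, scope="demo")
    # pooled: batch x out_dim
    return pooled
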
def build_emebdding(self, *args, **kargs):
    reuse = kargs["reuse"]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_emb = tf.nn.embedding_lookup(self.emb_mat, self.sent_token)
    if self.config.with_word_drop:
        word_drop_rate = tf.cond(self.is_training,
                                 lambda: self.config.word_drop_rate,
                                 lambda: 0.0)
        word_emb, word_drop_mask = common_utils.word_dropout(
            word_emb, word_drop_rate)
    else:
        word_drop_mask = self.sent_token_mask
    entity_emb = tf.nn.embedding_lookup(self.emb_mat, self.entity_token)
    [_, _, entity_emb] = layer_utils.my_lstm_layer(
        entity_emb,
        self.config.context_lstm_dim,
        input_lengths=self.entity_token_len,
        scope_name=self.config.scope,
        reuse=reuse,
        is_training=self.is_training,
        dropout_rate=dropout_rate,
        use_cudnn=self.config.use_cudnn)
    entity_mask = tf.expand_dims(self.entity_token_mask, axis=-1)  # batch x len x 1
    # masked max-pooling of the entity sequence into a single vector
    entity_emb = tf.reduce_max(
        qanet_layers.mask_logits(entity_emb, entity_mask), axis=1)
    entity_emb = tf.expand_dims(entity_emb, axis=1)
    # broadcast the entity vector along the sentence length
    seq_len = tf.reduce_max(self.sent_token_len)
    entity_emb = tf.tile(entity_emb, [1, seq_len, 1])
    mask = tf.expand_dims(self.sent_token_mask, -1)
    word_emb = tf.concat([word_emb, entity_emb], axis=-1)
    word_emb *= tf.cast(mask, tf.float32)
    print(word_emb.get_shape(), "=====word with entity========")
    if self.config.with_char:
        char_emb = self.build_char_embedding(self.sent_char,
                                             self.sent_char_len,
                                             self.char_mat,
                                             is_training=self.is_training,
                                             reuse=reuse)
        word_emb = tf.concat([word_emb, char_emb], axis=-1)
    return word_emb, word_drop_mask

def bilinear_attention(ques_emb, context_emb, ques_mask, context_mask,
                       dropout_keep_prob, config):
    attention_outputs = []
    context_ = tf.transpose(context_emb, [0, 2, 1])
    hidden_dim = ques_emb.get_shape()[-1]
    attn_W = tf.get_variable(
        "AttnW",
        shape=[hidden_dim, hidden_dim],
        dtype=tf.float32,
        initializer=tf.contrib.layers.variance_scaling_initializer(
            factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32))
    weighted_query = tf.tensordot(ques_emb, attn_W, axes=[[2], [0]])
    # batch x q_len x c_len
    S = tf.matmul(weighted_query, context_)
    mask_q = tf.expand_dims(ques_mask, 1)  # batch x 1 x q_len
    mask_c = tf.expand_dims(context_mask, 1)  # batch x 1 x c_len
    # batch x 1 x c_len
    S_max = tf.nn.softmax(
        tf.expand_dims(
            tf.reduce_max(qanet_layers.mask_logits(S, mask=mask_c), axis=1),
            1), -1)
    c2q = tf.matmul(S_max, context_emb)
    # batch x c_len x q_len
    S_T = tf.nn.softmax(
        qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
    # batch x c_len x c_dim
    q2c = tf.matmul(S_T, ques_emb)
    attention_outputs.extend([context_emb, q2c, context_emb * q2c])
    if config.q2c:
        attention_outputs.append(context_emb * c2q)
    return tf.concat(attention_outputs, axis=-1)

def query_context_attention(query, context, max_query_len, max_context_len,
                            query_mask, context_mask, dropout_ratio,
                            scope, reuse=None):
    with tf.variable_scope(scope + "_Context_to_Query_Attention_Layer", reuse=reuse):
        # context_ = tf.transpose(context, [0,2,1])
        hidden_dim = query.get_shape()[-1]
        # cosine-similarity scores between L2-normalized representations
        query_ = tf.nn.l2_normalize(query, axis=-1)
        context_ = tf.nn.l2_normalize(context, axis=-1)
        # attn_W = tf.get_variable("AttnW", dtype=tf.float32,
        #                          shape=[hidden_dim, hidden_dim],
        #                          initializer=initializer)
        S = tf.matmul(query_, tf.transpose(context_, [0, 2, 1]))
        # S = tf.matmul(weighted_query, context_)
        # batch x q_len x c_len
        mask_q = tf.expand_dims(query_mask, 1)
        mask_c = tf.expand_dims(context_mask, 1)
        S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
        c2q = tf.matmul(S_, context)
        S_T = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        q2c = tf.matmul(S_T, query)
        query_attention_outputs = c2q  # tf.concat([query*c2q, c2q], axis=-1)
        # query_attention_outputs *= tf.expand_dims(tf.cast(query_mask, tf.float32), -1)
        context_attention_outputs = q2c  # tf.concat([context*q2c, q2c], axis=-1)
        # context_attention_outputs *= tf.expand_dims(tf.cast(context_mask, tf.float32), -1)
        # query_attention_outputs = tf.nn.dropout(query_attention_outputs, 1 - dropout_ratio)
        # context_attention_outputs = tf.nn.dropout(context_attention_outputs, 1 - dropout_ratio)
        return query_attention_outputs, context_attention_outputs

def memory_attention_v2(query, memory, query_mask, scope,
                        memory_mask=None, reuse=None,
                        attention_output="soft", num_heads=8,
                        dropout_rate=0.0, threshold=0.1,
                        apply_hard_attn=False):
    """
    query:  batch x len x query_dim
    memory: num_classes x mem_dim (label memory shared across the batch,
            as implied by the 2-D einsum spec below)
    """
    with tf.variable_scope(scope + "_label_attention", reuse=reuse):
        query_dim = query.get_shape()[-1]
        mem_dim = memory.get_shape()[-1]
        # memory_ = tf.transpose(memory, [0,2,1])
        attn_W = tf.get_variable("AttnW", dtype=tf.float32,
                                 shape=[query_dim, mem_dim],
                                 initializer=initializer)
        # batch x len x mem_dim
        weighted_query = tf.einsum("abc,cd->abd", query, attn_W)
        # batch x len x num_classes
        S = tf.einsum("abd,ed->abe", weighted_query, memory)
        # batch x 1 x len
        mask_q = tf.expand_dims(query_mask, axis=1)
        # batch x num_classes x len
        S_ = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        # batch x num_classes x dim
        output = tf.matmul(S_, query)
        print("==memory attention==", output.get_shape())
        return output

def memory_attention(query, memory, query_mask, scope,
                     memory_mask=None, reuse=None):
    """
    query:  batch x len x query_dim
    memory: batch x num_classes x mem_dim
    """
    with tf.variable_scope(scope + "_Context_to_Query_Attention_Layer", reuse=reuse):
        query_dim = query.get_shape()[-1]
        mem_dim = memory.get_shape()[-1]
        # batch x mem_dim x num_classes
        memory_ = tf.transpose(memory, [0, 2, 1])
        attn_W = tf.get_variable("AttnW", dtype=tf.float32,
                                 shape=[query_dim, mem_dim],
                                 initializer=initializer)
        # batch x len x mem_dim
        weighted_query = tf.tensordot(query, attn_W, axes=[[2], [0]])
        # batch x len x num_classes
        S = tf.matmul(weighted_query, memory_)
        # batch x 1 x len
        mask_q = tf.expand_dims(query_mask, axis=1)
        # batch x num_classes x len
        S_ = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        # batch x num_classes x dim
        output = tf.matmul(S_, query)
        print(output.get_shape(), "=====")
        output = tf.reduce_sum(output, axis=1)
        return output

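# memory_attention sketch: attends each class-memory row over the query
# sequence and sums the per-class summaries (hypothetical sizes, same
# module assumptions as above):
def _memory_attention_demo():
    batch, seq_len, num_classes, dim = 2, 12, 5, 16
    query = tf.random_normal([batch, seq_len, dim])
    memory = tf.random_normal([batch, num_classes, dim])
    query_mask = tf.ones([batch, seq_len], dtype=tf.float32)
    out = memory_attention(query, memory, query_mask, scope="demo")
    # out: batch x dim (summed over the num_classes axis)
    return out
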
def memory_attention_v1(query, memory, query_mask, scope,
                        memory_mask=None, reuse=None,
                        attention_output="soft", num_heads=8,
                        dropout_rate=0.0, threshold=0.1,
                        apply_hard_attn=False):
    """
    query:  batch x len x query_dim
    memory: batch x num_classes x mem_dim
    """
    with tf.variable_scope(scope + "_label_attention", reuse=reuse):
        query_dim = query.get_shape()[-1]
        mem_dim = memory.get_shape()[-1]
        # batch x mem_dim x num_classes
        memory_ = tf.transpose(memory, [0, 2, 1])
        attn_W = tf.get_variable("AttnW", dtype=tf.float32,
                                 shape=[query_dim, mem_dim],
                                 initializer=initializer)
        # batch x len x mem_dim
        weighted_query = tf.tensordot(query, attn_W, axes=[[2], [0]])
        # batch x len x num_classes
        S = tf.matmul(weighted_query, memory_)
        # batch x 1 x len
        mask_q = tf.expand_dims(query_mask, axis=1)
        # batch x num_classes x len
        S_ = tf.nn.softmax(
            qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
        # batch x num_classes x dim
        output = tf.matmul(S_, query)
        print(output.get_shape(), "==output shape===")
        if apply_hard_attn:
            # L2 norm per class as a presence score
            presence_vec = output * output  # batch x num_class x dim
            presence_vec = tf.sqrt(tf.reduce_sum(presence_vec, axis=-1))  # batch x num_class
            presence_vec = tf.nn.softmax(presence_vec, axis=-1)
            presence_mask = hard_attention_mask(presence_vec, threshold)
            output *= tf.expand_dims(presence_mask, -1)
            # presence_vec = tf.nn.softmax(presence_vec)
            # idx = tf.where(presence_vec > threshold)
            # batch_idxs = tf.range(0, tf.shape(output)[0])
            # batch_idxs = tf.expand_dims(batch_idxs, 1)
            # idxs = tf.concat([batch_idxs, idx], 1)
            # output = tf.gather_nd(output, idxs)
            print(output.get_shape(), "==hard attention output shape===")
        if attention_output == "soft" and not apply_hard_attn:
            class_dim = memory.get_shape()[1]
            class_attention = tf.get_variable("class_attn", dtype=tf.float32,
                                              shape=[query_dim],
                                              initializer=initializer)
            # batch x num_classes
            attn_output = tf.reduce_sum(output * class_attention, axis=-1)
            attn_output = tf.nn.softmax(attn_output)
            # batch x num_classes x 1
            attn_output = tf.expand_dims(attn_output, axis=-1)
            output = tf.reduce_sum(attn_output * output, axis=1)
        elif attention_output == "sum" and apply_hard_attn:
            output = tf.reduce_sum(output, axis=1)
        elif attention_output == "multi_head":
            # NOTE: this branch uses presence_mask, so it assumes
            # apply_hard_attn=True; use it as a memory mask
            ignore_padding = (1 - presence_mask)
            ignore_padding = attention_bias_ignore_padding(ignore_padding)
            encoder_self_attention_bias = ignore_padding
            output = multihead_attention_texar(
                output,
                memory=None,
                memory_attention_bias=encoder_self_attention_bias,
                num_heads=num_heads,
                num_units=None,
                dropout_rate=dropout_rate,
                scope="multihead_attention")
            output = tf.reduce_sum(output, axis=1)
        return output

def build_interactor(self, sent1_emb, sent2_emb, sent1_len, sent2_len,
                     sent1_mask, sent2_mask, *args, **kargs):
    num_lstm_layers = kargs["num_lstm_layers"]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    input_dim = sent1_emb.get_shape()[-1]
    with tf.variable_scope(self.config.scope + "_embed_highway"):
        sent1_repres = match_utils.multi_highway_layer(
            sent1_emb, input_dim, self.config.highway_layer_num)
        tf.get_variable_scope().reuse_variables()
        sent2_repres = match_utils.multi_highway_layer(
            sent2_emb, input_dim, self.config.highway_layer_num)
    match_dim = self.emb_size
    for i in range(num_lstm_layers):
        with tf.variable_scope(self.config.scope + "_densely_co_attentive_{}".format(i),
                               reuse=None):
            sent1_repres_, match_dim_ = self.build_encoder(
                sent1_repres, sent1_len, reuse=None)
            sent2_repres_, match_dim_ = self.build_encoder(
                sent2_repres, sent2_len, reuse=True)
            match_dim += match_dim_
            print("===before=====", i, sent1_repres_.get_shape(),
                  sent2_repres_.get_shape())
            if self.config.get("co_attention", None):
                [query_attention, context_attention] = drcn_utils.query_context_attention(
                    sent1_repres_, sent2_repres_,
                    sent1_len, sent2_len,
                    sent1_mask, sent2_mask,
                    dropout_rate, self.config.scope, reuse=None)
                # dense connections: new encoding, attention, and previous input
                sent1_repres = tf.concat(
                    [sent1_repres_, query_attention, sent1_repres], axis=-1)
                sent2_repres = tf.concat(
                    [sent2_repres_, context_attention, sent2_repres], axis=-1)
                match_dim += match_dim_
            else:
                sent1_repres = tf.concat([sent1_repres_, sent1_repres], axis=-1)
                sent2_repres = tf.concat([sent2_repres_, sent2_repres], axis=-1)
            print("====i====", sent1_repres.get_shape(), sent2_repres.get_shape())
            if np.mod(i + 1, 2) == 0 and self.config.with_auto_encoding:
                sent1_repres = self.auto_encoder(sent1_repres, reuse=None)
                sent2_repres = self.auto_encoder(sent2_repres, reuse=True)
            if self.config.recurrent_layer_norm:
                sent1_repres = tf.contrib.layers.layer_norm(
                    sent1_repres, reuse=None, scope="lstm_layer_norm")
                sent2_repres = tf.contrib.layers.layer_norm(
                    sent2_repres, reuse=True, scope="lstm_layer_norm")
    mask_q = tf.expand_dims(sent1_mask, -1)
    mask_c = tf.expand_dims(sent2_mask, -1)
    v_1_max = tf.reduce_max(qanet_layers.mask_logits(sent1_repres, mask_q), axis=1)
    v_2_max = tf.reduce_max(qanet_layers.mask_logits(sent2_repres, mask_c), axis=1)
    v = tf.concat([v_1_max, v_2_max, v_1_max * v_2_max,
                   v_1_max - v_2_max, tf.abs(v_1_max - v_2_max)], axis=-1)
    v = tf.nn.dropout(v, 1 - dropout_rate)
    match_dim = match_dim * 5
    return v_1_max, v_2_max, v, match_dim

def build_encoder(self, input_lengths, input_mask, *args, **kargs):
    reuse = kargs["reuse"]
    word_emb = self.build_emebdding(*args, **kargs)
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    word_emb = tf.nn.dropout(word_emb, 1 - dropout_rate)
    with tf.variable_scope(self.config.scope + "_input_highway", reuse=reuse):
        input_dim = word_emb.get_shape()[-1]
        sent_repres = match_utils.multi_highway_layer(
            word_emb, input_dim, self.config.highway_layer_num)
        if self.config.rnn == "lstm":
            [sent_repres_fw, sent_repres_bw, sent_repres] = layer_utils.my_lstm_layer(
                sent_repres,
                self.config.context_lstm_dim,
                input_lengths=input_lengths,
                scope_name=self.config.scope,
                reuse=reuse,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn)
            match_dim = self.config.context_lstm_dim * 6
        elif self.config.rnn == "slstm":
            word_emb_proj = tf.layers.dense(word_emb, self.config.slstm_hidden_size)
            initial_hidden_states = word_emb_proj
            initial_cell_states = tf.identity(initial_hidden_states)
            [new_hidden_states, new_cell_states,
             dummynode_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size,
                input_lengths,
                initial_hidden_states,
                initial_cell_states,
                self.config.slstm_layer_num,
                dropout_rate, reuse=reuse)
            sent_repres = new_hidden_states
            match_dim = self.config.slstm_hidden_size * 3
        if self.config.multi_head:
            mask = tf.cast(input_mask, tf.float32)
            ignore_padding = (1 - mask)
            ignore_padding = label_network_utils.attention_bias_ignore_padding(
                ignore_padding)
            encoder_self_attention_bias = ignore_padding
            sent_repres = label_network_utils.multihead_attention_texar(
                sent_repres,
                memory=None,
                memory_attention_bias=encoder_self_attention_bias,
                num_heads=8,
                num_units=128,
                dropout_rate=dropout_rate,
                scope="multihead_attention")
        v_attn = self_attn.multi_dimensional_attention(
            sent_repres, input_mask,
            'multi_dim_attn_for_%s' % self.config.scope,
            1 - dropout_rate, self.is_training,
            self.config.weight_decay, "relu")
        mask = tf.expand_dims(input_mask, -1)
        v_sum = tf.reduce_sum(sent_repres * tf.cast(mask, tf.float32), 1)
        v_ave = tf.div(
            v_sum,
            tf.expand_dims(tf.cast(input_lengths, tf.float32) + EPSILON, -1))
        v_max = tf.reduce_max(qanet_layers.mask_logits(sent_repres, mask), axis=1)
        v_last = esim_utils.last_relevant_output(sent_repres, input_lengths)
        out = tf.concat([v_ave, v_max, v_last, v_attn], axis=-1)
        return out, match_dim

def build_interactor(self, sent1_repres, sent2_repres, sent1_len, sent2_len,
                     sent1_mask, sent2_mask, *args, **kargs):
    reuse = kargs["reuse"]
    input_dim = sent1_repres.get_shape()[-1]
    dropout_rate = tf.cond(self.is_training,
                           lambda: self.config.dropout_rate,
                           lambda: 0.0)
    with tf.variable_scope(self.config.scope + "_interaction_module", reuse=reuse):
        if self.config.with_self_attention:
            v_1_attn = esim_utils.multihead_attention(
                sent1_repres, sent1_repres,
                num_units=None,
                num_heads=self.config.num_heads,
                dropout_rate=dropout_rate,
                is_training=True,
                causality=False,
                scope="multihead_attention",
                reuse=None)
            v_2_attn = esim_utils.multihead_attention(
                sent2_repres, sent2_repres,
                num_units=None,
                num_heads=self.config.num_heads,
                dropout_rate=dropout_rate,
                is_training=True,
                causality=False,
                scope="multihead_attention",
                reuse=True)
            sent1_repres = tf.concat([sent1_repres, v_1_attn], axis=-1)
            sent2_repres = tf.concat([sent2_repres, v_2_attn], axis=-1)
        [query_attention_outputs, context_attention_outputs] = esim_utils.query_context_attention(
            sent1_repres, sent2_repres,
            sent1_len, sent2_len,
            sent1_mask, sent2_mask,
            dropout_rate, self.config.scope, reuse=reuse)
        if self.config.rnn == "lstm":
            [sent1_repres_fw, sent1_repres_bw, sent1_repres] = layer_utils.my_lstm_layer(
                query_attention_outputs,
                self.config.context_lstm_dim,
                input_lengths=sent1_len,
                scope_name=self.config.scope,
                reuse=None,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn)
            [sent2_repres_fw, sent2_repres_bw, sent2_repres] = layer_utils.my_lstm_layer(
                context_attention_outputs,
                self.config.context_lstm_dim,
                input_lengths=sent2_len,
                scope_name=self.config.scope,
                reuse=True,
                is_training=self.is_training,
                dropout_rate=dropout_rate,
                use_cudnn=self.config.use_cudnn)
            match_dim = self.config.context_lstm_dim * 8
        elif self.config.rnn == "slstm":
            sent1_initial_hidden_states = tf.layers.dense(
                query_attention_outputs, self.config.slstm_hidden_size)
            sent1_initial_cell_states = tf.identity(sent1_initial_hidden_states)
            [new_sent1_hidden_states, new_sent1_cell_states,
             dummynode_sent1_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size,
                sent1_len,
                sent1_initial_hidden_states,
                sent1_initial_cell_states,
                self.config.slstm_layer_num,
                dropout_rate, reuse=None)
            sent1_repres = new_sent1_hidden_states
            sent2_initial_hidden_states = tf.layers.dense(
                context_attention_outputs, self.config.slstm_hidden_size)
            sent2_initial_cell_states = tf.identity(sent2_initial_hidden_states)
            [new_sent2_hidden_states, new_sent2_cell_states,
             dummynode_sent2_hidden_states] = slstm_utils.slstm_cell(
                self.config, self.config.scope,
                self.config.slstm_hidden_size,
                sent2_len,
                sent2_initial_hidden_states,
                sent2_initial_cell_states,
                self.config.slstm_layer_num,
                dropout_rate, reuse=True)
            sent2_repres = new_sent2_hidden_states
            match_dim = self.config.slstm_hidden_size * 4
        v_1_sum = tf.reduce_sum(sent1_repres, 1)
        v_1_ave = tf.div(
            v_1_sum,
            tf.expand_dims(tf.cast(sent1_len, tf.float32) + EPSILON, -1))
        v_2_sum = tf.reduce_sum(sent2_repres, 1)
        v_2_ave = tf.div(
            v_2_sum,
            tf.expand_dims(tf.cast(sent2_len, tf.float32) + EPSILON, -1))
        # v_1_max = tf.reduce_max(sent1_repres, 1)
        # v_2_max = tf.reduce_max(sent2_repres, 1)
        mask_q = tf.expand_dims(sent1_mask, -1)
        mask_c = tf.expand_dims(sent2_mask, -1)
        v_1_max = tf.reduce_max(qanet_layers.mask_logits(sent1_repres, mask_q), axis=1)
        v_2_max = tf.reduce_max(qanet_layers.mask_logits(sent2_repres, mask_c), axis=1)
        out1 = tf.concat([v_1_ave, v_1_max], axis=-1)
        out2 = tf.concat([v_2_ave, v_2_max], axis=-1)
        out = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)
        return out1, out2, out, match_dim