def _get_top_k(scores1, scores2, k, max_span_size, support2question):
    max_support_length = tf.shape(scores1)[1]
    doc_idx, pointer1, topk_scores1 = segment_top_k(scores1, support2question, k)

    # [num_questions * topk]
    doc_idx_flat = tf.reshape(doc_idx, [-1])
    pointer_flat1 = tf.reshape(pointer1, [-1])

    # [num_questions * topk, support_length]
    scores_gathered2 = tf.gather(scores2, doc_idx_flat)
    if max_span_size < 0:
        pointer_flat1, max_span_size = pointer_flat1 + max_span_size + 1, -max_span_size
    left_mask = misc.mask_for_lengths(tf.cast(pointer_flat1, tf.int32),
                                      max_support_length, mask_right=False)
    right_mask = misc.mask_for_lengths(tf.cast(pointer_flat1 + max_span_size, tf.int32),
                                       max_support_length)
    scores_gathered2 = scores_gathered2 + left_mask + right_mask

    pointer2 = tf.argmax(scores_gathered2, axis=1, output_type=tf.int32)

    topk_score2 = tf.gather_nd(scores2, tf.stack([doc_idx_flat, pointer2], 1))

    return doc_idx, pointer1, tf.reshape(pointer2, [-1, k]), topk_scores1 + tf.reshape(topk_score2, [-1, k])
def bidaf_layer(seq1, seq1_length, seq2, seq2_length, seq1_to_seq2=None, seq2_to_seq1=None):
    """Encodes seq1 conditioned on seq2, e.g., using word-by-word attention."""
    attn_scores, attn_probs, seq2_weighted = attention.diagonal_bilinear_attention(
        seq1, seq2, seq2_length, False, seq2_to_seq1=seq2_to_seq1)

    attn_scores += tf.expand_dims(mask_for_lengths(seq1_length, tf.shape(attn_scores)[1]), 2)
    max_seq1 = tf.reduce_max(attn_scores, 2)
    if seq1_to_seq2 is None:
        seq1_attention = tf.nn.softmax(max_seq1, 1)
    else:
        segm_max_seq1 = tf.unsorted_segment_max(max_seq1, seq1_to_seq2, tf.reduce_max(seq1_to_seq2) + 1)
        seq1_attention = tf.nn.softmax(segm_max_seq1, 1)
        seq1_attention = tf.gather(seq1_attention, seq1_to_seq2)
        seq1_attention.set_shape(max_seq1.get_shape())
    seq1_weighted = tf.einsum('ij,ijk->ik', seq1_attention, seq1)
    seq1_weighted = tf.expand_dims(seq1_weighted, 1)
    seq1_weighted = tf.tile(seq1_weighted, [1, tf.shape(seq1)[1], 1])

    return tf.concat([seq2_weighted, seq1 * seq2_weighted, seq1 * seq1_weighted], 2)
def apply_attention(attn_scores, states, length, is_self=False, with_sentinel=True, reuse=False, seq2_to_seq1=None):
    attn_scores += tf.expand_dims(misc.mask_for_lengths(length, tf.shape(attn_scores)[2]), 1)
    softmax = tf.nn.softmax if seq2_to_seq1 is None else lambda x: segment.segment_softmax(x, seq2_to_seq1)
    if is_self:
        # exclude attending to state itself
        attn_scores += tf.expand_dims(tf.diag(tf.fill([tf.shape(attn_scores)[1]], -1e6)), 0)
    if with_sentinel:
        with tf.variable_scope('sentinel', reuse=reuse):
            s = tf.get_variable('score', [1, 1, 1], tf.float32, tf.zeros_initializer())
        s = tf.tile(s, [tf.shape(attn_scores)[0], tf.shape(attn_scores)[1], 1])
        attn_probs = softmax(tf.concat([s, attn_scores], 2))
        attn_probs = attn_probs[:, :, 1:]
    else:
        attn_probs = softmax(attn_scores)
    attn_states = tf.einsum('abd,adc->abc', attn_probs, states)
    if seq2_to_seq1 is not None:
        attn_states = tf.unsorted_segment_sum(attn_states, seq2_to_seq1, tf.reduce_max(seq2_to_seq1) + 1)
    return attn_scores, attn_probs, attn_states
def bilinear_answer_layer(size, encoded_question, question_length, encoded_support, support_length,
                          support2question, answer2support, is_eval, topk=1, max_span_size=10000):
    """Answer layer for multiple paragraph QA."""
    # computing single time attention over question
    size = encoded_support.get_shape()[-1].value
    question_state = compute_question_state(encoded_question, question_length)

    # compute logits
    hidden = tf.gather(tf.layers.dense(question_state, 2 * size, name="hidden"), support2question)
    hidden_start, hidden_end = tf.split(hidden, 2, 1)

    support_mask = misc.mask_for_lengths(support_length)

    start_scores = tf.einsum('ik,ijk->ij', hidden_start, encoded_support)
    start_scores = start_scores + support_mask

    end_scores = tf.einsum('ik,ijk->ij', hidden_end, encoded_support)
    end_scores = end_scores + support_mask

    return compute_spans(start_scores, end_scores, answer2support, is_eval, support2question,
                         topk, max_span_size)
def compute_question_state(encoded_question, question_length):
    attention_scores = tf.layers.dense(encoded_question, 1, name="question_attention")
    q_mask = misc.mask_for_lengths(question_length)
    attention_scores = attention_scores + tf.expand_dims(q_mask, 2)
    question_attention_weights = tf.nn.softmax(attention_scores, 1, name="question_attention_weights")
    question_state = tf.reduce_sum(question_attention_weights * encoded_question, 1)
    return question_state
def mlp_answer_layer(size, encoded_question, question_length, encoded_support, support_length,
                     support2question, answer2support, is_eval, beam_size=1, max_span_size=10000):
    """Answer layer for multiple paragraph QA."""
    # computing single time attention over question
    question_state = compute_question_state(encoded_question, question_length)

    # compute logits
    static_input = tf.concat([tf.gather(tf.expand_dims(question_state, 1), support2question) * encoded_support,
                              encoded_support], 2)

    hidden = tf.gather(tf.layers.dense(question_state, 2 * size, name="hidden_1"), support2question)
    hidden = tf.layers.dense(static_input, 2 * size, use_bias=False, name="hidden_2") + tf.expand_dims(hidden, 1)

    hidden_start, hidden_end = tf.split(tf.nn.relu(hidden), 2, 2)

    support_mask = misc.mask_for_lengths(support_length)

    start_scores = tf.layers.dense(hidden_start, 1, use_bias=False, name="start_scores")
    start_scores = tf.squeeze(start_scores, [2])
    start_scores = start_scores + support_mask

    end_scores = tf.layers.dense(hidden_end, 1, use_bias=False, name="end_scores")
    end_scores = tf.squeeze(end_scores, [2])
    end_scores = end_scores + support_mask

    return compute_spans(start_scores, end_scores, answer2support, is_eval, support2question,
                         beam_size, max_span_size)
def bidaf_answer_layer(encoded_support_start, encoded_support_end, support_length,
                       support2question, answer2support, is_eval, beam_size=1, max_span_size=10000):
    # BiLSTM(M) = M^2 = encoded_support_end
    start_scores = tf.squeeze(tf.layers.dense(encoded_support_start, 1, use_bias=False), 2)
    end_scores = tf.squeeze(tf.layers.dense(encoded_support_end, 1, use_bias=False), 2)

    # mask out-of-bounds slots by adding -1000
    support_mask = misc.mask_for_lengths(support_length)
    start_scores = start_scores + support_mask
    end_scores = end_scores + support_mask

    return compute_spans(start_scores, end_scores, answer2support, is_eval, support2question,
                         beam_size=beam_size, max_span_size=max_span_size)
def conv_char_embedding_alt(char_vocab, size, unique_word_chars, unique_word_lengths, word_to_uniqs,
                            conv_width=5, emb_initializer=tf.random_normal_initializer(0.0, 0.1), scope=None):
    # "fixed PADDING on character level"
    pad = tf.zeros(tf.stack([tf.shape(unique_word_lengths)[0], math.floor(conv_width / 2)]), tf.int32)
    unique_word_chars = tf.concat([pad, unique_word_chars, pad], 1)

    if not isinstance(word_to_uniqs, list):
        word_to_uniqs = [word_to_uniqs]

    with tf.variable_scope(scope or "char_embeddings") as vs:
        char_embedding_matrix = \
            tf.get_variable("char_embedding_matrix", shape=(len(char_vocab), size),
                            initializer=emb_initializer, trainable=True)

        max_word_length = tf.reduce_max(unique_word_lengths)
        embedded_chars = tf.nn.embedding_lookup(char_embedding_matrix, tf.cast(unique_word_chars, tf.int32))

        with tf.variable_scope("conv"):
            # create filter like this to get fan-in and fan-out right for initializers depending on those
            filter = tf.get_variable("filter", [conv_width * size, size])
            filter_reshaped = tf.reshape(filter, [conv_width, size, size])
            # [B, T, S + pad_right]
            conv_out = tf.nn.conv1d(embedded_chars, filter_reshaped, 1, "VALID")
            conv_mask = tf.expand_dims(misc.mask_for_lengths(unique_word_lengths, max_length=max_word_length), 2)
            conv_out = conv_out + conv_mask
            unique_embedded_words = tf.reduce_max(conv_out, [1])

        all_embedded = []
        for word_idx in word_to_uniqs:
            flat_word_idx = tf.reshape(word_idx, [-1])
            embedded_words = tf.gather(unique_embedded_words, flat_word_idx)
            embedded_words = tf.reshape(embedded_words, tf.stack([-1, tf.unstack(tf.shape(word_idx))[1], size]))
            all_embedded.append(embedded_words)

    return all_embedded
def conv_char_embedding(num_chars, size, word_chars, word_lengths, word_sequences=None, conv_width=5,
                        emb_initializer=tf.random_normal_initializer(0.0, 0.1), scope=None):
    """Build simple convolutional character-based embeddings for words with a fixed filter width and size.

    After the convolution, max-pooling over characters is employed for each filter.
    If word sequences are given, they are embedded with the newly created embeddings.
    """
    # "fixed PADDING on character level"
    pad = tf.zeros(tf.stack([tf.shape(word_lengths)[0], conv_width // 2]), tf.int32)
    word_chars = tf.concat([pad, word_chars, pad], 1)

    with tf.variable_scope(scope or "char_embeddings"):
        char_embedding_matrix = \
            tf.get_variable("char_embedding_matrix", shape=(num_chars, size),
                            initializer=emb_initializer, trainable=True)

        max_word_length = tf.reduce_max(word_lengths)
        embedded_chars = tf.nn.embedding_lookup(char_embedding_matrix, tf.cast(word_chars, tf.int32))

        with tf.variable_scope("conv"):
            # create filter like this to get fan-in and fan-out right for initializers depending on those
            filter = tf.get_variable("filter", [conv_width * size, size])
            filter_reshaped = tf.reshape(filter, [conv_width, size, size])
            # [B, T, S + pad_right]
            conv_out = tf.nn.conv1d(embedded_chars, filter_reshaped, 1, "VALID")
            conv_mask = tf.expand_dims(misc.mask_for_lengths(word_lengths, max_length=max_word_length), 2)
            conv_out = conv_out + conv_mask
            embedded_words = tf.reduce_max(conv_out, 1)

    if word_sequences is None:
        return embedded_words

    if not isinstance(word_sequences, list):
        word_sequences = [word_sequences]

    all_embedded = []
    for word_idxs in word_sequences:
        all_embedded.append(tf.nn.embedding_lookup(embedded_words, word_idxs))
    return all_embedded
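# --- Hypothetical usage sketch (illustrative only; the tensor names and shapes below are assumptions,
# not part of the original module). Assumes TensorFlow 1.x (`tf`) as used above. ---
def _conv_char_embedding_usage_example():
    word_chars = tf.placeholder(tf.int32, [None, None])      # [num_words, max_chars] character ids per word
    word_lengths = tf.placeholder(tf.int32, [None])          # [num_words] number of characters per word
    question_words = tf.placeholder(tf.int32, [None, None])  # [batch, max_q_len] indices into word_chars
    # One 50-dim char-CNN embedding per word, looked up for every question position.
    embedded_question = conv_char_embedding(
        num_chars=100, size=50, word_chars=word_chars,
        word_lengths=word_lengths, word_sequences=question_words)[0]
    return embedded_question                                  # [batch, max_q_len, 50]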
def train():
    gathered_end_scores = tf.gather(end_scores, answer2support)
    gathered_start_scores = tf.gather(start_scores, answer2support)

    if correct_start is not None:
        # assuming we know the correct start, we only consider ends after that
        left_mask = misc.mask_for_lengths(tf.cast(correct_start, tf.int32), max_support_length, mask_right=False)
        gathered_end_scores = gathered_end_scores + left_mask

    predicted_start_pointer = tf.argmax(gathered_start_scores, axis=1, output_type=tf.int32)
    predicted_end_pointer = tf.argmax(gathered_end_scores, axis=1, output_type=tf.int32)

    return (start_scores, end_scores, tf.gather(doc_idx_for_support, answer2support),
            predicted_start_pointer, predicted_end_pointer)
def bidaf_layer(seq1, seq1_length, seq2, seq2_length):
    """Encodes seq1 conditioned on seq2, e.g., using word-by-word attention."""
    attn_scores, attn_probs, seq2_weighted = attention.diagonal_bilinear_attention(
        seq1, seq2, seq2_length, False)

    attn_scores += tf.expand_dims(mask_for_lengths(seq1_length, tf.shape(attn_scores)[1]), 2)
    max_seq1 = tf.reduce_max(attn_scores, 2)
    seq1_attention = tf.nn.softmax(max_seq1, 1)
    seq1_weighted = tf.einsum('ij,ijk->ik', seq1_attention, seq1)
    seq1_weighted = tf.expand_dims(seq1_weighted, 1)
    seq1_weighted = tf.tile(seq1_weighted, [1, tf.shape(seq1)[1], 1])

    return tf.concat([seq2_weighted, seq1 * seq2_weighted, seq1 * seq1_weighted], 2)
def conv_char_embedding_multi_filter(num_chars, filter_sizes, embedding_size, word_chars, word_lengths,
                                     word_sequences=None, emb_initializer=tf.random_normal_initializer(0.0, 0.1),
                                     projection_size=None, scope=None):
    """Build convolutional character-based embeddings for words with multiple filters.

    filter_sizes is a list; the position of each entry determines its conv width and its value the number
    of filters of that width. An entry can also be 0 (i.e., no filter of that conv width). E.g., sizes
    [4, 0, 7, 8] will create 4 conv filters of width 1, no filter of width 2, 7 of width 3 and 8 of width 4.
    After the convolution, max-pooling over characters is employed for each filter.

    embedding_size refers to the size of the character embeddings and projection_size, if given, to the
    final size of the embedded characters after a final projection. If it is None, no projection will be
    applied and the resulting size is the sum of all filters.

    If word sequences are given, they are embedded with the newly created embeddings.
    """
    with tf.variable_scope(scope or "char_embeddings"):
        char_embedding_matrix = \
            tf.get_variable("char_embedding_matrix", shape=(num_chars, embedding_size),
                            initializer=emb_initializer, trainable=True)

        pad = tf.zeros(tf.stack([tf.shape(word_lengths)[0], len(filter_sizes) // 2]), tf.int32)
        word_chars = tf.concat([pad, word_chars, pad], 1)

        max_word_length = tf.reduce_max(word_lengths)
        embedded_chars = tf.nn.embedding_lookup(char_embedding_matrix, tf.cast(word_chars, tf.int32))

        conv_mask = tf.expand_dims(misc.mask_for_lengths(word_lengths, max_length=max_word_length), 2)

        embedded_words = []
        for i, size in enumerate(filter_sizes):
            if size == 0:
                continue
            conv_width = i + 1
            with tf.variable_scope("conv_%d" % conv_width):
                # create filter like this to get fan-in and fan-out right for initializers depending on those
                filter = tf.get_variable("filter", [conv_width * embedding_size, size])
                filter_reshaped = tf.reshape(filter, [conv_width, embedding_size, size])
                cut = len(filter_sizes) // 2 - conv_width // 2
                embedded_chars_conv = embedded_chars[:, cut:-cut, :] if cut else embedded_chars
                conv_out = tf.nn.conv1d(embedded_chars_conv, filter_reshaped, 1, "VALID")
                conv_out += conv_mask
                embedded_words.append(tf.reduce_max(conv_out, 1))

        embedded_words = tf.concat(embedded_words, 1)
        if projection_size is not None:
            embedded_words = tf.layers.dense(embedded_words, projection_size)

    if word_sequences is None:
        return embedded_words

    if not isinstance(word_sequences, list):
        word_sequences = [word_sequences]

    all_embedded = []
    for word_idxs in word_sequences:
        # look up the new word embeddings per sequence without overwriting `embedded_words`
        all_embedded.append(tf.nn.embedding_lookup(embedded_words, word_idxs))
    return all_embedded
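# --- Hypothetical usage sketch (illustrative only; names and shapes below are assumptions, not part of
# the original module). Following the docstring example, filter_sizes=[4, 0, 7, 8] builds 4 width-1
# filters, no width-2 filters, 7 width-3 filters and 8 width-4 filters, so the unprojected word embedding
# has 4 + 7 + 8 = 19 channels; projection_size=50 projects it to 50. ---
def _conv_char_embedding_multi_filter_usage_example():
    word_chars = tf.placeholder(tf.int32, [None, None])     # [num_words, max_chars]
    word_lengths = tf.placeholder(tf.int32, [None])         # [num_words]
    support_words = tf.placeholder(tf.int32, [None, None])  # [batch, max_s_len] indices into word_chars
    embedded_support = conv_char_embedding_multi_filter(
        num_chars=100, filter_sizes=[4, 0, 7, 8], embedding_size=16,
        word_chars=word_chars, word_lengths=word_lengths,
        word_sequences=support_words, projection_size=50)[0]
    return embedded_support                                  # [batch, max_s_len, 50]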
def create_output(self, shared_vocab_config, emb_question, question_length, emb_support, support_length,
                  unique_word_chars, unique_word_char_length, question_words2unique, support_words2unique,
                  answer2question, keep_prob, is_eval):
    # 1. char embeddings + word embeddings
    # 2a. conv char embeddings
    # 2b. pool char embeddings
    # 3. cat + highway
    # 4. BiLSTM
    # 5. cat
    # 6. biattention
    # 6a. create matrix of question support attentions
    # 6b. generate feature matrix
    # 7. combine
    # 8. BiLSTM
    # 9. double cross-entropy loss
    with tf.variable_scope("bidaf", initializer=tf.contrib.layers.xavier_initializer()):
        # Some helpers
        max_question_length = tf.reduce_max(question_length)
        max_support_length = tf.reduce_max(support_length)
        beam_size = 1
        beam_size = tf.cond(is_eval, lambda: tf.constant(beam_size, tf.int32), lambda: tf.constant(1, tf.int32))

        input_size = shared_vocab_config.config["repr_dim_input"]
        size = shared_vocab_config.config["repr_dim"]
        with_char_embeddings = shared_vocab_config.config.get("with_char_embeddings", False)

        W = tf.get_variable("biattention_weight", [size * 6])
        W_start_index = tf.get_variable("start_index_weight", [size * 10])
        W_end_index = tf.get_variable("end_index_weight", [size * 10])

        # 1. char embeddings + word embeddings
        # set shapes for inputs
        emb_question.set_shape([None, None, input_size])
        emb_support.set_shape([None, None, input_size])

        # 1. + 2a. + 2b. char embeddings + conv + max pooling
        # compute combined embeddings
        [char_emb_question, char_emb_support] = conv_char_embedding_alt(
            shared_vocab_config.char_vocab, size, unique_word_chars, unique_word_char_length,
            [question_words2unique, support_words2unique])

        # 3. cat
        emb_question = tf.concat([emb_question, char_emb_question], 2)
        emb_support = tf.concat([emb_support, char_emb_support], 2)
        input_size += size

        # highway layer to allow for interaction between concatenated embeddings
        # 3. highway
        # following bidaf notation here (qq=question, xx=support)
        highway_question = highway_network(emb_question, 2, scope='question_highway')
        highway_support = highway_network(emb_support, 2, scope='support_highway')

        # emb_question = tf.slice(highway_question, [0, 0, 0], tf.stack([-1, max_question_length, -1]))
        # emb_support = tf.slice(all_embedded_hw, tf.stack([0, max_question_length, 0]), [-1, -1, -1])
        # emb_question.set_shape([None, None, size])
        # emb_support.set_shape([None, None, size])

        # 4. BiLSTM
        cell1 = tf.contrib.rnn.LSTMBlockFusedCell(size)
        encoded_question = fused_birnn(cell1, highway_question, question_length, dtype=tf.float32,
                                       time_major=False, scope='question_encoding')[0]
        encoded_question = tf.concat(encoded_question, 2)

        cell2 = tf.contrib.rnn.LSTMBlockFusedCell(size)
        encoded_support = fused_birnn(cell2, highway_support, support_length, dtype=tf.float32,
                                      time_major=False, scope='support_encoding')[0]
        encoded_support = tf.concat(encoded_support, 2)

        # 6. biattention alpha(U, H) = S
        # S = W^T*[H; U; H*U]
        # question = U = [batch, 2*embedding, length1]
        # support = H = [batch, 2*embedding, length2]
        # -> expand with features
        # we want to get from [length1] and [length2] to [length1, length2] and [length1, length2]
        # we do that with
        # (a) expand dim
        # [batch, L2, 2*embedding] -> [batch, 1, L2, 2*embedding]
        support = tf.expand_dims(encoded_support, 1)
        # [batch, L1, 2*embedding] -> [batch, L1, 1, 2*embedding]
        question = tf.expand_dims(encoded_question, 2)
        # (b) tile with the other dimension
        support = tf.tile(support, [1, max_question_length, 1, 1])
        question = tf.tile(question, [1, 1, max_support_length, 1])

        # 5. cat
        # question = U = [batch, length1, length2, 2*embeddings]
        # support = H = [batch, length1, length2, 2*embeddings]
        # S = W^T*[H; U; H*U]
        features = tf.concat([support, question, question * support], 3)

        # 6. biattention
        # 6a. create matrix of question support attentions
        # features = [batch, length1, length2, 6*embeddings]
        # w = [6*embeddings]
        # S = attention matrix = [batch, length1, length2]
        S = tf.einsum('ijkl,l->ijk', features, W)

        # S = [batch, length1, length2]
        # question to support attention
        # softmax over support -> [batch, length1, length2] = att_question
        att_question = tf.nn.softmax(S, 2)
        # weighted = [batch, length1, length2] * [batch, length1, length2, 2*embedding] -> [batch, length2, 2*embedding]
        question_weighted = tf.einsum('ijk,ijkl->ikl', att_question, question)

        # support to question attention
        # 1. filter important context words with max
        # 2. softmax over question to get the question words which are most relevant for the most relevant context words
        # max(S) = [batch, length1, length2] -> [batch, length1] = most important context
        max_support = tf.reduce_max(S, 2)
        # softmax over question -> [batch, length1]
        support_attention = tf.nn.softmax(max_support, 1)
        # support attention * support = weighted support
        # [batch, length1] * [batch, length1, length2, 2*embedding] = [batch, 2*embedding]
        support_weighted = tf.einsum('ij,ijkl->il', support_attention, support)
        # tile to have the same dimension
        # [batch, 2*embedding] -> [batch, length2, 2*embedding]
        support_weighted = tf.expand_dims(support_weighted, 1)
        support_weighted = tf.tile(support_weighted, [1, max_support_length, 1])

        # 6b. generate feature matrix
        # G(support, weighted question, weighted support) = G(h, *u, *h) = [h, *u, mul(h, *u), mul(h, h*)] = [batch, length2, embedding*8]
        G = tf.concat([encoded_support, question_weighted,
                       encoded_support * question_weighted, encoded_support * support_weighted], 2)

        # 8. BiLSTM(G) = M
        # start_index = M
        cell3 = tf.contrib.rnn.LSTMBlockFusedCell(size)
        start_index = fused_birnn(cell3, G, support_length, dtype=tf.float32,
                                  time_major=False, scope='start_index')[0]
        start_index = tf.concat(start_index, 2)
        start_index = tf.concat([start_index, G], 2)

        # BiLSTM(M) = M^2 = end_index
        cell4 = tf.contrib.rnn.LSTMBlockFusedCell(size)
        end_index = fused_birnn(cell4, start_index, support_length, dtype=tf.float32,
                                time_major=False, scope='end_index')[0]
        end_index = tf.concat(end_index, 2)
        end_index = tf.concat([end_index, G], 2)

        # 9. double cross-entropy loss (actually applied after this function)
        # 9a. prepare logits
        # 9b. prepare argmax for output module

        # 9a. prepare logits
        # start_index = [batch, length2, 10*embedding]
        # W_start_index = [10*embedding]
        # start_index * W_start_index = start_scores
        # [batch, length2, 10*embedding] * [10*embedding] = [batch, length2]
        start_scores = tf.einsum('ijk,k->ij', start_index, W_start_index)
        # end_index = [batch, length2, 10*emb]
        # W_end_index = [10*emb]
        # end_index * W_end_index = end_scores
        # [batch, length2, 10*emb] * [10*emb] = [batch, length2]
        end_scores = tf.einsum('ijk,k->ij', end_index, W_end_index)

        # mask out-of-bounds slots by adding -1000
        support_mask = mask_for_lengths(support_length)
        start_scores = start_scores + support_mask
        end_scores = end_scores + support_mask

        # 9b. prepare argmax for output module
        predicted_start_pointer = tf.argmax(start_scores, 1)
        predicted_end_pointer = tf.argmax(end_scores, 1)

        span = tf.concat([tf.expand_dims(predicted_start_pointer, 1),
                          tf.expand_dims(predicted_end_pointer, 1)], 1)

        return start_scores, end_scores, span
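# --- Hypothetical shape sketch (illustrative only, not part of the original model). ---
# The biattention above scores every (question position, support position) pair with
# S = w^T [H; U; H*U]; the toy sizes below are assumptions showing that the einsum only
# reduces the feature axis, leaving a [batch, L1, L2] similarity matrix.
def _biattention_similarity_shape_example():
    batch, l1, l2, d = 2, 3, 4, 5                  # d plays the role of 2*embedding
    question = tf.zeros([batch, l1, l2, d])        # U tiled over support positions
    support = tf.zeros([batch, l1, l2, d])         # H tiled over question positions
    w = tf.zeros([3 * d])                          # weight vector over [H; U; H*U]
    features = tf.concat([support, question, question * support], 3)  # [batch, l1, l2, 3*d]
    S = tf.einsum('ijkl,l->ijk', features, w)      # [batch, l1, l2]
    return S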
def fastqa_answer_layer(size, encoded_question, question_length, encoded_support, support_length,
                        correct_start, answer2question, is_eval, beam_size=1):
    beam_size = tf.cond(is_eval, lambda: tf.constant(beam_size, tf.int32), lambda: tf.constant(1, tf.int32))
    batch_size = tf.shape(question_length)[0]
    answer2question = tf.cond(is_eval, lambda: tf.range(0, batch_size, dtype=tf.int32), lambda: answer2question)
    input_size = encoded_support.get_shape()[-1].value
    support_states_flat = tf.reshape(encoded_support, [-1, input_size])

    # computing single time attention over question
    attention_scores = tf.contrib.layers.fully_connected(encoded_question, 1, activation_fn=None,
                                                         weights_initializer=None, biases_initializer=None,
                                                         scope="question_attention")
    q_mask = misc.mask_for_lengths(question_length)
    attention_scores = attention_scores + tf.expand_dims(q_mask, 2)
    question_attention_weights = tf.nn.softmax(attention_scores, 1, name="question_attention_weights")
    question_state = tf.reduce_sum(question_attention_weights * encoded_question, [1])

    # Prediction
    # start
    start_input = tf.concat([tf.expand_dims(question_state, 1) * encoded_support, encoded_support], 2)

    q_start_inter = tf.contrib.layers.fully_connected(question_state, size, activation_fn=None,
                                                      weights_initializer=None, scope="q_start_inter")

    q_start_state = tf.contrib.layers.fully_connected(start_input, size, activation_fn=None,
                                                      weights_initializer=None, biases_initializer=None,
                                                      scope="q_start") + tf.expand_dims(q_start_inter, 1)

    start_scores = tf.contrib.layers.fully_connected(tf.nn.relu(q_start_state), 1, activation_fn=None,
                                                     weights_initializer=None, biases_initializer=None,
                                                     scope="start_scores")
    start_scores = tf.squeeze(start_scores, [2])

    support_mask = misc.mask_for_lengths(support_length)
    start_scores = start_scores + support_mask

    # probs are needed during beam search
    start_probs = tf.nn.softmax(start_scores)
    predicted_start_probs, predicted_start_pointer = tf.nn.top_k(start_probs, beam_size)

    # use correct start during training, because p(end|start) should be optimized
    predicted_start_pointer = tf.gather(predicted_start_pointer, answer2question)
    predicted_start_probs = tf.gather(predicted_start_probs, answer2question)
    start_pointer = tf.cond(is_eval, lambda: predicted_start_pointer, lambda: tf.expand_dims(correct_start, 1))

    # flatten again
    start_pointer = tf.reshape(start_pointer, [-1])
    answer2questionwithbeam = tf.reshape(
        tf.tile(tf.expand_dims(answer2question, 1), tf.stack([1, beam_size])), [-1])

    offsets = tf.cast(tf.range(0, batch_size) * tf.reduce_max(support_length), dtype=tf.int32)
    offsets = tf.gather(offsets, answer2questionwithbeam)

    u_s = tf.gather(support_states_flat, start_pointer + offsets)

    start_scores = tf.gather(start_scores, answer2questionwithbeam)
    start_input = tf.gather(start_input, answer2questionwithbeam)
    encoded_support = tf.gather(encoded_support, answer2questionwithbeam)
    question_state = tf.gather(question_state, answer2questionwithbeam)
    support_mask = tf.gather(support_mask, answer2questionwithbeam)

    # end
    end_input = tf.concat([tf.expand_dims(u_s, 1) * encoded_support, start_input], 2)

    q_end_inter = tf.contrib.layers.fully_connected(tf.concat([question_state, u_s], 1), size,
                                                    activation_fn=None, weights_initializer=None,
                                                    scope="q_end_inter")

    q_end_state = tf.contrib.layers.fully_connected(end_input, size, activation_fn=None,
                                                    weights_initializer=None, biases_initializer=None,
                                                    scope="q_end") + tf.expand_dims(q_end_inter, 1)

    end_scores = tf.contrib.layers.fully_connected(tf.nn.relu(q_end_state), 1, activation_fn=None,
                                                   weights_initializer=None, biases_initializer=None,
                                                   scope="end_scores")
    end_scores = tf.squeeze(end_scores, [2])
    end_scores = end_scores + support_mask

    def mask_with_start(scores):
        return scores + misc.mask_for_lengths(tf.cast(start_pointer, tf.int32),
                                              tf.reduce_max(support_length), mask_right=False)

    end_scores = tf.cond(is_eval, lambda: mask_with_start(end_scores), lambda: end_scores)

    # probs are needed during beam search
    end_probs = tf.nn.softmax(end_scores)
    predicted_end_probs, predicted_end_pointer = tf.nn.top_k(end_probs, 1)
    predicted_end_probs = tf.reshape(predicted_end_probs, tf.stack([-1, beam_size]))
    predicted_end_pointer = tf.reshape(predicted_end_pointer, tf.stack([-1, beam_size]))

    predicted_idx = tf.cast(tf.argmax(predicted_start_probs * predicted_end_probs, 1), tf.int32)
    predicted_idx = tf.stack([tf.range(0, tf.shape(answer2question)[0], dtype=tf.int32), predicted_idx], 1)

    predicted_start_pointer = tf.gather_nd(predicted_start_pointer, predicted_idx)
    predicted_end_pointer = tf.gather_nd(predicted_end_pointer, predicted_idx)

    return start_scores, end_scores, predicted_start_pointer, predicted_end_pointer
def san_answer_layer(size, encoded_question, question_length, encoded_support, support_length,
                     support2question, answer2support, is_eval, topk=1, max_span_size=10000,
                     num_steps=5, dropout=0.4, **kwargs):
    question_state = compute_question_state(encoded_question, question_length)
    question_state = tf.layers.dense(question_state, encoded_support.get_shape()[-1].value, tf.tanh)
    question_state = tf.gather(question_state, support2question)

    cell = tf.contrib.rnn.GRUBlockCell(size)

    all_start_scores = []
    all_end_scores = []
    support_mask = misc.mask_for_lengths(support_length)
    for i in range(num_steps):
        with tf.variable_scope('SAN', reuse=i > 0):
            question_state = tf.expand_dims(question_state, 1)
            support_attn = attention.bilinear_attention(
                question_state, encoded_support, support_length, False, False)[2]
            question_state = tf.squeeze(question_state, 1)
            support_attn = tf.squeeze(support_attn, 1)
            question_state = cell(support_attn, question_state)[0]

            hidden_start = tf.layers.dense(question_state, size, name="hidden_start")
            start_scores = tf.einsum('ik,ijk->ij', hidden_start, encoded_support)
            start_scores = start_scores + support_mask

            start_probs = segment_softmax(start_scores, support2question)
            start_states = tf.einsum('ij,ijk->ik', start_probs, encoded_support)
            start_states = tf.unsorted_segment_sum(start_states, support2question, tf.shape(question_length)[0])
            start_states = tf.gather(start_states, support2question)

            hidden_end = tf.layers.dense(tf.concat([question_state, start_states], 1), size, name="hidden_end")
            end_scores = tf.einsum('ik,ijk->ij', hidden_end, encoded_support)
            end_scores = end_scores + support_mask

            all_start_scores.append(start_scores)
            all_end_scores.append(end_scores)

    all_start_scores = tf.stack(all_start_scores)
    all_end_scores = tf.stack(all_end_scores)
    # stochastic prediction dropout over the reasoning steps is applied during training only;
    # at evaluation time all step scores are averaged
    dropout_mask = tf.nn.dropout(tf.ones([num_steps, 1, 1]), 1.0 - dropout)
    all_start_scores = tf.cond(is_eval, lambda: all_start_scores, lambda: all_start_scores * dropout_mask)
    all_end_scores = tf.cond(is_eval, lambda: all_end_scores, lambda: all_end_scores * dropout_mask)
    start_scores = tf.reduce_mean(all_start_scores, axis=0)
    end_scores = tf.reduce_mean(all_end_scores, axis=0)

    return compute_spans(start_scores, end_scores, answer2support, is_eval, support2question,
                         topk=topk, max_span_size=max_span_size)
def conditional_answer_layer(size, encoded_question, question_length, encoded_support, support_length,
                             correct_start, support2question, answer2support, is_eval,
                             beam_size=1, max_span_size=10000, bilinear=False):
    question_state = compute_question_state(encoded_question, question_length)
    question_state = tf.gather(question_state, support2question)

    # Prediction
    # start
    if bilinear:
        hidden_start = tf.layers.dense(question_state, size, name="hidden_start")
        start_scores = tf.einsum('ik,ijk->ij', hidden_start, encoded_support)
    else:
        static_input = tf.concat([tf.expand_dims(question_state, 1) * encoded_support, encoded_support], 2)
        hidden_start = tf.layers.dense(question_state, size, name="hidden_start_1")
        hidden_start = tf.layers.dense(
            static_input, size, use_bias=False, name="hidden_start_2") + tf.expand_dims(hidden_start, 1)
        start_scores = tf.layers.dense(tf.nn.relu(hidden_start), 1, use_bias=False, name="start_scores")
        start_scores = tf.squeeze(start_scores, [2])

    support_mask = misc.mask_for_lengths(support_length)
    start_scores = start_scores + support_mask

    max_support_length = tf.shape(start_scores)[1]
    _, _, num_doc_per_question = tf.unique_with_counts(support2question)
    offsets = tf.cumsum(num_doc_per_question, exclusive=True)
    doc_idx_for_support = tf.range(tf.shape(support2question)[0]) - tf.gather(offsets, support2question)

    doc_idx, start_pointer = tf.cond(
        is_eval,
        lambda: segment_top_k(start_scores, support2question, beam_size)[:2],
        lambda: (tf.expand_dims(answer2support, 1), tf.expand_dims(correct_start, 1)))

    doc_idx_flat = tf.reshape(doc_idx, [-1])
    start_pointer = tf.reshape(start_pointer, [-1])

    start_state = tf.gather_nd(encoded_support, tf.stack([doc_idx_flat, start_pointer], 1))
    start_state.set_shape([None, size])

    encoded_support_gathered = tf.gather(encoded_support, doc_idx_flat)
    question_state = tf.gather(question_state, doc_idx_flat)
    if bilinear:
        hidden_end = tf.layers.dense(tf.concat([question_state, start_state], 1), size, name="hidden_end")
        end_scores = tf.einsum('ik,ijk->ij', hidden_end, encoded_support_gathered)
    else:
        end_input = tf.concat([tf.expand_dims(start_state, 1) * encoded_support_gathered,
                               tf.gather(static_input, doc_idx_flat)], 2)
        hidden_end = tf.layers.dense(tf.concat([question_state, start_state], 1), size, name="hidden_end_1")
        hidden_end = tf.layers.dense(
            end_input, size, use_bias=False, name="hidden_end_2") + tf.expand_dims(hidden_end, 1)
        end_scores = tf.layers.dense(tf.nn.relu(hidden_end), 1, use_bias=False, name="end_scores")
        end_scores = tf.squeeze(end_scores, [2])

    end_scores = end_scores + tf.gather(support_mask, doc_idx_flat)

    def train():
        predicted_end_pointer = tf.argmax(end_scores, axis=1, output_type=tf.int32)
        return start_scores, end_scores, doc_idx, start_pointer, predicted_end_pointer

    def eval():
        # [num_questions * beam_size, support_length]
        left_mask = misc.mask_for_lengths(tf.cast(start_pointer, tf.int32),
                                          max_support_length, mask_right=False)
        right_mask = misc.mask_for_lengths(tf.cast(start_pointer + max_span_size, tf.int32),
                                           max_support_length)
        masked_end_scores = end_scores + left_mask + right_mask
        predicted_ends = tf.argmax(masked_end_scores, axis=1, output_type=tf.int32)
        return (start_scores, masked_end_scores, tf.gather(doc_idx_for_support, doc_idx_flat),
                start_pointer, predicted_ends)

    return tf.cond(is_eval, eval, train)
def create_output(self, shared_vocab_config, emb_question, question_length, emb_support, support_length,
                  unique_word_chars, unique_word_char_length, question_words2unique, support_words2unique,
                  word_in_question, correct_start, answer2question, keep_prob, is_eval):
    """FastQA model.

    Args:
        shared_vocab_config: has at least a field config (dict) with keys "repr_dim", "repr_dim_input"
        emb_question: [Q, L_q, N]
        question_length: [Q]
        emb_support: [Q, L_s, N]
        support_length: [Q]
        unique_word_chars
        unique_word_char_length
        question_words2unique
        support_words2unique
        word_in_question: [Q, L_s]
        correct_start: [A], only during training, i.e., is_eval=False
        answer2question: [A], only during training, i.e., is_eval=False
        keep_prob: []
        is_eval: []

    Returns:
        start_scores [B, L_s, N], end_scores [B, L_s, N], span_prediction [B, 2]
    """
    with tf.variable_scope("fast_qa", initializer=tf.contrib.layers.xavier_initializer()):
        # Some helpers
        batch_size = tf.shape(question_length)[0]
        max_question_length = tf.reduce_max(question_length)
        support_mask = misc.mask_for_lengths(support_length)
        question_binary_mask = misc.mask_for_lengths(question_length, mask_right=False, value=1.0)

        input_size = shared_vocab_config.config["repr_dim_input"]
        size = shared_vocab_config.config["repr_dim"]
        with_char_embeddings = shared_vocab_config.config.get("with_char_embeddings", False)

        # set shapes for inputs
        emb_question.set_shape([None, None, input_size])
        emb_support.set_shape([None, None, input_size])

        if with_char_embeddings:
            # compute combined embeddings
            [char_emb_question, char_emb_support] = conv_char_embedding_alt(
                shared_vocab_config.char_vocab, size, unique_word_chars, unique_word_char_length,
                [question_words2unique, support_words2unique])

            emb_question = tf.concat([emb_question, char_emb_question], 2)
            emb_support = tf.concat([emb_support, char_emb_support], 2)
            input_size += size

            # set shapes for inputs
            emb_question.set_shape([None, None, input_size])
            emb_support.set_shape([None, None, input_size])

        # compute encoder features
        question_features = tf.ones(tf.stack([batch_size, max_question_length, 2]))

        v_wiqw = tf.get_variable("v_wiq_w", [1, 1, input_size], initializer=tf.constant_initializer(1.0))

        wiq_w = tf.matmul(emb_question * v_wiqw, emb_support, adjoint_b=True)
        wiq_w = wiq_w + tf.expand_dims(support_mask, 1)
        wiq_w = tf.reduce_sum(tf.nn.softmax(wiq_w) * tf.expand_dims(question_binary_mask, 2), [1])

        # [B, L, 2]
        support_features = tf.concat([tf.expand_dims(word_in_question, 2), tf.expand_dims(wiq_w, 2)], 2)

        # highway layer to allow for interaction between concatenated embeddings
        if with_char_embeddings:
            all_embedded = tf.concat([emb_question, emb_support], 1)
            all_embedded = tf.contrib.layers.fully_connected(all_embedded, size, activation_fn=None,
                                                             weights_initializer=None, biases_initializer=None,
                                                             scope="embeddings_projection")
            all_embedded_hw = highway_network(all_embedded, 1)

            emb_question = tf.slice(all_embedded_hw, [0, 0, 0], tf.stack([-1, max_question_length, -1]))
            emb_support = tf.slice(all_embedded_hw, tf.stack([0, max_question_length, 0]), [-1, -1, -1])

            emb_question.set_shape([None, None, size])
            emb_support.set_shape([None, None, size])

        # variational dropout
        dropout_shape = tf.unstack(tf.shape(emb_question))
        dropout_shape[1] = 1

        [emb_question, emb_support] = tf.cond(is_eval,
                                              lambda: [emb_question, emb_support],
                                              lambda: fixed_dropout([emb_question, emb_support],
                                                                    keep_prob, dropout_shape))

        # extend embeddings with features
        emb_question_ext = tf.concat([emb_question, question_features], 2)
        emb_support_ext = tf.concat([emb_support, support_features], 2)

        # encode question and support
        rnn = tf.contrib.rnn.LSTMBlockFusedCell
        encoded_question = birnn_with_projection(size, rnn, emb_question_ext, question_length,
                                                 projection_scope="question_proj")
        encoded_support = birnn_with_projection(size, rnn, emb_support_ext, support_length,
                                                share_rnn=True, projection_scope="support_proj")

        start_scores, end_scores, predicted_start_pointer, predicted_end_pointer = \
            fastqa_answer_layer(size, encoded_question, question_length, encoded_support, support_length,
                                correct_start, answer2question, is_eval,
                                beam_size=shared_vocab_config.config.get("beam_size", 1))

        span = tf.stack([predicted_start_pointer, predicted_end_pointer], 1)

        return start_scores, end_scores, span
def attention_softmax(attn_scores, length=None):
    # broadcast the [B, L] length mask over the middle (query) dimension of the [B, T, L] scores
    attn_scores += tf.expand_dims(misc.mask_for_lengths(length, tf.shape(attn_scores)[2]), 1)
    return tf.nn.softmax(attn_scores)
def conv_char_embeddings(vocab, size, word_ids, conv_width=5,
                         emb_initializer=tf.random_normal_initializer(0.0, 0.1), scope=None):
    """
    Args:
        vocab: filled Vocab instance
        size: size of embeddings
        word_ids: tf.Tensor[None, None] or list of tensors
        conv_width: int
        emb_initializer: initializer
        scope: scope

    Returns:
        char embedded word ids
    """
    if not isinstance(word_ids, list):
        word_ids = [word_ids]

    # create character vocab + word lengths + char ids per word
    pad_right = math.ceil(conv_width / 2)  # "fixed PAD on right side"
    vocab_size = max(vocab.sym2id.values()) + 1
    max_l = max(len(w) for w in vocab.sym2id) + pad_right

    char_vocab = defaultdict(lambda: len(char_vocab))
    char_vocab["PAD"] = 0
    word_to_chars_arr = np.zeros((vocab_size, max_l), np.int16)
    word_lengths_arr = np.zeros([vocab_size], np.int8)
    for w, i in vocab.sym2id.items():
        for k, c in enumerate(w):
            j = char_vocab[c]
            word_to_chars_arr[i, k] = j
        word_lengths_arr[i] = len(w) + conv_width - 1

    with tf.variable_scope(scope or "char_embeddings") as vs:
        word_to_chars = tf.constant(word_to_chars_arr, name="word_to_chars")
        word_lengths = tf.constant(word_lengths_arr, name="word_lengths")
        char_embedding_matrix = \
            tf.get_variable("char_embedding_matrix", shape=(len(char_vocab), size),
                            initializer=emb_initializer, trainable=True)

        all_embedded = []
        # iterate over the word id tensors directly (each `ids` is a [batch, length] tensor)
        for i, ids in enumerate(word_ids):
            if i > 0:
                vs.reuse_variables()
            unique_words, word_idx = tf.unique(tf.reshape(ids, [-1]))
            chars = tf.nn.embedding_lookup(word_to_chars, unique_words)
            wl = tf.nn.embedding_lookup(word_lengths, unique_words)
            wl = tf.cast(wl, tf.int32)
            max_word_length = tf.reduce_max(wl)
            chars = tf.slice(chars, [0, 0], tf.stack([-1, max_word_length]))

            embedded_chars = tf.nn.embedding_lookup(char_embedding_matrix, tf.cast(chars, tf.int32))

            with tf.variable_scope("conv"):
                # create filter like this to get fan-in and fan-out right for initializers depending on those
                filter = tf.get_variable("filter", [conv_width * size, size])
                filter_reshaped = tf.reshape(filter, [conv_width, size, size])
                conv_out = tf.nn.conv1d(embedded_chars, filter_reshaped, 1, "SAME")
                conv_mask = tf.expand_dims(misc.mask_for_lengths(wl - pad_right, max_length=max_word_length), 2)
                conv_out = conv_out + conv_mask
                unique_embedded_words = tf.reduce_max(conv_out, [1])

            embedded_words = tf.gather(unique_embedded_words, word_idx)
            embedded_words = tf.reshape(embedded_words, tf.stack([-1, tf.unstack(tf.shape(ids))[1], size]))
            all_embedded.append(embedded_words)

    return all_embedded
def create_output(self, shared_vocab_config, emb_question, question_length, emb_support, support_length,
                  unique_word_chars, unique_word_char_length, question_words2unique, support_words2unique,
                  word_in_question, correct_start, answer2question, keep_prob, is_eval, answer_type_span):
    """cbow_baseline_model.

    Args:
        shared_vocab_config: has at least a field config (dict) with keys "repr_dim", "repr_dim_input"
        emb_question: [Q, L_q, N]
        question_length: [Q]
        emb_support: [Q, L_s, N]
        support_length: [Q]
        unique_word_chars
        unique_word_char_length
        question_words2unique
        support_words2unique
        word_in_question: [Q, L_s]
        correct_start: [A], only during training, i.e., is_eval=False
        answer2question: [A], only during training, i.e., is_eval=False
        keep_prob: []
        is_eval: []
        answer_type_span: [Q, 2], span within question marking the expected answer type

    Returns:
        start_scores [B, L_s, N], end_scores [B, L_s, N], span_prediction [B, 2]
    """
    with tf.variable_scope("cbow_xqa", initializer=tf.contrib.layers.xavier_initializer()):
        # Some helpers
        batch_size = tf.shape(question_length)[0]
        max_support_length = tf.reduce_max(support_length)
        max_question_length = tf.reduce_max(question_length)

        input_size = shared_vocab_config.config["repr_dim_input"]
        size = shared_vocab_config.config["repr_dim"]
        with_char_embeddings = shared_vocab_config.config.get("with_char_embeddings", False)

        # set shapes for inputs
        emb_question.set_shape([None, None, input_size])
        emb_support.set_shape([None, None, input_size])

        if with_char_embeddings:
            # compute combined embeddings
            [char_emb_question, char_emb_support] = conv_char_embedding_alt(
                shared_vocab_config.char_vocab, size, unique_word_chars, unique_word_char_length,
                [question_words2unique, support_words2unique])

            emb_question = tf.concat([emb_question, char_emb_question], 2)
            emb_support = tf.concat([emb_support, char_emb_support], 2)
            input_size += size

            # set shapes for inputs
            emb_question.set_shape([None, None, input_size])
            emb_support.set_shape([None, None, input_size])

        # variational dropout
        dropout_shape = tf.unstack(tf.shape(emb_question))
        dropout_shape[1] = 1

        [emb_question, emb_support] = tf.cond(is_eval,
                                              lambda: [emb_question, emb_support],
                                              lambda: fixed_dropout([emb_question, emb_support],
                                                                    keep_prob, dropout_shape))

        # question encoding
        answer_type_start = tf.squeeze(tf.slice(answer_type_span, [0, 0], [-1, 1]), axis=1)
        answer_type_end = tf.squeeze(tf.slice(answer_type_span, [0, 1], [-1, -1]), axis=1)

        answer_type_mask = misc.mask_for_lengths(answer_type_start, max_question_length, value=1.0) * \
                           misc.mask_for_lengths(answer_type_end + 1, max_question_length,
                                                 mask_right=False, value=1.0)
        answer_type = tf.reduce_sum(emb_question * tf.expand_dims(answer_type_mask, 2), 1) / \
                      tf.maximum(1.0, tf.reduce_sum(answer_type_mask, 1, keep_dims=True))

        batch_size_range = tf.range(0, batch_size)
        answer_type_start_state = tf.gather_nd(emb_question,
                                               tf.stack([batch_size_range, answer_type_start], 1))
        answer_type_end_state = tf.gather_nd(emb_question,
                                             tf.stack([batch_size_range, answer_type_end], 1))

        question_rep = tf.concat([answer_type, answer_type_start_state, answer_type_end_state], 1)
        question_rep.set_shape([None, input_size * 3])

        # wiq features
        support_mask = misc.mask_for_lengths(support_length)
        question_binary_mask = misc.mask_for_lengths(question_length, mask_right=False, value=1.0)

        v_wiqw = tf.get_variable("v_wiq_w", [1, 1, input_size], initializer=tf.constant_initializer(1.0))

        wiq_w = tf.matmul(emb_question * v_wiqw, emb_support, adjoint_b=True)
        wiq_w = wiq_w + tf.expand_dims(support_mask, 1)
        wiq_w = tf.reduce_sum(tf.nn.softmax(wiq_w) * tf.expand_dims(question_binary_mask, 2), [1])

        wiq_exp = tf.stack([word_in_question, wiq_w], 2)

        # support span encoding
        spans = [tf.stack([tf.range(0, max_support_length), tf.range(0, max_support_length)], 1)]

        wiq_exp = tf.pad(wiq_exp, [[0, 0], [20, 20], [0, 0]])

        wiq_pooled5 = tf.layers.average_pooling1d(
            tf.slice(wiq_exp, [0, 15, 0], tf.stack([-1, max_support_length + 10, -1])), 5, [1], 'valid')
        wiq_pooled10 = tf.layers.average_pooling1d(
            tf.slice(wiq_exp, [0, 10, 0], tf.stack([-1, max_support_length + 20, -1])), 10, [1], 'valid')
        wiq_pooled20 = tf.layers.average_pooling1d(wiq_exp, 20, [1], 'valid')

        wiqs_left5 = [tf.slice(wiq_pooled5, [0, 0, 0], tf.stack([-1, max_support_length, -1]))]
        wiqs_right5 = [tf.slice(wiq_pooled5, [0, 6, 0], [-1, -1, -1])]
        wiqs_left10 = [tf.slice(wiq_pooled10, [0, 0, 0], tf.stack([-1, max_support_length, -1]))]
        wiqs_right10 = [tf.slice(wiq_pooled10, [0, 11, 0], [-1, -1, -1])]
        wiqs_left20 = [tf.slice(wiq_pooled20, [0, 0, 0], tf.stack([-1, max_support_length, -1]))]
        wiqs_right20 = [tf.slice(wiq_pooled20, [0, 21, 0], [-1, -1, -1])]

        context_window = 5
        padded_support = tf.pad(emb_support, [[0, 0], [context_window, context_window], [0, 0]], "CONSTANT")
        # [B, L + 10 - 4, S]
        emb_support_windows = tf.layers.average_pooling1d(padded_support, 5, [1], "VALID", "channels_last")

        left_context_windows = tf.slice(emb_support_windows, [0, 0, 0],
                                        tf.stack([-1, max_support_length, -1]))
        right_context_windows = tf.slice(emb_support_windows, [0, context_window + 1, 0], [-1, -1, -1])

        span_rep = [tf.concat([emb_support, emb_support, emb_support,
                               left_context_windows, right_context_windows], 2)]

        for window_size in range(2, _max_span_size + 1):
            start = tf.slice(emb_support, [0, 0, 0],
                             tf.stack([-1, max_support_length - (window_size - 1), -1]))
            end = tf.slice(emb_support, [0, window_size - 1, 0], [-1, -1, -1])

            averagespan = tf.layers.average_pooling1d(emb_support, window_size, [1], "VALID", "channels_last")

            left_context_windows = tf.slice(emb_support_windows, [0, 0, 0],
                                            tf.stack([-1, max_support_length - (window_size - 1), -1]))
            right_context_windows = tf.slice(emb_support_windows,
                                             [0, window_size - 1 + context_window + 1, 0], [-1, -1, -1])

            span_rep.append(tf.concat([averagespan, start, end,
                                       left_context_windows, right_context_windows], 2))

            wiqs_left5.append(tf.slice(wiq_pooled5, [0, 0, 0],
                                       tf.stack([-1, max_support_length - (window_size - 1), -1])))
            wiqs_left10.append(tf.slice(wiq_pooled10, [0, 0, 0],
                                        tf.stack([-1, max_support_length - (window_size - 1), -1])))
            wiqs_left20.append(tf.slice(wiq_pooled20, [0, 0, 0],
                                        tf.stack([-1, max_support_length - (window_size - 1), -1])))
            wiqs_right5.append(tf.slice(wiq_pooled5, [0, window_size + 5, 0], [-1, -1, -1]))
            wiqs_right10.append(tf.slice(wiq_pooled10, [0, window_size + 10, 0], [-1, -1, -1]))
            wiqs_right20.append(tf.slice(wiq_pooled20, [0, window_size + 20, 0], [-1, -1, -1]))

            spans.append(tf.stack([tf.range(0, max_support_length - (window_size - 1)),
                                   tf.range(window_size - 1, max_support_length)], 1))

        span_rep = tf.concat(span_rep, 1)
        span_rep.set_shape([None, None, input_size * 5])
        wiqs_left5 = tf.concat(wiqs_left5, 1)
        wiqs_left10 = tf.concat(wiqs_left10, 1)
        wiqs_left20 = tf.concat(wiqs_left20, 1)
        wiqs_right5 = tf.concat(wiqs_right5, 1)
        wiqs_right10 = tf.concat(wiqs_right10, 1)
        wiqs_right20 = tf.concat(wiqs_right20, 1)

        spans = tf.concat(spans, 0)

        # scoring
        with tf.variable_scope("question_rep"):
            question_rep = tf.layers.dense(question_rep, size, activation=tf.tanh)
        with tf.variable_scope("question_inter"):
            question_inter = tf.layers.dense(question_rep, size, activation=None)

        with tf.variable_scope("span_rep"):
            span_rep = tf.layers.dense(span_rep, size, activation=tf.tanh)

        span_question_rep = tf.concat([span_rep, tf.expand_dims(question_rep, 1) * span_rep,
                                       wiqs_left5, wiqs_left10, wiqs_left20,
                                       wiqs_right5, wiqs_right10, wiqs_right20], 2)
        span_question_rep.set_shape([None, None, 2 * size + 6 * 2])

        with tf.variable_scope("hidden"):
            h = tf.tanh(tf.layers.dense(span_question_rep, size, activation=None) +
                        tf.expand_dims(question_inter, 1))

        with tf.variable_scope("scoring"):
            span_scores = tf.squeeze(tf.layers.dense(h, 1, activation=None), 2)

        best_span = tf.argmax(span_scores, 1)
        predicted_span = tf.gather(spans, best_span)

        return span_scores, tf.tile(tf.expand_dims(spans, 0), tf.stack([batch_size, 1, 1])), predicted_span
def create_output(self, shared_resources, input_tensors):
    tensors = TensorPortTensors(input_tensors)
    with tf.variable_scope("fast_qa", initializer=tf.contrib.layers.xavier_initializer()):
        # Some helpers
        batch_size = tf.shape(tensors.question_length)[0]
        max_question_length = tf.reduce_max(tensors.question_length)
        support_mask = misc.mask_for_lengths(tensors.support_length)

        input_size = shared_resources.config["repr_dim_input"]
        size = shared_resources.config["repr_dim"]
        with_char_embeddings = shared_resources.config.get("with_char_embeddings", False)

        # set shapes for inputs
        tensors.emb_question.set_shape([None, None, input_size])
        tensors.emb_support.set_shape([None, None, input_size])

        emb_question = tensors.emb_question
        emb_support = tensors.emb_support
        if with_char_embeddings:
            # compute combined embeddings
            [char_emb_question, char_emb_support] = conv_char_embedding(
                len(shared_resources.char_vocab), size, tensors.word_chars, tensors.word_char_length,
                [tensors.question_words, tensors.support_words])

            emb_question = tf.concat([emb_question, char_emb_question], 2)
            emb_support = tf.concat([emb_support, char_emb_support], 2)
            input_size += size

            # set shapes for inputs
            emb_question.set_shape([None, None, input_size])
            emb_support.set_shape([None, None, input_size])

        # compute encoder features
        question_features = tf.ones(tf.stack([batch_size, max_question_length, 2]))

        v_wiqw = tf.get_variable("v_wiq_w", [1, 1, input_size], initializer=tf.constant_initializer(1.0))

        wiq_w = tf.matmul(tf.gather(emb_question * v_wiqw, tensors.support2question), emb_support,
                          adjoint_b=True)
        wiq_w = wiq_w + tf.expand_dims(support_mask, 1)

        question_binary_mask = tf.gather(tf.sequence_mask(tensors.question_length, dtype=tf.float32),
                                         tensors.support2question)
        wiq_w = tf.reduce_sum(tf.nn.softmax(wiq_w) * tf.expand_dims(question_binary_mask, 2), [1])

        # [B, L, 2]
        support_features = tf.stack([tensors.word_in_question, wiq_w], 2)

        # highway layer to allow for interaction between concatenated embeddings
        if with_char_embeddings:
            with tf.variable_scope("char_embeddings") as vs:
                emb_question = tf.layers.dense(emb_question, size, name="embeddings_projection")
                emb_question = highway_network(emb_question, 1)
                vs.reuse_variables()
                emb_support = tf.layers.dense(emb_support, size, name="embeddings_projection")
                emb_support = highway_network(emb_support, 1)

        keep_prob = 1.0 - shared_resources.config.get("dropout", 0.0)
        emb_question, emb_support = tf.cond(
            tensors.is_eval,
            lambda: (emb_question, emb_support),
            lambda: (tf.nn.dropout(emb_question, keep_prob,
                                   noise_shape=[1, 1, emb_question.get_shape()[-1].value]),
                     tf.nn.dropout(emb_support, keep_prob,
                                   noise_shape=[1, 1, emb_question.get_shape()[-1].value])))

        # extend embeddings with features
        emb_question_ext = tf.concat([emb_question, question_features], 2)
        emb_support_ext = tf.concat([emb_support, support_features], 2)

        # encode question and support
        encoder_type = shared_resources.config.get('encoder', 'lstm').lower()
        if encoder_type in ['lstm', 'sru', 'gru']:
            size = size + 2 if encoder_type == 'sru' else size  # to allow for use of residual in SRU
            encoded_question = encoder(emb_question_ext, tensors.question_length, size, module=encoder_type)
            encoded_support = encoder(emb_support_ext, tensors.support_length, size,
                                      module=encoder_type, reuse=True)
            projection_initializer = tf.constant_initializer(np.concatenate([np.eye(size), np.eye(size)]))
            encoded_question = tf.layers.dense(encoded_question, size, tf.tanh, use_bias=False,
                                               kernel_initializer=projection_initializer, name='projection_q')
            encoded_support = tf.layers.dense(encoded_support, size, tf.tanh, use_bias=False,
                                              kernel_initializer=projection_initializer, name='projection_s')
        else:
            raise ValueError("Only rnn ('lstm', 'sru', 'gru') encoder allowed for FastQA!")

        answer_layer = shared_resources.config.get('answer_layer', 'conditional').lower()

        if answer_layer == 'conditional':
            start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                conditional_answer_layer(
                    size, encoded_question, tensors.question_length, encoded_support, tensors.support_length,
                    tensors.correct_start, tensors.support2question, tensors.answer2support, tensors.is_eval,
                    beam_size=shared_resources.config.get("beam_size", 1),
                    max_span_size=shared_resources.config.get("max_span_size", 10000))
        elif answer_layer == 'conditional_bilinear':
            start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                conditional_answer_layer(
                    size, encoded_question, tensors.question_length, encoded_support, tensors.support_length,
                    tensors.correct_start, tensors.support2question, tensors.answer2support, tensors.is_eval,
                    beam_size=shared_resources.config.get("beam_size", 1),
                    max_span_size=shared_resources.config.get("max_span_size", 10000),
                    bilinear=True)
        elif answer_layer == 'bilinear':
            start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                bilinear_answer_layer(
                    size, encoded_question, tensors.question_length, encoded_support, tensors.support_length,
                    tensors.support2question, tensors.answer2support, tensors.is_eval,
                    beam_size=shared_resources.config.get("beam_size", 1),
                    max_span_size=shared_resources.config.get("max_span_size", 10000))
        else:
            raise ValueError

        span = tf.stack([doc_idx, predicted_start_pointer, predicted_end_pointer], 1)

        return TensorPort.to_mapping(self.output_ports, (start_scores, end_scores, span))