def coattention(query, query_length, document, document_length, sentinel=False):
    """DCN+ Coattention layer.

    Args:
        query: Tensor of rank 3, shape [N, Q, 2H].
        query_length: Tensor of rank 1, shape [N]. Lengths of queries without sentinel.
        document: Tensor of rank 3, shape [N, D, 2H].
        document_length: Tensor of rank 1, shape [N]. Lengths of documents without sentinel.
        sentinel: Scalar boolean. If True, sentinel vectors are temporarily concatenated
            along the query's and document's second dimension, letting the attention
            focus on nothing.

    Returns:
        A tuple containing:
            summary matrix of the query, shape [N, Q, 2H].
            summary matrix of the document, shape [N, D, 2H].
            coattention matrix of the document and query in document space, shape [N, D, 2H].
    """
    if sentinel:
        document = concat_sentinel('document_sentinel', document)
        document_length += 1
        query = concat_sentinel('query_sentinel', query)
        query_length += 1

    # Affinity scores between every document position and every query position.
    unmasked_affinity = tf.einsum('ndh,nqh->ndq', document, query)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
    affinity = maybe_mask_affinity(unmasked_affinity, document_length)
    attention_p = tf.nn.softmax(affinity, axis=1)

    unmasked_affinity_t = tf.transpose(unmasked_affinity, [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
    affinity_t = maybe_mask_affinity(unmasked_affinity_t, query_length)
    attention_q = tf.nn.softmax(affinity_t, axis=1)

    # Attention summaries of the document for the query and of the query for the document.
    summary_q = tf.einsum('ndh,ndq->nqh', document, attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
    summary_d = tf.einsum('nqh,nqd->ndh', query, attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel

    if sentinel:
        # Drop the sentinel rows/columns before the second-level attention.
        summary_d = summary_d[:, 1:, :]
        summary_q = summary_q[:, 1:, :]
        attention_q = attention_q[:, 1:, 1:]

    coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)
    return summary_q, summary_d, coattention_d
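# `concat_sentinel` and `maybe_mask_affinity` are used above but not defined in this excerpt.
# Minimal sketches of what they might look like (assumptions, not the project's actual helpers):
# concat_sentinel prepends a trainable sentinel vector along the sequence dimension, and
# maybe_mask_affinity pushes affinity scores at padded positions to a large negative value
# so the softmax effectively ignores them.
def concat_sentinel(sentinel_name, other):
    """Sketch: prepend a trainable sentinel vector to `other` along dimension 1."""
    sentinel = tf.get_variable(sentinel_name, other.get_shape()[2], tf.float32)
    sentinel = tf.tile(tf.expand_dims(tf.expand_dims(sentinel, 0), 0), (tf.shape(other)[0], 1, 1))
    return tf.concat([sentinel, other], 1)  # [N, 1+L, 2H]


def maybe_mask_affinity(affinity, sequence_length, mask_value=-1e30):
    """Sketch: fill affinity scores beyond `sequence_length` (along dimension 1) with mask_value."""
    if sequence_length is None:
        return affinity
    score_mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(affinity)[1])  # [N, L]
    score_mask = tf.tile(tf.expand_dims(score_mask, 2), (1, 1, tf.shape(affinity)[2]))  # [N, L, *]
    return tf.where(score_mask, affinity, mask_value * tf.ones_like(affinity))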
encoder = RNNEncoder(FLAGS.hidden_size, keep_prob)
context_hiddens = encoder.build_graph(context_embs, context_mask)  # (batch_size, context_len, hidden_size*2)
question_hiddens = encoder.build_graph(qn_embs, qn_mask)  # (batch_size, question_len, hidden_size*2)

# Non-linear projection of the question hidden states.
question_variation = tf.layers.dense(question_hiddens, question_hiddens.get_shape()[2], activation=tf.tanh)

# In[]
#question_length = tf.placeholder(tf.int32, (None,), name='question_length')
#document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')
question_length = tf.reduce_sum(qn_mask, axis=1)  # shape (batch_size)
document_length = tf.reduce_sum(context_mask, axis=1)  # shape (batch_size)

# First coattention layer.
unmasked_affinity = tf.einsum('ndh,nqh->ndq', context_hiddens, question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
affinity = maybe_mask_affinity(unmasked_affinity, document_length)
attention_p = tf.nn.softmax(affinity, axis=1)
unmasked_affinity_t = tf.transpose(unmasked_affinity, [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
affinity_t = maybe_mask_affinity(unmasked_affinity_t, question_length)
attention_q = tf.nn.softmax(affinity_t, axis=1)
summary_q = tf.einsum('ndh,ndq->nqh', context_hiddens, attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
summary_d = tf.einsum('nqh,nqd->ndh', question_variation, attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

# Second-level encoding and coattention.
encoder1 = RNNEncoder1(FLAGS.hidden_size, keep_prob)
context2 = encoder1.build_graph(summary_d, context_mask)  # (batch_size, context_len, hidden_size*2)
question2 = encoder1.build_graph(summary_q, qn_mask)  # (batch_size, question_len, hidden_size*2)
unmasked_affinity1 = tf.einsum('ndh,nqh->ndq', context2, question2)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
affinity1 = maybe_mask_affinity(unmasked_affinity1, document_length)
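# The inline block above mirrors the coattention() helper defined earlier. Assuming the
# masks are integer 0/1 tensors (so the reduce_sum lengths are valid sequence lengths),
# the first-level attention could equivalently be produced with a single call
# (a sketch, not part of the original script):
#
#     summary_q, summary_d, coattention_d = coattention(
#         question_variation, question_length, context_hiddens, document_length, sentinel=False)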
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input
    embeddings to the final distributions for the answer span.

    Defines:
        self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for
            the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed
            into the cross entropy function.
        self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
            Each row sums to 1. These are the result of taking (masked) softmax of
            logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Non-linear projection of the question hidden states.
    question_variation = tf.layers.dense(question_hiddens, question_hiddens.get_shape()[2], activation=tf.tanh)

    # In[]
    #question_length = tf.placeholder(tf.int32, (None,), name='question_length')
    #document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')

    # First coattention layer.
    unmasked_affinity = tf.einsum('ndh,nqh->ndq', context_hiddens, question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
    affinity = maybe_mask_affinity(unmasked_affinity, self.document_length)
    attention_p = tf.nn.softmax(affinity, axis=1)
    unmasked_affinity_t = tf.transpose(unmasked_affinity, [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
    affinity_t = maybe_mask_affinity(unmasked_affinity_t, self.question_length)
    attention_q = tf.nn.softmax(affinity_t, axis=1)
    summary_q = tf.einsum('ndh,ndq->nqh', context_hiddens, attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
    summary_d = tf.einsum('nqh,nqd->ndh', question_variation, attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
    coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

    # Second-level encoding and coattention.
    encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
    context2 = encoder1.build_graph(summary_d, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question2 = encoder1.build_graph(summary_q, self.qn_mask)  # (batch_size, question_len, hidden_size*2)
    unmasked_affinity1 = tf.einsum('ndh,nqh->ndq', context2, question2)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
    affinity1 = maybe_mask_affinity(unmasked_affinity1, self.document_length)
    attention_p1 = tf.nn.softmax(affinity1, axis=1)
    unmasked_affinity_t1 = tf.transpose(unmasked_affinity1, [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
    affinity_t1 = maybe_mask_affinity(unmasked_affinity_t1, self.question_length)
    attention_q1 = tf.nn.softmax(affinity_t1, axis=1)
    summary_q1 = tf.einsum('ndh,ndq->nqh', context2, attention_p1)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
    summary_d1 = tf.einsum('nqh,nqd->ndh', question2, attention_q1)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
    coattention_d1 = tf.einsum('nqh,nqd->ndh', summary_q1, attention_q1)

    # In[]
    # Concatenate all document representations and encode them together.
    document_representations = [
        context_hiddens,   # E^D_1
        context2,          # E^D_2
        summary_d,         # S^D_1
        summary_d1,        # S^D_2
        coattention_d,     # C^D_1
        coattention_d1,    # C^D_2
    ]
    document_representation = tf.concat(document_representations, 2)
    encoder2 = RNNEncoder2(self.FLAGS.hidden_size, self.keep_prob)
    U = encoder2.build_graph(document_representation, self.context_mask)

    # Concat attn_output to context_hiddens to get blended_reps
    blended_reps = tf.concat([context_hiddens, U], axis=2)  # (batch_size, context_len, hidden_size*4)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(
        blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
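# SimpleSoftmaxLayer comes from the project's modules and is not shown in this excerpt.
# A minimal sketch of the masked start/end softmax it is assumed to implement (an
# illustrative stand-in, not the project's actual module): project each position to a
# scalar logit, push padded positions to a large negative value, then softmax over positions.
def masked_position_softmax(inputs, mask, scope_name):
    """Sketch: return (masked_logits, prob_dist), both of shape (batch_size, seq_len)."""
    with vs.variable_scope(scope_name):
        logits = tf.contrib.layers.fully_connected(inputs, num_outputs=1, activation_fn=None)
        logits = tf.squeeze(logits, axis=[2])  # (batch_size, seq_len)
        exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)  # -large where padded
        masked_logits = logits + exp_mask
        prob_dist = tf.nn.softmax(masked_logits, axis=1)
        return masked_logits, prob_dist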
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input
    embeddings to the final distributions for the answer span.

    Defines:
        self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for
            the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed
            into the cross entropy function.
        self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
            Each row sums to 1. These are the result of taking (masked) softmax of
            logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Non-linear projection of the question hidden states.
    question_variation = tf.layers.dense(question_hiddens, question_hiddens.get_shape()[2], activation=tf.tanh)

    # In[]
    #question_length = tf.placeholder(tf.int32, (None,), name='question_length')
    #document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')

    # First coattention layer.
    unmasked_affinity = tf.einsum('ndh,nqh->ndq', context_hiddens, question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
    affinity = maybe_mask_affinity(unmasked_affinity, self.document_length)
    attention_p = tf.nn.softmax(affinity, axis=1)
    unmasked_affinity_t = tf.transpose(unmasked_affinity, [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
    affinity_t = maybe_mask_affinity(unmasked_affinity_t, self.question_length)
    attention_q = tf.nn.softmax(affinity_t, axis=1)
    summary_q = tf.einsum('ndh,ndq->nqh', context_hiddens, attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
    summary_d = tf.einsum('nqh,nqd->ndh', question_variation, attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
    coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

    # Second-level encoding and coattention.
    encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
    context2 = encoder1.build_graph(summary_d, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question2 = encoder1.build_graph(summary_q, self.qn_mask)  # (batch_size, question_len, hidden_size*2)
    unmasked_affinity1 = tf.einsum('ndh,nqh->ndq', context2, question2)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
    affinity1 = maybe_mask_affinity(unmasked_affinity1, self.document_length)
    attention_p1 = tf.nn.softmax(affinity1, axis=1)
    unmasked_affinity_t1 = tf.transpose(unmasked_affinity1, [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
    affinity_t1 = maybe_mask_affinity(unmasked_affinity_t1, self.question_length)
    attention_q1 = tf.nn.softmax(affinity_t1, axis=1)
    summary_q1 = tf.einsum('ndh,ndq->nqh', context2, attention_p1)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
    summary_d1 = tf.einsum('nqh,nqd->ndh', question2, attention_q1)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
    coattention_d1 = tf.einsum('nqh,nqd->ndh', summary_q1, attention_q1)

    # In[]
    # Concatenate all document representations and encode them together.
    document_representations = [
        context_hiddens,   # E^D_1
        context2,          # E^D_2
        summary_d,         # S^D_1
        summary_d1,        # S^D_2
        coattention_d,     # C^D_1
        coattention_d1,    # C^D_2
    ]
    document_representation = tf.concat(document_representations, 2)
    encoder2 = RNNEncoder2(self.FLAGS.hidden_size, self.keep_prob)
    U = encoder2.build_graph(document_representation, self.context_mask)

    # Decoder: run the iterative DCN+ decoder and take the logits from its final iteration.
    logits = dcn_decode(U, self.document_length, 100, 4, 4, keep_prob=maybe_dropout(self.keep_prob, True))
    last_iter_logit = logits.read(4 - 1)
    self.logits_start, self.logits_end = last_iter_logit[:, :, 0], last_iter_logit[:, :, 1]
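# The build_graph docstring notes that logits_start / logits_end feed a cross-entropy loss.
# A minimal sketch of that loss term, assuming the gold answer span is available as
# self.ans_span with shape (batch_size, 2) (an assumed attribute name, not shown in this excerpt):
def add_loss(self):
    with vs.variable_scope("loss"):
        loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits_start, labels=self.ans_span[:, 0])  # (batch_size)
        loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits_end, labels=self.ans_span[:, 1])  # (batch_size)
        self.loss = tf.reduce_mean(loss_start) + tf.reduce_mean(loss_end)  # scalar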