def coattention(query,
                query_length,
                document,
                document_length,
                sentinel=False):
    """ DCN+ Coattention layer.
    
    Args:  
        query: Tensor of rank 3, shape [N, Q, 2H].  
        query_length: Tensor of rank 1, shape [N]. Lengths of queries without sentinel.  
        document: Tensor of rank 3, shape [N, D, 2H].   
        document_length: Tensor of rank 1, shape [N]. Lengths of documents without sentinel.  
        sentinel: Scalar boolean. If True, then sentinel vectors are temporarily left concatenated 
        to the query's and document's second dimension, letting the attention focus on nothing.  

    Returns:  
        A tuple containing:  
            summary matrix of the query, shape [N, Q, 2H].  
            summary matrix of the document, shape [N, D, 2H].  
            coattention matrix of the document and query in document space, shape [N, D, 2H].
    """
    if sentinel:
        document = concat_sentinel('document_sentinel', document)
        document_length += 1
        query = concat_sentinel('query_sentinel', query)
        query_length += 1

    unmasked_affinity = tf.einsum(
        'ndh,nqh->ndq', document,
        query)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
    affinity = maybe_mask_affinity(unmasked_affinity, document_length)
    attention_p = tf.nn.softmax(affinity, axis=1)
    unmasked_affinity_t = tf.transpose(
        unmasked_affinity, [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
    affinity_t = maybe_mask_affinity(unmasked_affinity_t, query_length)
    attention_q = tf.nn.softmax(affinity_t, axis=1)
    summary_q = tf.einsum(
        'ndh,ndq->nqh', document,
        attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
    summary_d = tf.einsum(
        'nqh,nqd->ndh', query,
        attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel

    if sentinel:
        summary_d = summary_d[:, 1:, :]
        summary_q = summary_q[:, 1:, :]
        attention_q = attention_q[:, 1:, 1:]
    coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

    return summary_q, summary_d, coattention_d
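

# The helpers used above (concat_sentinel and maybe_mask_affinity) are defined elsewhere
# in the project. Minimal sketches of what they are assumed to do, modeled on the public
# DCN+ reference implementation; exact names, defaults and shapes are assumptions, not
# taken from this snippet.
import tensorflow as tf


def maybe_mask_affinity(affinity, sequence_length, affinity_mask_value=float('-inf')):
    """Replace affinity scores at padded positions (along dimension 1) with a very negative value."""
    if sequence_length is None:
        return affinity
    score_mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(affinity)[1])  # [N, T] bool
    score_mask = tf.tile(tf.expand_dims(score_mask, 2),
                         tf.stack([1, 1, tf.shape(affinity)[2]]))                 # [N, T, T']
    affinity_mask_values = affinity_mask_value * tf.ones_like(affinity)
    return tf.where(score_mask, affinity, affinity_mask_values)


def concat_sentinel(sentinel_name, other_tensor):
    """Prepend a trainable sentinel vector along dimension 1 of a [N, T, 2H] tensor."""
    hidden_dim = other_tensor.get_shape().as_list()[2]
    sentinel = tf.get_variable(sentinel_name, shape=[hidden_dim], dtype=tf.float32)
    sentinel = tf.tile(tf.reshape(sentinel, (1, 1, -1)),
                       tf.stack([tf.shape(other_tensor)[0], 1, 1]))               # [N, 1, 2H]
    return tf.concat([sentinel, other_tensor], 1)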
Example 2
encoder = RNNEncoder(FLAGS.hidden_size, keep_prob)
context_hiddens = encoder.build_graph(context_embs, context_mask) # (batch_size, context_len, hidden_size*2)
question_hiddens = encoder.build_graph(qn_embs, qn_mask) # (batch_size, question_len, hidden_size*2)

question_variation = tf.layers.dense(question_hiddens, question_hiddens.get_shape()[2], activation=tf.tanh)

# In[]

#question_length = tf.placeholder(tf.int32, (None,), name='question_length')
#document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')
question_length = tf.reduce_sum(qn_mask, axis=1) # shape (batch_size)
document_length = tf.reduce_sum(context_mask, axis=1) # shape (batch_size)

unmasked_affinity = tf.einsum('ndh,nqh->ndq', context_hiddens, question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
affinity = maybe_mask_affinity(unmasked_affinity, document_length)
attention_p = tf.nn.softmax(affinity, axis=1)
unmasked_affinity_t = tf.transpose(unmasked_affinity, [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
affinity_t = maybe_mask_affinity(unmasked_affinity_t, question_length)
attention_q = tf.nn.softmax(affinity_t, axis=1)
summary_q = tf.einsum('ndh,ndq->nqh', context_hiddens, attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
summary_d = tf.einsum('nqh,nqd->ndh', question_variation, attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

encoder1 = RNNEncoder1(FLAGS.hidden_size, keep_prob)
context2 = encoder1.build_graph(summary_d, context_mask) # (batch_size, context_len, hidden_size*2)
question2 = encoder1.build_graph(summary_q, qn_mask) # (batch_size, question_len, hidden_size*2)


unmasked_affinity1 = tf.einsum('ndh,nqh->ndq', context2, question2)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
affinity1 = maybe_mask_affinity(unmasked_affinity1, document_length)
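

# RNNEncoder, RNNEncoder1 and RNNEncoder2 are defined elsewhere in the project.
# A plausible sketch, assuming they follow the usual shared bidirectional-GRU encoder
# pattern from the CS224n starter code (cell type and scope name are assumptions):


class RNNEncoder(object):
    """Runs a bidirectional GRU over the inputs and returns the concatenated
    forward/backward hidden states, shape (batch_size, seq_len, hidden_size*2)."""

    def __init__(self, hidden_size, keep_prob):
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob
        self.rnn_cell_fw = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(hidden_size), input_keep_prob=keep_prob)
        self.rnn_cell_bw = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(hidden_size), input_keep_prob=keep_prob)

    def build_graph(self, inputs, masks):
        with tf.variable_scope("RNNEncoder"):
            input_lens = tf.reduce_sum(masks, axis=1)  # true (unpadded) sequence lengths
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
                self.rnn_cell_fw, self.rnn_cell_bw, inputs, input_lens, dtype=tf.float32)
            out = tf.concat([fw_out, bw_out], 2)       # (batch_size, seq_len, hidden_size*2)
            return tf.nn.dropout(out, self.keep_prob)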
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.

        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        question_variation = tf.layers.dense(question_hiddens,
                                             question_hiddens.get_shape()[2],
                                             activation=tf.tanh)
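        # question_variation is a non-linear projection of the question encoding,
        # Q = tanh(W Q' + b), as in the Dynamic Coattention Network paper.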

        # In[]

        #question_length = tf.placeholder(tf.int32, (None,), name='question_length')
        #document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')

        unmasked_affinity = tf.einsum(
            'ndh,nqh->ndq', context_hiddens,
            question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
        affinity = maybe_mask_affinity(unmasked_affinity, self.document_length)
        attention_p = tf.nn.softmax(affinity, axis=1)
        unmasked_affinity_t = tf.transpose(
            unmasked_affinity,
            [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
        affinity_t = maybe_mask_affinity(unmasked_affinity_t,
                                         self.question_length)
        attention_q = tf.nn.softmax(affinity_t, axis=1)
        summary_q = tf.einsum(
            'ndh,ndq->nqh', context_hiddens,
            attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
        summary_d = tf.einsum(
            'nqh,nqd->ndh', question_variation,
            attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
        coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

        encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
        context2 = encoder1.build_graph(
            summary_d,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question2 = encoder1.build_graph(
            summary_q,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        unmasked_affinity1 = tf.einsum(
            'ndh,nqh->ndq', context2,
            question2)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
        affinity1 = maybe_mask_affinity(unmasked_affinity1,
                                        self.document_length)
        attention_p1 = tf.nn.softmax(affinity1, axis=1)
        unmasked_affinity_t1 = tf.transpose(
            unmasked_affinity1,
            [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
        affinity_t1 = maybe_mask_affinity(unmasked_affinity_t1,
                                          self.question_length)
        attention_q1 = tf.nn.softmax(affinity_t1, axis=1)
        summary_q1 = tf.einsum(
            'ndh,ndq->nqh', context2,
            attention_p1)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
        summary_d1 = tf.einsum(
            'nqh,nqd->ndh', question2,
            attention_q1)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
        coattention_d1 = tf.einsum('nqh,nqd->ndh', summary_q1, attention_q1)

        # In[]
        document_representations = [
            context_hiddens,  # E^D_1
            context2,  # E^D_2
            summary_d,  # S^D_1
            summary_d1,  # S^D_2
            coattention_d,  # C^D_1
            coattention_d1,  # C^D_2
        ]

        document_representation = tf.concat(document_representations, 2)  # (batch_size, context_len, hidden_size*12)
        encoder2 = RNNEncoder2(self.FLAGS.hidden_size, self.keep_prob)
        U = encoder2.build_graph(document_representation, self.context_mask)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, U],
            axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
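

# SimpleSoftmaxLayer (and the masked softmax it relies on) is defined elsewhere in the
# starter code. A minimal sketch of the assumed behavior; the helper name masked_softmax
# and its exact signature are assumptions:


def masked_softmax(logits, mask, dim):
    """Add a large negative value at padded positions, then softmax along `dim`.
    Returns both the masked logits and the probability distribution."""
    exp_mask = (1 - tf.cast(mask, 'float')) * (-1e30)  # -large where padded, 0 elsewhere
    masked_logits = tf.add(logits, exp_mask)
    prob_dist = tf.nn.softmax(masked_logits, dim)
    return masked_logits, prob_dist


class SimpleSoftmaxLayer(object):
    """Collapses each hidden vector to a scalar logit with a linear layer, then takes a
    masked softmax over the context positions."""

    def build_graph(self, inputs, masks):
        with tf.variable_scope("SimpleSoftmaxLayer"):
            logits = tf.contrib.layers.fully_connected(inputs, num_outputs=1, activation_fn=None)
            logits = tf.squeeze(logits, axis=[2])  # (batch_size, context_len)
            masked_logits, prob_dist = masked_softmax(logits, masks, 1)
            return masked_logits, prob_dist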
Example 4
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.

        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        question_variation = tf.layers.dense(question_hiddens,
                                             question_hiddens.get_shape()[2],
                                             activation=tf.tanh)
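        # question_variation is a non-linear projection of the question encoding,
        # Q = tanh(W Q' + b), as in the Dynamic Coattention Network paper.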

        # In[]

        #question_length = tf.placeholder(tf.int32, (None,), name='question_length')
        #document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')

        unmasked_affinity = tf.einsum(
            'ndh,nqh->ndq', context_hiddens,
            question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
        affinity = maybe_mask_affinity(unmasked_affinity, self.document_length)
        attention_p = tf.nn.softmax(affinity, axis=1)
        unmasked_affinity_t = tf.transpose(
            unmasked_affinity,
            [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
        affinity_t = maybe_mask_affinity(unmasked_affinity_t,
                                         self.question_length)
        attention_q = tf.nn.softmax(affinity_t, axis=1)
        summary_q = tf.einsum(
            'ndh,ndq->nqh', context_hiddens,
            attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
        summary_d = tf.einsum(
            'nqh,nqd->ndh', question_variation,
            attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
        coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

        encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
        context2 = encoder1.build_graph(
            summary_d,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question2 = encoder1.build_graph(
            summary_q,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        unmasked_affinity1 = tf.einsum(
            'ndh,nqh->ndq', context2,
            question2)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
        affinity1 = maybe_mask_affinity(unmasked_affinity1,
                                        self.document_length)
        attention_p1 = tf.nn.softmax(affinity1, axis=1)
        unmasked_affinity_t1 = tf.transpose(
            unmasked_affinity1,
            [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
        affinity_t1 = maybe_mask_affinity(unmasked_affinity_t1,
                                          self.question_length)
        attention_q1 = tf.nn.softmax(affinity_t1, axis=1)
        summary_q1 = tf.einsum(
            'ndh,ndq->nqh', context2,
            attention_p1)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
        summary_d1 = tf.einsum(
            'nqh,nqd->ndh', question2,
            attention_q1)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
        coattention_d1 = tf.einsum('nqh,nqd->ndh', summary_q1, attention_q1)

        # In[]
        document_representations = [
            context_hiddens,  # E^D_1
            context2,  # E^D_2
            summary_d,  # S^D_1
            summary_d1,  # S^D_2
            coattention_d,  # C^D_1
            coattention_d1,  # C^D_2
        ]

        document_representation = tf.concat(document_representations, 2)  # (batch_size, context_len, hidden_size*12)
        encoder2 = RNNEncoder2(self.FLAGS.hidden_size, self.keep_prob)
        U = encoder2.build_graph(document_representation, self.context_mask)
        # Decoder: iterative DCN+ span decoder. The positional arguments are presumably
        # state_size=100, pool_size=4 and max_iter=4, matching the DCN+ reference signature.
        logits = dcn_decode(U,
                            self.document_length,
                            100,
                            4,
                            4,
                            keep_prob=maybe_dropout(self.keep_prob, True))
        # Keep only the logits produced by the final decoder iteration.
        last_iter_logit = logits.read(4 - 1)  # (batch_size, context_len, 2)
        self.logits_start = last_iter_logit[:, :, 0]
        self.logits_end = last_iter_logit[:, :, 1]
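
        # Not shown in the original snippet: the docstring above notes that these logits are
        # fed into a cross-entropy loss. A minimal sketch of such a loss, assuming a
        # hypothetical `self.ans_span` placeholder of shape (batch_size, 2) holding the gold
        # start/end indices (the placeholder name is an assumption):
        with tf.variable_scope("loss"):
            loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_start, labels=self.ans_span[:, 0])
            loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_end, labels=self.ans_span[:, 1])
            self.loss = tf.reduce_mean(loss_start) + tf.reduce_mean(loss_end)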