Example #1
    def mixer(self, q_states, ctx_states):
        # Compute attention of each context word representation with respect to the question final hidden states


        with vs.variable_scope("mixer"):
            # to calculate affinity matrix, need P * Q^T
            # P is shape (?, max_p_len, hid_size), Q is shape (?, max_q_len, hid_size)
            # A will be shape (?, max_p_len, max_q_len)
            A = tf.nn.softmax(batch_matmul(ctx_states, tf.transpose(q_states, perm=[0, 2, 1])))

            # C_P is shape (?, max_p_len, hid_size) = lin. comb. of weights from A over question states
            # These are the context vectors.
            C_P = batch_matmul(A, q_states)

            # First, reshape both C_P and P to make them 2-D
            C_P = tf.reshape(C_P, [-1, self.h_size])
            P = tf.reshape(ctx_states, [-1, self.h_size])

            # Next, use a linear layer to concatenate them along hid_size, and apply a weight matrix
            P_final = tf.nn.rnn_cell._linear([C_P, P], output_size=self.h_size, bias=True)

            # Finally, reshape the output to the correct shape
            P_final = tf.reshape(P_final, [-1, self.p_size, self.h_size])

            return P_final
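For reference, here is a minimal shape check of the affinity/mixing arithmetic above, written as a NumPy sketch (NumPy's batched `@` stands in for `batch_matmul`; the sizes are illustrative, not from the original project):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, max_p_len, max_q_len, hid_size = 4, 30, 10, 64
ctx_states = np.random.randn(batch, max_p_len, hid_size)   # P
q_states = np.random.randn(batch, max_q_len, hid_size)     # Q

# Affinity matrix: (batch, p, h) @ (batch, h, q) -> (batch, p, q)
A = softmax(ctx_states @ q_states.transpose(0, 2, 1))

# Context vectors: (batch, p, q) @ (batch, q, h) -> (batch, p, h)
C_P = A @ q_states
print(A.shape, C_P.shape)   # (4, 30, 10) (4, 30, 64)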
Example #2
    def encode_v2(self, question_embeddings, document_embeddings,
                  question_mask, context_mask, encoderb_state_input,
                  dropout_keep_prob):
        """ encode_v2() 
		"""
        with vs.variable_scope("encoder"):
            # Question -> LSTM -> Q
            lstm_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_size)
            question_length = tf.reduce_sum(tf.cast(question_mask, tf.int32),
                                            reduction_indices=1)
            print("Question length: ", question_length)
            Q_prime, _ = dynamic_rnn(lstm_cell,
                                     tf.transpose(question_embeddings,
                                                  [0, 2, 1]),
                                     sequence_length=question_length,
                                     time_major=False,
                                     dtype=tf.float32)
            Q_prime = tf.transpose(Q_prime, [0, 2, 1])
            print("Q_prime: ", Q_prime)

            # Non-linear projection layer on top of the question encoding
            W_Q = tf.get_variable("W_Q",
                                  (self.embedding_size, self.embedding_size))
            b_Q = tf.get_variable("b_Q", (self.embedding_size, 1))
            Q = tf.tanh(
                matrix_multiply_with_batch(matrix=W_Q,
                                           batch=question_embeddings,
                                           matrixByBatch=True) + b_Q)
            print("Q: ", Q)

            # Paragraph -> LSTM -> D
            tf.get_variable_scope().reuse_variables()
            print("Context mask: ", context_mask)
            context_length = tf.reduce_sum(tf.cast(context_mask, tf.int32),
                                           reduction_indices=1)
            D, _ = dynamic_rnn(lstm_cell,
                               tf.transpose(document_embeddings, [0, 2, 1]),
                               sequence_length=context_length,
                               time_major=False,
                               dtype=tf.float32)
            D = tf.transpose(D, [0, 2, 1])
            print("D: ", D)

            # Affinity matrix between document and question states (batched matmul)
            L = batch_matmul(tf.transpose(D, [0, 2, 1]), Q)
            A_Q = tf.nn.softmax(L)
            A_D = tf.nn.softmax(tf.transpose(L, [0, 2, 1]))
            print("A_Q: ", A_Q)
            print("A_D: ", A_D)

            C_Q = batch_matmul(D, A_Q)
            print("C_Q: ", C_Q)
            concat = tf.concat(1, [Q, C_Q])
            print("concat: ", concat)
            C_D = batch_matmul(tf.concat(1, [Q, C_Q]), A_D)
            print("C_D: ", C_D)

            final_D = tf.concat(1, [D, C_D])
            print("final D: ", final_D)
            return final_D
Example #3
    def mixer(self,
              dropout,
              state_size,
              output_size,
              q_states,
              ctx_states,
              model_type="gru"):
        # Compute attention of each context word representation with respect to the question final hidden states

        if model_type == "gru":
            pass
        elif model_type == "lstm":
            # take 2nd part of state params, since that corresponds to hidden state h
            #knowledge_rep = knowledge_rep[-1]
            # final_q_state = final_q_state[-1]  # leftover from an earlier version; `final_q_state` is undefined in this snippet
            pass
        else:
            raise Exception('Must specify model type.')

        # with vs.variable_scope("mixer"):

        # ht = tf.nn.rnn_cell._linear(q_states, self.flags.state_size, True, 1.0)

        # # ht is shape (batch_size, 1, hid_dim)
        # ht = tf.expand_dims(ht, axis=1)

        with vs.variable_scope("mixer"):
            A = tf.nn.softmax(
                batch_matmul(ctx_states, tf.transpose(q_states, perm=[0, 2,
                                                                      1])))

            # scores is shape (batch_size, N, 1)
            # scores = tf.reduce_sum(A*q_states, reduction_indices=2, keep_dims=True)
            C_P = batch_matmul(A, q_states)
            P = tf.concat(2, [C_P, ctx_states])
            W = tf.get_variable(
                "W_mix",
                shape=(1, 2 * state_size, state_size),
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.get_variable(
                "b_mix",
                shape=(1, output_size, state_size),
                initializer=tf.contrib.layers.xavier_initializer())

            batch_size = tf.shape(P)[0]
            W_tiled = tf.tile(W, [batch_size, 1, 1])
            ctx_state_rep = batch_matmul(P, W_tiled)
            ctx_state_rep = ctx_state_rep + b

        # # do a softmax over the scores
        # scores = tf.exp(scores - tf.reduce_max(scores, reduction_indices=1, keep_dims=True))
        # scores = scores / (1e-6 + tf.reduce_sum(scores, reduction_indices=1, keep_dims=True))

        # # compute context vector using linear combination of attention states with
        # # weights given by attention vector.
        # # context is shape (batch_size, hid_dim)
        # ctx_state_rep = ctx_states * scores

        return ctx_state_rep
Example #4
    def coattn_encode(self):
        # only for task: direct prediction

        # (length, batch_size, dim)
        query_w_matrix = self.normal_encode(self.encoder_inputs,
                                            self.source_mask)
        context_w_matrix = self.normal_encode(self.ctx_inputs,
                                              self.ctx_mask,
                                              reuse=True)

        # can add a query variation here (optional)
        # the coattention mix could be dropped, but in experiments it works better than no coattention

        # in PA4 it was also time-major

        # batch, p, size
        p_encoding = tf.transpose(context_w_matrix, perm=[1, 0, 2])
        # batch, q, size
        q_encoding = tf.transpose(query_w_matrix, perm=[1, 0, 2])
        # batch, size, q
        q_encoding_t = tf.transpose(query_w_matrix, perm=[1, 2, 0])

        # 2). Q->P Attention
        # [256,25,125] vs [128,125,11]
        A = batch_matmul(p_encoding, q_encoding_t)  # (batch, p, q)
        A_p = tf.nn.softmax(A)

        # 3). P->Q Attention
        # transposed: (batch_size, question, context)
        A_t = tf.transpose(A, perm=[0, 2, 1])  # (batch, q, p)
        A_q = tf.nn.softmax(A_t)

        # 4). Query's context vectors
        C_q = batch_matmul(A_q, p_encoding)  # (batch, q, p) * (batch, p, size)
        # (batch, q, size)

        # 5). Paragraph's context vectors
        q_emb = tf.concat(2, [q_encoding, C_q])
        C_p = batch_matmul(A_p, q_emb)  # (batch, p, q) * (batch, q, size * 2)

        # 6). Linear mix of paragraph's context vectors and paragraph states
        co_att = tf.concat(2, [p_encoding, C_p])  # (batch, p, size * 3)

        # Run another RNN layer over the coattention output
        # (with plain attention this extra encoder would not be needed)
        co_att = tf.transpose(co_att, perm=[1, 0, 2])  # (p, batch, size * 3)
        out = self.normal_encode(co_att, self.ctx_mask, scope_name="Final")

        return out
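A compact NumPy walk-through of the coattention shape chain used above (illustrative sizes; `p`, `q`, `size` are the paragraph length, question length, and hidden size):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, p, q, size = 2, 30, 10, 64
p_enc = np.random.randn(batch, p, size)         # paragraph encodings
q_enc = np.random.randn(batch, q, size)         # question encodings

A = p_enc @ q_enc.transpose(0, 2, 1)            # (batch, p, q) affinity
A_p = softmax(A)                                # Q->P attention
A_q = softmax(A.transpose(0, 2, 1))             # P->Q attention

C_q = A_q @ p_enc                               # (batch, q, size)
q_emb = np.concatenate([q_enc, C_q], axis=2)    # (batch, q, 2*size)
C_p = A_p @ q_emb                               # (batch, p, 2*size)
co_att = np.concatenate([p_enc, C_p], axis=2)   # (batch, p, 3*size)
print(co_att.shape)                             # (2, 30, 192)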
Example #5
    def setup_system(self):
        """
        After your modularized implementation of encoder and decoder
        you should call various functions inside encoder, decoder here
        to assemble your reading comprehension system!
        :return:
        """

        # simple encoder stuff here
        question_states, final_question_state = self.question_encoder.encode(self.question_embeddings, self.mask_q_placeholder, 
                                                                             encoder_state_input=None, 
                                                                             attention_inputs=None, 
                                                                             model_type=self.flags.model_type)

        ctx_states, final_ctx_state = self.context_encoder.encode(self.context_embeddings, self.mask_ctx_placeholder, 
                                                                             encoder_state_input=None,#final_question_state, 
                                                                             attention_inputs=None,
                                                                             model_type=self.flags.model_type)



        #ctx_states = self.mixer(final_question_state,ctx_states,model_type=self.flags.model_type)

        feed_states = batch_matmul(ctx_states, tf.expand_dims(final_question_state, 2))

        # decoder takes encoded representation to probability dists over start / end index

        #self.start_probs, self.end_probs = self.decoder.decode(knowledge_rep=(final_question_state, final_ctx_state),model_type=self.flags.model_type)


        self.start_probs, self.end_probs = self.decoder.decode(feed_states,
                                                                self.mask_ctx_placeholder,
                                                                self.dropout_placeholder,
                                                                self.flags.state_size,
                                                                model_type=self.flags.model_type)
Example #6
    def attention_encode(self):
        # (length, batch_size, dim)
        query_w_matrix = self.normal_encode(self.encoder_inputs,
                                            self.source_mask)
        context_w_matrix = self.normal_encode(self.ctx_inputs,
                                              self.ctx_mask,
                                              reuse=True)

        # can add a query variation here (optional)
        # the coattention mix could be dropped, but in experiments it works better than no coattention

        # in PA4 it was also time-major

        # batch, p, size
        p_encoding = tf.transpose(context_w_matrix, perm=[1, 0, 2])
        # batch, q, size
        q_encoding = tf.transpose(query_w_matrix, perm=[1, 0, 2])
        # batch, size, q
        q_encoding_t = tf.transpose(query_w_matrix, perm=[1, 2, 0])

        # 2). Q->P Attention
        # [256,25,125] vs [128,125,11]
        A = batch_matmul(p_encoding, q_encoding_t)  # (batch, p, q)
        A_p = tf.nn.softmax(A)

        # 3). Paragraph's context vectors
        C_p = batch_matmul(A_p, q_encoding)

        # 4). Linear mix of paragraph's context vectors and paragraph states
        flat_C_p = tf.reshape(C_p, [-1, self.FLAGS.size])
        flat_p_enc = tf.reshape(p_encoding, [-1, self.FLAGS.size])
        doshape = tf.shape(context_w_matrix)
        T, batch_size = doshape[0], doshape[1]

        # mixed_p: (batch * p_len, size)
        mixed_p = rnn_cell._linear([flat_C_p, flat_p_enc],
                                   self.FLAGS.size,
                                   bias=True)
        mixed_p = tf.reshape(mixed_p, tf.pack([T, -1, self.FLAGS.size]))

        # no extra layer of RNN on top of coattention result
        return mixed_p
Example #7
    def filter(self, Q, P):
        with vs.variable_scope("filter"):
            # Q is (batch_size, q_size, embed_size)
            # P is (batch_size, p_size, embed_size)

            # normalize all embeddings to unit norm so that dot product is cosine similarity
            Qn = tf.nn.l2_normalize(Q, dim=2)
            Pn = tf.nn.l2_normalize(P, dim=2)

            # R is shape (batch_size, q_size, p_size), R_ij = q_i dot p_j
            R = batch_matmul(Qn, tf.transpose(Pn, perm=[0, 2, 1]))

            # collect maximum relevancy over the questions per paragraph word, shape (batch_size, p_size)
            r = tf.reduce_max(R, axis=1)
            r = tf.expand_dims(r, axis=2) # shape (batch_size, p_size, 1) to take advantage of broadcasting

            # re-weight paragraph embeddings with relevancy scores
            P_filtered = P * r

            return P_filtered
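The same relevancy filtering can be sketched in NumPy to make the broadcasting explicit (illustrative sizes, not the original project's code):

import numpy as np

batch, q_size, p_size, embed = 2, 10, 30, 50
Q = np.random.randn(batch, q_size, embed)
P = np.random.randn(batch, p_size, embed)

# Unit-normalize so dot products become cosine similarities
Qn = Q / np.linalg.norm(Q, axis=2, keepdims=True)
Pn = P / np.linalg.norm(P, axis=2, keepdims=True)

R = Qn @ Pn.transpose(0, 2, 1)                  # (batch, q_size, p_size)
r = R.max(axis=1)[:, :, None]                   # (batch, p_size, 1): best question match per paragraph word
P_filtered = P * r                              # re-weighted paragraph embeddings
print(P_filtered.shape)                         # (2, 30, 50)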
Example #8
    def matching_layer(self, Q_fw, Q_bw, P_fw, P_bw, num_perspectives):
        with vs.variable_scope("matching"):

            # collect all matching vectors into this array, then concatenate at the end
            Q_fw_final = Q_fw[:, -1, :]
            Q_bw_final = Q_bw[:, -1, :]

            Q_fw_n = tf.nn.l2_normalize(Q_fw, dim=2)
            Q_bw_n = tf.nn.l2_normalize(Q_bw, dim=2)
            Q_fw_final_n = tf.nn.l2_normalize(Q_fw_final, dim=1)
            Q_bw_final_n = tf.nn.l2_normalize(Q_bw_final, dim=1)
            P_fw_n = tf.nn.l2_normalize(P_fw, dim=2)
            P_bw_n = tf.nn.l2_normalize(P_bw, dim=2)

            # Full-Matching
            Q_fw_final_n = tf.expand_dims(Q_fw_final_n, 2)
            M_full_fw = batch_matmul(P_fw_n, Q_fw_final_n)

            Q_bw_final_n = tf.expand_dims(Q_bw_final_n, 2)
            M_full_bw = batch_matmul(P_bw_n, Q_bw_final_n)


            # Max-Matching
            M_max_fw = tf.reduce_max(batch_matmul(P_fw_n, tf.transpose(Q_fw_n, perm=[0,2,1])), axis=2, keep_dims=True)
            M_max_bw = tf.reduce_max(batch_matmul(P_bw_n, tf.transpose(Q_bw_n, perm=[0,2,1])), axis=2, keep_dims=True)

            # Mean-Matching
            M_mean_fw = tf.reduce_mean(batch_matmul(P_fw_n, tf.transpose(Q_fw_n, perm=[0,2,1])), axis=2, keep_dims=True)
            M_mean_bw = tf.reduce_mean(batch_matmul(P_bw_n, tf.transpose(Q_bw_n, perm=[0,2,1])), axis=2, keep_dims=True)

            M = []
            M.append(M_full_fw)
            M.append(M_full_bw)
            M.append(M_max_fw)
            M.append(M_max_bw)
            M.append(M_mean_fw)
            M.append(M_mean_bw)
            M = tf.concat(2, M) # concatenate along last dimension (num_perspectives)
            
            #return M
            return P_fw_n + P_bw_n
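For orientation, a NumPy sketch of the forward-direction matching vectors built above (illustrative sizes; the backward direction is computed the same way and doubles the last dimension):

import numpy as np

def l2norm(x, axis=-1):
    return x / np.linalg.norm(x, axis=axis, keepdims=True)

batch, p_len, q_len, size = 2, 30, 10, 64
P_fw = np.random.randn(batch, p_len, size)
Q_fw = np.random.randn(batch, q_len, size)

P_fw_n, Q_fw_n = l2norm(P_fw), l2norm(Q_fw)
Q_fw_final_n = l2norm(Q_fw[:, -1, :])           # (batch, size) final question state

# Full-matching: cosine of each paragraph state with the final question state
M_full = P_fw_n @ Q_fw_final_n[:, :, None]      # (batch, p_len, 1)

# Max- and mean-matching over the full cosine-similarity matrix
cos = P_fw_n @ Q_fw_n.transpose(0, 2, 1)        # (batch, p_len, q_len)
M_max = cos.max(axis=2, keepdims=True)
M_mean = cos.mean(axis=2, keepdims=True)

M = np.concatenate([M_full, M_max, M_mean], axis=2)   # (batch, p_len, 3) per direction
print(M.shape)                                  # (2, 30, 3)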
Example #9
    def decode(self, knowledge_rep, masks, state_size, model_type="gru"):
        """
        Takes in a knowledge representation and outputs a probability
        estimate over all paragraph tokens for which token should be the
        start of the answer span and which should be the end.

        :param knowledge_rep: a representation of the paragraph and question,
                              determined by how you choose to implement the encoder
        :return:
        """

        with vs.variable_scope("answer_start"):
            W_start = tf.get_variable(
                "W_start",
                shape=(1, 1, state_size),
                initializer=tf.contrib.layers.xavier_initializer())
            batch_size = tf.shape(knowledge_rep)[0]
            W_start = tf.tile(W_start, [batch_size, 1, 1])
            start_probs = batch_matmul(knowledge_rep,
                                       tf.transpose(W_start, perm=[0, 2, 1]))

        with vs.variable_scope("answer_end"):
            cell = tf.nn.rnn_cell.GRUCell(state_size)
            all_end_probs, _ = tf.nn.dynamic_rnn(cell,
                                                 knowledge_rep,
                                                 sequence_length=masks,
                                                 dtype=tf.float32,
                                                 initial_state=None)

            W_end = tf.get_variable(
                "W_end",
                shape=(1, 1, state_size),
                initializer=tf.contrib.layers.xavier_initializer())
            end_probs = tf.reduce_sum(all_end_probs * W_end,
                                      reduction_indices=2)

        start_probs = tf.squeeze(start_probs, 2)

        bool_masks = tf.cast(tf.sequence_mask(masks, maxlen=self.output_size),
                             tf.float32)
        a = tf.constant(-1e30)
        b = tf.constant(1.0)
        add_mask = (a * (b - bool_masks))
        start_probs = start_probs + add_mask
        end_probs = end_probs + add_mask

        # input_size = knowledge_rep.get_shape()[-1]
        # W_start = tf.get_variable("W_start", shape=(input_size, self.output_size),
        #         initializer=tf.contrib.layers.xavier_initializer())
        # b_start = tf.get_variable("b_start", shape=(self.output_size))

        # W_end = tf.get_variable("W_end", shape=(input_size, self.output_size),
        #         initializer=tf.contrib.layers.xavier_initializer())
        # b_end = tf.get_variable("b_end", shape=(self.output_size))

        # start_probs = tf.matmul(knowledge_rep, W_start) + b_start
        # end_probs = tf.matmul(knowledge_rep, W_end) + b_end

        return start_probs, end_probs
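The additive mask above is the usual trick for excluding padded positions before a softmax: valid positions get 0 added, padded positions get roughly -1e30, so they end up with (effectively) zero probability. A NumPy sketch of the effect (illustrative lengths):

import numpy as np

output_size = 6
lengths = np.array([4, 2])                      # valid context lengths per batch element
logits = np.random.randn(2, output_size)        # raw start (or end) scores

# Equivalent of tf.sequence_mask: 1.0 for valid positions, 0.0 for padding
bool_masks = (np.arange(output_size)[None, :] < lengths[:, None]).astype(np.float32)
masked = logits + (-1e30) * (1.0 - bool_masks)

probs = np.exp(masked - masked.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)
print(probs.round(3))                           # padded positions get probability ~0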
Example #10
    def encode_v2(self, question_embeddings, document_embeddings,
                  question_mask, context_mask, encoderb_state_input,
                  dropout_keep_prob, max_question_len):
        """ encode_v2() 
			"""
        # Shared LSTM cell
        lstm_cell = tf.nn.rnn_cell.LSTMCell(self.state_size)
        lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
            lstm_cell, input_keep_prob=dropout_keep_prob)

        # Question -> LSTM -> Q
        with tf.variable_scope('question_embedding'):
            question_length = tf.reduce_sum(tf.cast(question_mask, tf.int32),
                                            reduction_indices=1)
            Q_prime, _ = dynamic_rnn(lstm_cell,
                                     question_embeddings,
                                     sequence_length=question_length,
                                     dtype=tf.float32)
            print("Q_prime: ", Q_prime)

            # Non-linear projection layer on top of the question encoding
            Q = tf.tanh(batch_linear(Q_prime, max_question_len, True))
            Q = tf.transpose(Q, [0, 2, 1])
            print("Q: ", Q)

        with tf.variable_scope('context_embedding'):
            # Paragraph -> LSTM -> D
            #tf.get_variable_scope().reuse_variables()
            context_length = tf.reduce_sum(tf.cast(context_mask, tf.int32),
                                           reduction_indices=1)
            D, _ = dynamic_rnn(lstm_cell,
                               document_embeddings,
                               sequence_length=context_length,
                               dtype=tf.float32)
            D = tf.transpose(D, [0, 2, 1])
            print("D: ", D)

        with tf.variable_scope('coattention'):
            L = tf.batch_matmul(tf.transpose(D, [0, 2, 1]), Q)
            print("L: ", L)
            A_Q = tf.map_fn(lambda x: tf.nn.softmax(x), L, dtype=tf.float32)
            A_D = tf.map_fn(lambda x: tf.nn.softmax(x),
                            tf.transpose(L, [0, 2, 1]),
                            dtype=tf.float32)
            print("A_Q: ", A_Q)
            print("A_D: ", A_D)

            C_Q = batch_matmul(D, A_Q)
            print("C_Q: ", C_Q)
            concat = tf.concat(1, [Q, C_Q])
            print("concat: ", concat)
            C_D = batch_matmul(tf.concat(1, [Q, C_Q]), A_D)
            print("C_D: ", C_D)

            # Final coattention context: (batch size, context length, 3*hidden size)
            co_att = tf.concat(1, [D, C_D])
            co_att = tf.transpose(co_att, [0, 2, 1])
            print("co_att: ", co_att)

        with tf.variable_scope('encoder'):
            # LSTM for coattention encoding
            cell_fw = tf.nn.rnn_cell.LSTMCell(self.state_size)
            cell_bw = tf.nn.rnn_cell.LSTMCell(self.state_size)
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                cell_fw, input_keep_prob=dropout_keep_prob)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                cell_bw, input_keep_prob=dropout_keep_prob)

            # Compute coattention encoding
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                co_att,
                sequence_length=context_length,
                dtype=tf.float32)
            print("fw out: ", fw_out)
            print("bw out: ", bw_out)
            U = tf.concat(2, [fw_out, bw_out])
            print("U: ", U)
            return U
Example #11
    def encode(
            self, c_len_placeholder,
            q_len_placeholder):  #TODO???, inputs, masks, encoder_state_input):
        """
        In a generalized encode function, you pass your inputs, masks,
        and an initial hidden state into this function.

        :param inputs: Symbolic representations of your input
        :param masks: this is to make sure tf.nn.dynamic_rnn doesn't iterate
                      through masked steps
        :param encoder_state_input: (Optional) pass this as initial hidden state
                                    to tf.nn.dynamic_rnn to build conditional representations
        :return: an encoded representation of your input.
                 It can be context-level representation, word-level representation,
                 or both.
        """
        if FLAGS.train_embeddings:
            embeddings = tf.Variable(self.pretrained_embeddings,
                                     name='trainable_embeddings',
                                     dtype=tf.float32)
        else:
            embeddings = tf.constant(self.pretrained_embeddings,
                                     name='pretrained_embeddings',
                                     dtype=tf.float32)

        q_vectors = tf.nn.embedding_lookup(params=embeddings,
                                           ids=self.q_ids_placeholder)
        assert q_vectors.get_shape().as_list() == [
            None, self.max_q, self.embed_size
        ]

        c_vectors = tf.nn.embedding_lookup(params=embeddings,
                                           ids=self.c_ids_placeholder)
        assert c_vectors.get_shape().as_list() == [
            None, self.max_c, self.embed_size
        ]

        #From now on following terminology from https://arxiv.org/pdf/1611.01604.pdf

        l = 2 * self.rnn_size  #TODO sentinel vector
        mplus1 = self.max_c
        nplus1 = self.max_q

        encoding_size = l

        xavier_initializer = tf.contrib.layers.xavier_initializer()

        with tf.variable_scope("Encoder_rnn") as scope:

            if FLAGS.encoder_rnn == 'BiLSTM':
                cell = tf.nn.rnn_cell.LSTMCell(num_units=self.rnn_size,
                                               initializer=xavier_initializer)
            elif FLAGS.encoder_rnn == 'BiGRU':
                cell = tf.nn.rnn_cell.GRUCell(num_units=self.rnn_size)
            else:
                raise ValueError("Unsupported FLAGS.encoder_rnn: %s" % FLAGS.encoder_rnn)

            c_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell,
                cell,
                c_vectors,
                dtype=tf.float32,
                sequence_length=c_len_placeholder,
                scope=scope)

            scope.reuse_variables()

            q_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell,
                cell,
                q_vectors,
                dtype=tf.float32,
                sequence_length=q_len_placeholder,
                scope=scope)

        D = tf.concat_v2(c_outputs, 2)
        assert D.get_shape().as_list() == [None, mplus1, l]

        Q = tf.concat_v2(q_outputs, 2)
        assert Q.get_shape().as_list() == [None, nplus1, l]

        if FLAGS.cross_id_bias >= 0:
            U = tf.Variable(
                name="U",
                initial_value=FLAGS.cross_id_bias * np.identity(l) +
                tf.random_uniform((l, l), -.01, .01),
                dtype=tf.float32)
            Q = tf.reshape(tf.matmul(tf.reshape(Q, [-1, l]), U),
                           [-1, nplus1, l])  #TODO tensordot

            tf.summary.histogram('Cross_Attn_U', U)

        L = batch_matmul(Q, D, adj_y=True)
        assert L.get_shape().as_list() == [None, nplus1, mplus1]
        tf.summary.histogram('Attn_L', L)

        encoding = D
        encoding_size = l

        if FLAGS.AD:
            if FLAGS.AQ:
                A_Q = softmax_partial(L, 2, c_len_placeholder)
                assert A_Q.get_shape().as_list() == L.get_shape().as_list()
                tf.summary.histogram('A_Q', A_Q)

                C_Q = batch_matmul(A_Q, D)
                assert C_Q.get_shape().as_list() == [None, nplus1, l]
                tf.summary.histogram('C_Q', C_Q)

                Q = tf.concat_v2([Q, C_Q], 2)
                encoding_size += l

            A_DT = softmax_partial(L, 1, q_len_placeholder)
            assert A_DT.get_shape().as_list() == L.get_shape().as_list()
            tf.summary.histogram('A_DT', A_DT)

            C_D = batch_matmul(A_DT, Q, adj_x=True)
            assert C_D.get_shape().as_list() == [None, mplus1, encoding_size]
            tf.summary.histogram('C_D', C_D)

            encoding = tf.concat_v2([encoding, C_D], 2)
            encoding_size += l

        assert encoding.get_shape().as_list() == [None, mplus1, encoding_size]
        return encoding