def mixer(self, q_states, ctx_states):
    # Compute attention of each context word representation with respect to
    # the question hidden states.
    with vs.variable_scope("mixer"):
        # To calculate the affinity matrix we need P * Q^T.
        # P is shape (?, max_p_len, hid_size), Q is shape (?, max_q_len, hid_size),
        # so A is shape (?, max_p_len, max_q_len).
        A = tf.nn.softmax(batch_matmul(ctx_states, tf.transpose(q_states, perm=[0, 2, 1])))

        # C_P is shape (?, max_p_len, hid_size): a linear combination of the
        # question states with weights from A. These are the context vectors.
        C_P = batch_matmul(A, q_states)

        # First, reshape both C_P and P to make them 2-D.
        C_P = tf.reshape(C_P, [-1, self.h_size])
        P = tf.reshape(ctx_states, [-1, self.h_size])

        # Next, concatenate them along hid_size and apply a weight matrix via a linear layer.
        P_final = tf.nn.rnn_cell._linear([C_P, P], output_size=self.h_size, bias=True)

        # Finally, reshape the output back to (?, max_p_len, hid_size).
        P_final = tf.reshape(P_final, [-1, self.p_size, self.h_size])
        return P_final
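# `batch_matmul`, used throughout this file, is assumed to be an alias for the
# batched matrix multiply of this TensorFlow generation (tf.batch_matmul in
# TF <= 0.12; folded into tf.matmul from TF 1.0). A minimal compatibility shim
# under that assumption:

import tensorflow as tf

def batch_matmul(x, y, adj_x=False, adj_y=False):
    """Batched matmul that works on both old and new TensorFlow."""
    if hasattr(tf, "batch_matmul"):  # TF <= 0.12
        return tf.batch_matmul(x, y, adj_x=adj_x, adj_y=adj_y)
    # TF >= 1.0: tf.matmul broadcasts over leading batch dimensions.
    return tf.matmul(x, y, transpose_a=adj_x, transpose_b=adj_y)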
def encode_v2(self, question_embeddings, document_embeddings, question_mask,
              context_mask, encoderb_state_input, dropout_keep_prob):
    """
    encode_v2()
    """
    with vs.variable_scope("encoder"):
        # Question -> LSTM -> Q'
        lstm_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_size)
        question_length = tf.reduce_sum(tf.cast(question_mask, tf.int32), reduction_indices=1)
        print("Question length: ", question_length)
        # Embeddings appear to arrive as (batch, embed_size, max_len); transpose
        # to the (batch, time, depth) layout dynamic_rnn expects, then back.
        Q_prime, _ = dynamic_rnn(lstm_cell,
                                 tf.transpose(question_embeddings, [0, 2, 1]),
                                 sequence_length=question_length,
                                 time_major=False,
                                 dtype=tf.float32)
        Q_prime = tf.transpose(Q_prime, [0, 2, 1])
        print("Q_prime: ", Q_prime)

        # Non-linear projection layer on top of the question encoding:
        # Q = tanh(W_Q Q' + b_Q). The projection is applied to the LSTM
        # output Q_prime, not to the raw embeddings.
        W_Q = tf.get_variable("W_Q", (self.embedding_size, self.embedding_size))
        b_Q = tf.get_variable("b_Q", (self.embedding_size, 1))
        Q = tf.tanh(matrix_multiply_with_batch(matrix=W_Q, batch=Q_prime, matrixByBatch=True) + b_Q)
        print("Q: ", Q)

        # Paragraph -> LSTM -> D, sharing the question LSTM's weights.
        tf.get_variable_scope().reuse_variables()
        print("Context mask: ", context_mask)
        context_length = tf.reduce_sum(tf.cast(context_mask, tf.int32), reduction_indices=1)
        D, _ = dynamic_rnn(lstm_cell,
                           tf.transpose(document_embeddings, [0, 2, 1]),
                           sequence_length=context_length,
                           time_major=False,
                           dtype=tf.float32)
        D = tf.transpose(D, [0, 2, 1])
        print("D: ", D)

        # Affinity matrix and the two attention distributions (batched matmul,
        # since tf.matmul does not broadcast over batches in this TF version).
        L = batch_matmul(tf.transpose(D, [0, 2, 1]), Q)
        A_Q = tf.nn.softmax(L)
        A_D = tf.nn.softmax(tf.transpose(L, [0, 2, 1]))
        print("A_Q: ", A_Q)
        print("A_D: ", A_D)

        # Attention contexts of the document w.r.t. the question, and of the
        # question (plus its contexts) w.r.t. the document.
        C_Q = batch_matmul(D, A_Q)
        print("C_Q: ", C_Q)
        QC_Q = tf.concat(1, [Q, C_Q])
        print("concat: ", QC_Q)
        C_D = batch_matmul(QC_Q, A_D)
        print("C_D: ", C_D)

        final_D = tf.concat(1, [D, C_D])
        print("final D: ", final_D)
        return final_D
def mixer(self, dropout, state_size, output_size, q_states, ctx_states, model_type="gru"):
    # Compute attention of each context word representation with respect to
    # the question hidden states.
    if model_type == "gru":
        pass
    elif model_type == "lstm":
        # An LSTM state is a (c, h) tuple; take the second element, since
        # that corresponds to the hidden state h.
        q_states = q_states[-1]
    else:
        raise Exception('Must specify model type.')

    with vs.variable_scope("mixer"):
        # Affinity matrix: (batch_size, p_len, q_len)
        A = tf.nn.softmax(batch_matmul(ctx_states, tf.transpose(q_states, perm=[0, 2, 1])))

        # Context vectors: linear combination of question states weighted by A.
        C_P = batch_matmul(A, q_states)
        P = tf.concat(2, [C_P, ctx_states])

        W = tf.get_variable("W_mix", shape=(1, 2 * state_size, state_size),
                            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.get_variable("b_mix", shape=(1, output_size, state_size),
                            initializer=tf.contrib.layers.xavier_initializer())

        # Tile W across the batch so it can be applied with a batched matmul.
        batch_size = tf.shape(P)[0]
        W_tiled = tf.tile(W, [batch_size, 1, 1])
        ctx_state_rep = batch_matmul(P, W_tiled) + b
        return ctx_state_rep
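# The tiled-weight batched matmul above is equivalent to flattening the batch
# and time dimensions and using an ordinary 2-D matmul (as the first mixer
# does), which avoids materializing batch_size copies of W. A minimal sketch
# of that alternative; names and static shapes are illustrative:

import tensorflow as tf

def mix_via_reshape(P, W_2d, b, state_size, p_len):
    """P: (batch, p_len, 2*state_size); W_2d: (2*state_size, state_size)."""
    flat = tf.reshape(P, [-1, 2 * state_size])           # (batch * p_len, 2*state_size)
    mixed = tf.matmul(flat, W_2d)                        # (batch * p_len, state_size)
    mixed = tf.reshape(mixed, [-1, p_len, state_size])   # (batch, p_len, state_size)
    return mixed + b                                     # b broadcasts over the batch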
def coattn_encode(self):
    # Only for task: direct prediction.
    # (length, batch_size, dim)
    query_w_matrix = self.normal_encode(self.encoder_inputs, self.source_mask)
    context_w_matrix = self.normal_encode(self.ctx_inputs, self.ctx_mask, reuse=True)

    # A query variation could be added here (optional). The coattention mix
    # could also be taken out, but by experiment it does better than no
    # coattention. In PA4 this was also time-major.

    # (batch, p, size)
    p_encoding = tf.transpose(context_w_matrix, perm=[1, 0, 2])
    # (batch, q, size)
    q_encoding = tf.transpose(query_w_matrix, perm=[1, 0, 2])
    # (batch, size, q)
    q_encoding_t = tf.transpose(query_w_matrix, perm=[1, 2, 0])

    # 2). Q->P attention
    A = batch_matmul(p_encoding, q_encoding_t)  # (batch, p, q)
    A_p = tf.nn.softmax(A)

    # 3). P->Q attention, on the transposed affinity matrix
    A_t = tf.transpose(A, perm=[0, 2, 1])  # (batch, q, p)
    A_q = tf.nn.softmax(A_t)

    # 4). Query's context vectors
    C_q = batch_matmul(A_q, p_encoding)  # (batch, q, p) * (batch, p, size) -> (batch, q, size)

    # 5). Paragraph's context vectors
    q_emb = tf.concat(2, [q_encoding, C_q])
    C_p = batch_matmul(A_p, q_emb)  # (batch, p, q) * (batch, q, size * 2)

    # 6). Linear mix of the paragraph's context vectors and paragraph states
    co_att = tf.concat(2, [p_encoding, C_p])  # (batch, p, size * 3)

    # This must be another RNN layer; if it were just normal attention we
    # would not need a separate one.
    co_att = tf.transpose(co_att, perm=[1, 0, 2])  # (p, batch, size * 3)
    out = self.normal_encode(co_att, self.ctx_mask, scope_name="Final")
    return out
def setup_system(self):
    """
    After your modularized implementation of encoder and decoder, call the
    various functions inside encoder and decoder here to assemble your
    reading comprehension system!
    :return:
    """
    # Simple encoder stuff here.
    question_states, final_question_state = self.question_encoder.encode(
        self.question_embeddings,
        self.mask_q_placeholder,
        encoder_state_input=None,
        attention_inputs=None,
        model_type=self.flags.model_type)

    ctx_states, final_ctx_state = self.context_encoder.encode(
        self.context_embeddings,
        self.mask_ctx_placeholder,
        encoder_state_input=None,  # alternatively: final_question_state
        attention_inputs=None,
        model_type=self.flags.model_type)

    # ctx_states = self.mixer(final_question_state, ctx_states, model_type=self.flags.model_type)
    # Score each context state against the final question state:
    # (batch, p_len, hid) x (batch, hid, 1) -> (batch, p_len, 1)
    feed_states = batch_matmul(ctx_states, tf.expand_dims(final_question_state, 2))

    # The decoder takes the encoded representation to probability
    # distributions over the start / end index.
    self.start_probs, self.end_probs = self.decoder.decode(
        feed_states,
        self.mask_ctx_placeholder,
        self.dropout_placeholder,
        self.flags.state_size,
        model_type=self.flags.model_type)
def attention_encode(self):
    # (length, batch_size, dim)
    query_w_matrix = self.normal_encode(self.encoder_inputs, self.source_mask)
    context_w_matrix = self.normal_encode(self.ctx_inputs, self.ctx_mask, reuse=True)

    # A query variation could be added here (optional).

    # (batch, p, size)
    p_encoding = tf.transpose(context_w_matrix, perm=[1, 0, 2])
    # (batch, q, size)
    q_encoding = tf.transpose(query_w_matrix, perm=[1, 0, 2])
    # (batch, size, q)
    q_encoding_t = tf.transpose(query_w_matrix, perm=[1, 2, 0])

    # 2). Q->P attention
    A = batch_matmul(p_encoding, q_encoding_t)  # (batch, p, q)
    A_p = tf.nn.softmax(A)

    # 3). Paragraph's context vectors
    C_p = batch_matmul(A_p, q_encoding)

    # 4). Linear mix of the paragraph's context vectors and paragraph states
    flat_C_p = tf.reshape(C_p, [-1, self.FLAGS.size])
    flat_p_enc = tf.reshape(p_encoding, [-1, self.FLAGS.size])
    doshape = tf.shape(context_w_matrix)
    T, batch_size = doshape[0], doshape[1]

    # mixed_p: (batch * p_len, size)
    mixed_p = rnn_cell._linear([flat_C_p, flat_p_enc], self.FLAGS.size, bias=True)
    mixed_p = tf.reshape(mixed_p, tf.pack([T, -1, self.FLAGS.size]))

    # No extra layer of RNN on top of the coattention result.
    return mixed_p
def filter(self, Q, P):
    with vs.variable_scope("filter"):
        # Q is (batch_size, q_size, embed_size)
        # P is (batch_size, p_size, embed_size)

        # Normalize all embeddings to unit norm so that the dot product is
        # cosine similarity.
        Qn = tf.nn.l2_normalize(Q, dim=2)
        Pn = tf.nn.l2_normalize(P, dim=2)

        # R is shape (batch_size, q_size, p_size), R_ij = q_i dot p_j.
        R = batch_matmul(Qn, tf.transpose(Pn, perm=[0, 2, 1]))

        # Collect the maximum relevancy over the question words for each
        # paragraph word, shape (batch_size, p_size).
        r = tf.reduce_max(R, axis=1)
        # Expand to (batch_size, p_size, 1) to take advantage of broadcasting.
        r = tf.expand_dims(r, axis=2)

        # Re-weight the paragraph embeddings with the relevancy scores.
        P_filtered = P * r
        return P_filtered
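# A quick NumPy sanity check of the filtering idea above, on toy data; shapes
# and names are illustrative only:

import numpy as np

q = np.random.randn(2, 4, 8)   # (batch, q_size, embed)
p = np.random.randn(2, 6, 8)   # (batch, p_size, embed)
qn = q / np.linalg.norm(q, axis=2, keepdims=True)
pn = p / np.linalg.norm(p, axis=2, keepdims=True)
r = np.einsum('bqe,bpe->bqp', qn, pn)        # all pairwise cosine similarities
rel = r.max(axis=1)[..., None]               # (batch, p_size, 1)
p_filtered = p * rel
assert p_filtered.shape == p.shape
# Paragraph words whose best cosine match against the question is low
# are down-weighted.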
def matching_layer(self, Q_fw, Q_bw, P_fw, P_bw, num_perspectives):
    with vs.variable_scope("matching"):
        # Final question hidden states for each direction.
        Q_fw_final = Q_fw[:, -1, :]
        Q_bw_final = Q_bw[:, -1, :]

        # Normalize so every dot product below is a cosine similarity.
        Q_fw_n = tf.nn.l2_normalize(Q_fw, dim=2)
        Q_bw_n = tf.nn.l2_normalize(Q_bw, dim=2)
        Q_fw_final_n = tf.nn.l2_normalize(Q_fw_final, dim=1)
        Q_bw_final_n = tf.nn.l2_normalize(Q_bw_final, dim=1)
        P_fw_n = tf.nn.l2_normalize(P_fw, dim=2)
        P_bw_n = tf.nn.l2_normalize(P_bw, dim=2)

        # Full-matching: each paragraph state against the final question state.
        Q_fw_final_n = tf.expand_dims(Q_fw_final_n, 2)
        M_full_fw = batch_matmul(P_fw_n, Q_fw_final_n)
        Q_bw_final_n = tf.expand_dims(Q_bw_final_n, 2)
        M_full_bw = batch_matmul(P_bw_n, Q_bw_final_n)

        # Max-matching: best cosine similarity over all question states.
        M_max_fw = tf.reduce_max(batch_matmul(P_fw_n, tf.transpose(Q_fw_n, perm=[0, 2, 1])), axis=2, keep_dims=True)
        M_max_bw = tf.reduce_max(batch_matmul(P_bw_n, tf.transpose(Q_bw_n, perm=[0, 2, 1])), axis=2, keep_dims=True)

        # Mean-matching: average cosine similarity over all question states.
        M_mean_fw = tf.reduce_mean(batch_matmul(P_fw_n, tf.transpose(Q_fw_n, perm=[0, 2, 1])), axis=2, keep_dims=True)
        M_mean_bw = tf.reduce_mean(batch_matmul(P_bw_n, tf.transpose(Q_bw_n, perm=[0, 2, 1])), axis=2, keep_dims=True)

        # Collect all matching vectors and concatenate along the last
        # dimension (num_perspectives).
        M = tf.concat(2, [M_full_fw, M_full_bw, M_max_fw, M_max_bw, M_mean_fw, M_mean_bw])
        return M
        # Debug fallback that bypassed matching entirely:
        # return P_fw_n + P_bw_n
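# A toy NumPy illustration of the three matching strategies above (full, max,
# mean), for a single direction; names and shapes are illustrative:

import numpy as np

p = np.random.randn(2, 5, 8)                 # (batch, p_len, hid)
q = np.random.randn(2, 3, 8)                 # (batch, q_len, hid)
pn = p / np.linalg.norm(p, axis=2, keepdims=True)
qn = q / np.linalg.norm(q, axis=2, keepdims=True)

cos = np.einsum('bph,bqh->bpq', pn, qn)      # all pairwise cosines
m_full = cos[:, :, -1:]                      # vs. the final question state
m_max = cos.max(axis=2, keepdims=True)       # best match per paragraph word
m_mean = cos.mean(axis=2, keepdims=True)     # average match per paragraph word
m = np.concatenate([m_full, m_max, m_mean], axis=2)   # (batch, p_len, 3)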
def decode(self, knowledge_rep, masks, state_size, model_type="gru"):
    """
    Takes in a knowledge representation and outputs a probability estimation
    over all paragraph tokens for which token should be the start of the
    answer span, and which should be the end.

    :param knowledge_rep: a representation of the paragraph and question,
                          decided by how you choose to implement the encoder
    :return: (start_probs, end_probs)
    """
    with vs.variable_scope("answer_start"):
        W_start = tf.get_variable("W_start", shape=(1, 1, state_size),
                                  initializer=tf.contrib.layers.xavier_initializer())
        batch_size = tf.shape(knowledge_rep)[0]
        W_start = tf.tile(W_start, [batch_size, 1, 1])
        start_probs = batch_matmul(knowledge_rep, tf.transpose(W_start, perm=[0, 2, 1]))

    with vs.variable_scope("answer_end"):
        # Run a GRU over the knowledge representation before scoring end indices.
        cell = tf.nn.rnn_cell.GRUCell(state_size)
        all_end_probs, _ = tf.nn.dynamic_rnn(cell, knowledge_rep,
                                             sequence_length=masks,
                                             dtype=tf.float32,
                                             initial_state=None)
        W_end = tf.get_variable("W_end", shape=(1, 1, state_size),
                                initializer=tf.contrib.layers.xavier_initializer())
        end_probs = tf.reduce_sum(all_end_probs * W_end, reduction_indices=2)

    start_probs = tf.squeeze(start_probs, 2)

    # Mask out padded positions by adding a large negative constant to their
    # logits, so they get ~zero probability after the softmax.
    bool_masks = tf.cast(tf.sequence_mask(masks, maxlen=self.output_size), tf.float32)
    add_mask = -1e30 * (1.0 - bool_masks)
    start_probs = start_probs + add_mask
    end_probs = end_probs + add_mask

    return start_probs, end_probs
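# The additive mask above is the standard masked-softmax trick: adding -1e30
# to padded logits makes their softmax probability effectively zero. A small
# NumPy check of that claim:

import numpy as np

logits = np.array([2.0, 1.0, 0.5, 0.0])
mask = np.array([1.0, 1.0, 0.0, 0.0])     # last two positions are padding
masked = logits + -1e30 * (1.0 - mask)
probs = np.exp(masked - masked.max())
probs /= probs.sum()
# probs[2:] are ~0; probs[:2] form a proper distribution over the real tokens.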
def encode_v2(self, question_embeddings, document_embeddings, question_mask,
              context_mask, encoderb_state_input, dropout_keep_prob,
              max_question_len):
    """
    encode_v2()
    """
    # Shared LSTM cell.
    lstm_cell = tf.nn.rnn_cell.LSTMCell(self.state_size)
    lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, input_keep_prob=dropout_keep_prob)

    # Question -> LSTM -> Q
    with tf.variable_scope('question_embedding'):
        question_length = tf.reduce_sum(tf.cast(question_mask, tf.int32), reduction_indices=1)
        Q_prime, _ = dynamic_rnn(lstm_cell, question_embeddings,
                                 sequence_length=question_length, dtype=tf.float32)
        print("Q_prime: ", Q_prime)
        # Non-linear projection layer on top of the question encoding.
        Q = tf.tanh(batch_linear(Q_prime, max_question_len, True))
        Q = tf.transpose(Q, [0, 2, 1])
        print("Q: ", Q)

    with tf.variable_scope('context_embedding'):
        # Paragraph -> LSTM -> D
        context_length = tf.reduce_sum(tf.cast(context_mask, tf.int32), reduction_indices=1)
        D, _ = dynamic_rnn(lstm_cell, document_embeddings,
                           sequence_length=context_length, dtype=tf.float32)
        D = tf.transpose(D, [0, 2, 1])
        print("D: ", D)

    with tf.variable_scope('coattention'):
        # Affinity matrix between document and question encodings.
        L = batch_matmul(tf.transpose(D, [0, 2, 1]), Q)
        print("L: ", L)
        # Row-wise softmaxes over L and its transpose.
        A_Q = tf.map_fn(lambda x: tf.nn.softmax(x), L, dtype=tf.float32)
        A_D = tf.map_fn(lambda x: tf.nn.softmax(x), tf.transpose(L, [0, 2, 1]), dtype=tf.float32)
        print("A_Q: ", A_Q)
        print("A_D: ", A_D)

        C_Q = batch_matmul(D, A_Q)
        print("C_Q: ", C_Q)
        QC_Q = tf.concat(1, [Q, C_Q])
        print("concat: ", QC_Q)
        C_D = batch_matmul(QC_Q, A_D)
        print("C_D: ", C_D)

        # Final coattention context: (batch_size, context_length, 3 * hidden_size).
        co_att = tf.concat(1, [D, C_D])
        co_att = tf.transpose(co_att, [0, 2, 1])
        print("co_att: ", co_att)

    with tf.variable_scope('encoder'):
        # Bidirectional LSTM over the coattention context.
        cell_fw = tf.nn.rnn_cell.LSTMCell(self.state_size)
        cell_bw = tf.nn.rnn_cell.LSTMCell(self.state_size)
        cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, input_keep_prob=dropout_keep_prob)
        cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, input_keep_prob=dropout_keep_prob)

        # Compute the coattention encoding U.
        (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, co_att, sequence_length=context_length, dtype=tf.float32)
        print("fw out: ", fw_out)
        print("bw out: ", bw_out)
        U = tf.concat(2, [fw_out, bw_out])
        print("U: ", U)
        return U
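# For reference, the encode_v2 variants above follow the coattention encoder
# of the Dynamic Coattention Network (Xiong et al. 2016,
# https://arxiv.org/abs/1611.01604). In the paper's notation, with document
# encoding D and question encoding Q stored column-wise as (hidden, length)
# matrices:
#
#   L   = D^T Q                affinity matrix, (m+1, n+1)
#   A^Q = softmax(L)           attention weights over document positions
#   A^D = softmax(L^T)         attention weights over question positions
#   C^Q = D A^Q                question contexts, (hidden, n+1)
#   C^D = [Q ; C^Q] A^D        document contexts, (2*hidden, m+1)
#   U   = BiLSTM([D ; C^D])    coattention encoding, (2*hidden, m+1)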
def encode(self, c_len_placeholder, q_len_placeholder):
    # TODO??? original signature: inputs, masks, encoder_state_input
    """
    In a generalized encode function, you pass in your inputs, masks, and an
    initial hidden state input into this function.

    :param inputs: Symbolic representations of your input
    :param masks: this is to make sure tf.nn.dynamic_rnn doesn't iterate
                  through masked steps
    :param encoder_state_input: (Optional) pass this as initial hidden state
                                to tf.nn.dynamic_rnn to build conditional
                                representations
    :return: an encoded representation of your input. It can be context-level
             representation, word-level representation, or both.
    """
    if FLAGS.train_embeddings:
        embeddings = tf.Variable(self.pretrained_embeddings,
                                 name='trainable_embeddings', dtype=tf.float32)
    else:
        embeddings = tf.constant(self.pretrained_embeddings,
                                 name='pretrained_embeddings', dtype=tf.float32)

    q_vectors = tf.nn.embedding_lookup(params=embeddings, ids=self.q_ids_placeholder)
    assert q_vectors.get_shape().as_list() == [None, self.max_q, self.embed_size]
    c_vectors = tf.nn.embedding_lookup(params=embeddings, ids=self.c_ids_placeholder)
    assert c_vectors.get_shape().as_list() == [None, self.max_c, self.embed_size]

    # From now on, following terminology from https://arxiv.org/pdf/1611.01604.pdf
    l = 2 * self.rnn_size
    # TODO sentinel vector
    mplus1 = self.max_c
    nplus1 = self.max_q
    encoding_size = l
    xavier_initializer = tf.contrib.layers.xavier_initializer()

    with tf.variable_scope("Encoder_rnn") as scope:
        if FLAGS.encoder_rnn == 'BiLSTM':
            cell = tf.nn.rnn_cell.LSTMCell(num_units=self.rnn_size,
                                           initializer=xavier_initializer)
        if FLAGS.encoder_rnn == 'BiGRU':
            cell = tf.nn.rnn_cell.GRUCell(num_units=self.rnn_size)
        c_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell, cell, c_vectors, dtype=tf.float32,
            sequence_length=c_len_placeholder, scope=scope)
        scope.reuse_variables()
        q_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell, cell, q_vectors, dtype=tf.float32,
            sequence_length=q_len_placeholder, scope=scope)

    D = tf.concat_v2(c_outputs, 2)
    assert D.get_shape().as_list() == [None, mplus1, l]
    Q = tf.concat_v2(q_outputs, 2)
    assert Q.get_shape().as_list() == [None, nplus1, l]

    if FLAGS.cross_id_bias >= 0:
        # Project Q with a matrix initialized near a scaled identity.
        U = tf.Variable(name="U",
                        initial_value=FLAGS.cross_id_bias * np.identity(l)
                        + tf.random_uniform((l, l), -.01, .01),
                        dtype=tf.float32)
        Q = tf.reshape(tf.matmul(tf.reshape(Q, [-1, l]), U), [-1, nplus1, l])  # TODO tensordot
        tf.summary.histogram('Cross_Attn_U', U)

    L = batch_matmul(Q, D, adj_y=True)
    assert L.get_shape().as_list() == [None, nplus1, mplus1]
    tf.summary.histogram('Attn_L', L)

    encoding = D
    encoding_size = l
    if FLAGS.AD:
        if FLAGS.AQ:
            A_Q = softmax_partial(L, 2, c_len_placeholder)
            assert A_Q.get_shape().as_list() == L.get_shape().as_list()
            tf.summary.histogram('A_Q', A_Q)
            C_Q = batch_matmul(A_Q, D)
            assert C_Q.get_shape().as_list() == [None, nplus1, l]
            tf.summary.histogram('C_Q', C_Q)
            Q = tf.concat_v2([Q, C_Q], 2)
            encoding_size += l
        A_DT = softmax_partial(L, 1, q_len_placeholder)
        assert A_DT.get_shape().as_list() == L.get_shape().as_list()
        tf.summary.histogram('A_DT', A_DT)
        C_D = batch_matmul(A_DT, Q, adj_x=True)
        assert C_D.get_shape().as_list() == [None, mplus1, encoding_size]
        tf.summary.histogram('C_D', C_D)
        encoding = tf.concat_v2([encoding, C_D], 2)
        encoding_size += l

    assert encoding.get_shape().as_list() == [None, mplus1, encoding_size]
    return encoding
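# `softmax_partial` is not defined in this file; from its call sites it
# appears to be a length-masked softmax along a given axis of a 3-D tensor.
# A plausible sketch under that assumption (the name is from the source, but
# the exact semantics here are a guess):

import tensorflow as tf

def softmax_partial(logits, axis, lengths):
    """Softmax over `axis` of a (batch, n, m) tensor, restricted to the first
    `lengths` positions along that axis; padded positions get ~zero weight."""
    maxlen = tf.shape(logits)[axis]
    mask = tf.cast(tf.sequence_mask(lengths, maxlen=maxlen), tf.float32)
    if axis == 1:
        mask = tf.expand_dims(mask, 2)   # broadcast over the last dimension
    else:
        mask = tf.expand_dims(mask, 1)   # broadcast over the middle dimension
    masked = logits + -1e30 * (1.0 - mask)
    exps = tf.exp(masked - tf.reduce_max(masked, reduction_indices=axis, keep_dims=True))
    return exps / tf.reduce_sum(exps, reduction_indices=axis, keep_dims=True)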