def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    if self.FLAGS.attention_type == 'dot_product':
        print("<<<<<<<< Adding dot_product attention >>>")
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

    elif self.FLAGS.attention_type == 'self_attention':
        print("<<<<<<<<< Adding self-attention over basic attention >>>>>>>")
        basic_attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, basic_attn_output = basic_attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # (batch_size, context_len, hidden_size*2)

        self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.self_attn_zsize, self.FLAGS.hidden_size*2)
        _, self_attn_output = self_attn_layer.build_graph(basic_attn_output, self.context_mask)

        concated_basic_self = tf.concat([basic_attn_output, self_attn_output], axis=2)  # (batch_size, N, 4h)
        self_attn_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps = self_attn_encoder.build_graph(concated_basic_self, self.context_mask, scope_name="self_attn_encoder")  # (batch_size, N, hidden_size*2)

    elif self.FLAGS.attention_type == 'bidaf':
        print("<<<<<<<<< Adding BIDAF attention >>>>>>>")
        attn_layer = BidafAttn(self.keep_prob, self.FLAGS.hidden_size*2)
        c2q_attention, q2c_attention = attn_layer.build_graph(context_hiddens, question_hiddens, self.qn_mask, self.context_mask)

        # Combine the tensors to get the final output
        body_c2q_attention_mult = context_hiddens * c2q_attention  # (batch_size, num_keys(N), 2h)
        q2c_expanded = tf.expand_dims(q2c_attention, 1)  # (batch_size, 1, 2h)
        body_q2c_attention_mult = context_hiddens * q2c_expanded  # (batch_size, num_keys(N), 2h)
        blended_reps = tf.concat([c2q_attention, body_c2q_attention_mult, body_q2c_attention_mult], axis=2)  # (batch_size, N, 6h); context_hiddens removed

    blended_reps = tf.nn.dropout(blended_reps, self.keep_prob)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    with vs.variable_scope("ClassProb"):
        softmax_layer_class = CustomSimpleSoftmaxLayer()
        # Both have dimensions: shape (batch_size, 4)
        self.logits_class, self.probdist_class = softmax_layer_class.build_graph(blended_reps_final, self.context_mask, self.FLAGS.reduction_type)
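# The softmax layers used above (SimpleSoftmaxLayer / CustomSimpleSoftmaxLayer) are not
# defined in these snippets. Below is a minimal, hypothetical sketch of the masked softmax
# they presumably rely on: pad positions receive a large negative value so the logits are
# "-large in the pad locations" (as described in the docstrings) and the softmax puts ~0
# probability there. The helper name and the -1e30 constant are assumptions, not taken
# from the original code.
def masked_softmax_sketch(logits, mask, dim):
    """logits, mask: same shape (mask is 1 for real tokens, 0 for pads); dim: softmax axis."""
    exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)  # -large where padded, 0 elsewhere
    masked_logits = tf.add(logits, exp_mask)                # pads pushed to -large
    prob_dist = tf.nn.softmax(masked_logits, dim)           # rows sum to 1 over real tokens
    return masked_logits, prob_dist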
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.
    """
    # Use a RNN to get hidden states for the context and the question
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    _, context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    _, question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
    _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

    # Concat attn_output to context_hiddens to get blended_reps
    blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

    # Apply fully connected layer to each blended representation
    blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask, False)

    # Use softmax layer to compute probability distribution for end location
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask, False)
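# BasicAttn is instantiated throughout these snippets but never defined here. A minimal
# sketch of what a dot-product attention layer of this kind typically computes is shown
# below: each context (key) vector attends over the question (value) vectors via a masked
# softmax, and the attended question representation is returned alongside the attention
# distribution. The class name, dropout placement, and exact interface are assumptions.
class BasicAttnSketch(object):
    def __init__(self, keep_prob):
        self.keep_prob = keep_prob

    def build_graph(self, values, values_mask, keys):
        # keys:   (batch_size, num_keys, h)    e.g. context hidden states
        # values: (batch_size, num_values, h)  e.g. question hidden states
        attn_logits = tf.matmul(keys, tf.transpose(values, perm=[0, 2, 1]))    # (batch, num_keys, num_values)
        attn_logits_mask = tf.expand_dims(values_mask, 1)                       # (batch, 1, num_values)
        _, attn_dist = masked_softmax_sketch(attn_logits, attn_logits_mask, 2)  # softmax over the values
        output = tf.matmul(attn_dist, values)                                   # (batch, num_keys, h)
        output = tf.nn.dropout(output, self.keep_prob)
        return attn_dist, output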
def build_graph(self):
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2)
    _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

    attn_layer = R_Net_Attn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS)
    output = attn_layer.build_graph(attn_output, self.context_mask)  # output is shape (batch_size, context_len, hidden_size*2)

    blended_reps_final = tf.contrib.layers.fully_connected(
        tf.concat([attn_output, output], 2),
        num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    print(self.context_embs.shape)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    biattn_layer = AttentionFlowLayer(self.keep_prob, self.FLAGS.l2_lambda)
    biattn_output = biattn_layer.build_graph(context_hiddens, self.context_mask, question_hiddens, self.qn_mask, scope="AttnFlow")

    # RNNEncoder modeling layer
    model_layer = Model_Layer(self.FLAGS.hidden_size, self.keep_prob)
    model_output = model_layer.build_graph(biattn_output, self.context_mask)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(model_output, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)  # the mask keeps pad positions out of the distribution

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
      self.c2q_dist: (batch_size, context_len, question_len). Context-to-question attention
        probability. Each row should sum to 1 except if the context word is masked.
      self.q2c_dist: (batch_size, context_len). Question-to-context attention probability.
        Each row should sum to 1.
    """
    print("Building BIDAF")

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, mode=self.FLAGS.rnn_cell)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    attn_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
    self.c2q_attn_dist, self.q2c_attn_dist, attn_output = \
        attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # attn_output is shape (batch_size, context_len, hidden_size*4)

    # Concat attn_output to context_hiddens to get blended_reps
    blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*8)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
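# BidirectionAttn above is not defined in this snippet. The sketch below shows the standard
# BiDAF-style computation it is presumably based on: a similarity matrix S between context
# and question, context-to-question attention from a row-wise masked softmax over S, and
# question-to-context attention from a softmax over the row-wise max of S. The similarity
# function (a plain dot product here) and the output composition are assumptions; the
# original may use the trainable w_sim . [c; q; c*q] similarity from the BiDAF paper.
def bidaf_attention_sketch(context_hiddens, context_mask, question_hiddens, qn_mask):
    # context_hiddens: (batch, N, 2h), question_hiddens: (batch, M, 2h)
    S = tf.matmul(context_hiddens, tf.transpose(question_hiddens, perm=[0, 2, 1]))  # (batch, N, M)

    # Context-to-question: attend over question words for each context word
    _, c2q_dist = masked_softmax_sketch(S, tf.expand_dims(qn_mask, 1), 2)            # (batch, N, M)
    c2q = tf.matmul(c2q_dist, question_hiddens)                                       # (batch, N, 2h)

    # Question-to-context: a single distribution over context words
    m = tf.reduce_max(S, axis=2)                                                      # (batch, N)
    _, q2c_dist = masked_softmax_sketch(m, context_mask, 1)                           # (batch, N)
    q2c = tf.matmul(tf.expand_dims(q2c_dist, 1), context_hiddens)                     # (batch, 1, 2h)

    return c2q_dist, q2c_dist, c2q, q2c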
def build_graph(self):
    """Builds the main part of the graph for the model."""
    with vs.variable_scope("context"):
        context_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = context_encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    with vs.variable_scope("question"):
        question_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        question_hiddens = question_encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        question_last_hidden = tf.reshape(question_hiddens[:, -1, :], (-1, 2 * self.FLAGS.hidden_size))
        question_last_hidden = tf.contrib.layers.fully_connected(question_last_hidden, num_outputs=self.FLAGS.hidden_size)

    # Use context hidden states to attend to question hidden states
    # attn_output is shape (batch_size, context_len, hidden_size*2)
    # The following is BiDAF attention
    if self.FLAGS.use_bidaf:
        attn_layer = BiDAF(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size * 6)
    else:  # otherwise, basic attention
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)

    # Concat attn_output to context_hiddens to get blended_reps
    blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)
    blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)

    decoder = RNNDecoder(self.FLAGS.batch_size, self.FLAGS.hidden_size, self.ans_vocab_size,
                         self.FLAGS.answer_len, self.ans_embedding_matrix, self.keep_prob,
                         sampling_prob=self.sampling_prob,
                         schedule_embed=self.FLAGS.schedule_embed,
                         pred_method=self.FLAGS.pred_method)
    (self.train_logits, self.train_translations, _), \
        (self.dev_logits, self.dev_translations, self.attention_results) = \
        decoder.build_graph(blended_reps_final, question_last_hidden, self.ans_embs,
                            self.ans_mask, self.ans_ids, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    with vs.variable_scope("Encoder"):
        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
    _, _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # shapes are U_tilde: (batch_size, context_len, 2h), H_tilde: (batch_size, context_len, 1)

    # Concat attn_output to context_hiddens to get blended_reps
    blended_reps = tf.concat([context_hiddens, attn_output, context_hiddens * attn_output], axis=2)  # (batch_size, context_len, hidden_size*8)

    with vs.variable_scope("M1_init"):
        # Bidirectional GRU M1 (initial modeling layer)
        modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps_1_init = modeling_layer.build_graph(blended_reps, self.context_mask)  # (batch_size, N, 2h)

    with vs.variable_scope("M1"):
        # Bidirectional GRU M1
        modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps_1 = modeling_layer.build_graph(blended_reps_1_init, self.context_mask)  # (batch_size, N, 2h)

    with vs.variable_scope("M2"):
        # Bidirectional GRU M2
        modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps_2 = modeling_layer.build_graph(blended_reps_1, self.context_mask)  # (batch_size, N, 2h)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(tf.concat([blended_reps, blended_reps_1], axis=2), self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(tf.concat([blended_reps, blended_reps_2], axis=2), self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here two separate RNNEncoders are used, one for the context and one for the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    encoderQ = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask, "rnnencoder1")  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoderQ.build_graph(self.qn_embs, self.qn_mask, "rnnencoderQ")  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
    _, attn_output, new_attn = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, 2*self.FLAGS.hidden_size)  # attn_output is shape (batch_size, context_len, hidden_size*2)

    _, _, blended_reps_final = build_graph_middle(self, new_attn, attn_output, context_hiddens, question_hiddens)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
    _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

    # Concat attn_output to context_hiddens to get blended_reps
    blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
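# RNNEncoder is shared by all of the variants above but its definition is not included in
# these snippets. A minimal sketch of what such an encoder typically does is shown below:
# run a bidirectional dynamic RNN over the (masked) sequence and concatenate the forward
# and backward hidden states. The GRU cell choice and dropout placement are assumptions.
class RNNEncoderSketch(object):
    def __init__(self, hidden_size, keep_prob):
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob
        self.rnn_cell_fw = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(hidden_size), input_keep_prob=keep_prob)
        self.rnn_cell_bw = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(hidden_size), input_keep_prob=keep_prob)

    def build_graph(self, inputs, masks, scope_name="RNNEncoderSketch"):
        with tf.variable_scope(scope_name):
            input_lens = tf.reduce_sum(masks, reduction_indices=1)  # true sequence lengths
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
                self.rnn_cell_fw, self.rnn_cell_bw, inputs, input_lens, dtype=tf.float32)
            out = tf.concat([fw_out, bw_out], 2)  # (batch, seq_len, hidden_size*2)
            return tf.nn.dropout(out, self.keep_prob)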
def build_graph_coattention(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.
    """
    # Use a RNN to get hidden states for the context and the question
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    _, context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    _, question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Compute both-sided (co-)attention
    coatt = Coattention()
    co_att = coatt.build_graph(self.FLAGS.batch_size, question_hiddens, context_hiddens,
                               self.FLAGS.question_len, self.FLAGS.context_len,
                               2*self.FLAGS.hidden_size, self.keep_prob)
    co_att_final = tf.contrib.layers.fully_connected(co_att, num_outputs=self.FLAGS.hidden_size)

    # Use softmax layer to compute probability distribution for start location
    with vs.variable_scope("StartDist") as scp:
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(co_att_final, self.context_mask, True)
        scp.reuse_variables()

    # Use softmax layer to compute probability distribution for end location
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(co_att_final, self.context_mask, True)
def build_graph(self):
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    if self.FLAGS.max_word_len:
        context_hiddens = encoder.build_graph(
            tf.concat([self.context_embs, self.context_char_hidden], 2),
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            tf.concat([self.qn_embs, self.qn_char_hidden], 2),
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)
    else:
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    attn_layer = BiDAF_Attn(self.keep_prob, self.FLAGS.hidden_size * 2,
                            [self.FLAGS.batch_size, self.FLAGS.context_len, self.FLAGS.question_len])
    output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    blended_reps_final = tf.contrib.layers.fully_connected(output, num_outputs=self.FLAGS.hidden_size)

    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self):
    attn_layer = DynamicAttention_Attn(self.keep_prob, self.FLAGS)
    output = attn_layer.build_graph(self.qn_embs, self.qn_mask, self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    encoder = RNNEncoder(self.FLAGS.embedding_size * 2, self.keep_prob)
    context_hiddens = encoder.build_graph(output, self.context_mask)  # (batch_size, context_len, embedding_size*4)

    blended_reps_final = tf.contrib.layers.fully_connected(context_hiddens, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    with vs.variable_scope("encoder_layer1", reuse=tf.AUTO_REUSE):
        if self.FLAGS.use_stacked_encoder:
            encoder = StackedRNNEncoder(self.FLAGS.hidden_size, self.FLAGS.num_encoding_layers, self.keep_prob)
        else:
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    if self.FLAGS.num_encoding_layers > 1:
        with vs.variable_scope("encoder_layer2", reuse=tf.AUTO_REUSE):
            encoder2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder2.build_graph(context_hiddens, self.context_mask)
            question_hiddens = encoder2.build_graph(question_hiddens, self.qn_mask)

    # Use context hidden states to attend to question hidden states
    if self.FLAGS.bidaf:
        attn_layer = BiDirAttnFlow(self.keep_prob, self.FLAGS.hidden_size*2)
        blended_reps = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size*8)
    else:
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

    # Self-attention layer
    if self.FLAGS.self_attend:
        self_attn_layer = SelfAttn(self.keep_prob, blended_reps.shape[-1], self.FLAGS.self_attend_hidden_sz)
        blended_reps = self_attn_layer.build_graph(blended_reps, self.context_mask)  # (batch_size, context_len, 2*self_attend_hidden_sz)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # TODO: Modeling layer from BiDAF. We can add another RNN (two stacked, as in the
    # BiDAF paper) to the hidden states from the attention layer.
    if self.FLAGS.modeling_layer:
        with vs.variable_scope("Model_Layer", reuse=tf.AUTO_REUSE):
            model_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_final = model_layer.build_graph(blended_reps_final, self.context_mask)

    if self.FLAGS.modeling_layer and self.FLAGS.num_model_rnn_layers > 1:
        with vs.variable_scope("Model_layer2", reuse=tf.AUTO_REUSE):
            model_layer2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_final = model_layer2.build_graph(blended_reps_final, self.context_mask)
        # modeling_layer = StackedRNNEncoder(blended_reps_final.shape[-1], self.FLAGS.num_model_rnn_layers, self.keep_prob)
        # blended_reps_final = modeling_layer.build_graph(blended_reps_final, self.context_mask)

    if self.FLAGS.pointer_network:  # TODO: define flag
        with vs.variable_scope("OutputLayer", reuse=tf.AUTO_REUSE):
            pointer_network = PointerNetwork(self.keep_prob, blended_reps_final.shape[-1].value, self.FLAGS.hidden_size)
            (self.logits_start, self.probdist_start, _,
             self.logits_end, self.probdist_end, _) = \
                pointer_network.build_graph(blended_reps_final, self.context_mask)
    else:
        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.ops import embedding_ops

# In[]
with vs.variable_scope("embeddings"):
    # Note: the embedding matrix is a tf.constant which means it's not a trainable parameter
    embedding_matrix = tf.constant(emb_matrix, dtype=tf.float32, name="emb_matrix")  # shape (400002, embedding_size)

    # Get the word embeddings for the context and question,
    # using the placeholders self.context_ids and self.qn_ids
    context_embs = embedding_ops.embedding_lookup(embedding_matrix, context_ids)  # shape (batch_size, context_len, embedding_size)
    qn_embs = embedding_ops.embedding_lookup(embedding_matrix, qn_ids)  # shape (batch_size, question_len, embedding_size)

encoder = RNNEncoder(FLAGS.hidden_size, keep_prob)
context_hiddens = encoder.build_graph(context_embs, context_mask)  # (batch_size, context_len, hidden_size*2)
question_hiddens = encoder.build_graph(qn_embs, qn_mask)  # (batch_size, question_len, hidden_size*2)
question_variation = tf.layers.dense(question_hiddens, question_hiddens.get_shape()[2], activation=tf.tanh)

# In[]
# question_length = tf.placeholder(tf.int32, (None,), name='question_length')
# document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')
question_length = tf.reduce_sum(qn_mask, reduction_indices=1)  # shape (batch_size)
document_length = tf.reduce_sum(context_mask, reduction_indices=1)  # shape (batch_size)

unmasked_affinity = tf.einsum('ndh,nqh->ndq', context_hiddens, question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
affinity = maybe_mask_affinity(unmasked_affinity, document_length)
attention_p = tf.nn.softmax(affinity, dim=1)
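# maybe_mask_affinity is called above but not defined in this fragment. A plausible minimal
# implementation is sketched below, assuming the usual convention: positions beyond each
# document's true length get a large negative value so the subsequent softmax over the
# document dimension ignores padding. The actual helper may differ; the -1e30 fill value
# is an assumption.
def maybe_mask_affinity(affinity, sequence_length, affinity_mask_value=-1e30):
    # affinity: (batch, D, Q); sequence_length: (batch,) true document lengths
    mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(affinity)[1], dtype=tf.float32)  # (batch, D)
    mask = tf.expand_dims(mask, 2)  # (batch, D, 1), broadcasts over the question dimension
    return affinity * mask + (1.0 - mask) * affinity_mask_value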
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.h_hidden_size, self.keep_prob,
                         num_layers=self.FLAGS.h_num_layers,
                         combiner=self.FLAGS.h_combiner,
                         cell_type=self.FLAGS.h_cell_type)
    if self.FLAGS.share_encoder:
        question_hiddens, question_states_fw, question_states_bw = encoder.build_graph(
            self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)
    else:
        question_encoder = RNNEncoder(self.FLAGS.h_hidden_size, self.keep_prob,
                                      num_layers=self.FLAGS.h_num_layers,
                                      combiner=self.FLAGS.h_combiner,
                                      cell_type=self.FLAGS.h_cell_type,
                                      scope='question_encoder')
        question_hiddens, question_states_fw, question_states_bw = question_encoder.build_graph(
            self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    if not self.FLAGS.reuse_question_states:
        question_states_fw, question_states_bw = None, None

    context_hiddens, _, _ = encoder.build_graph(
        self.context_embs, self.context_mask,
        initial_states_fw=question_states_fw,
        initial_states_bw=question_states_bw)  # (batch_size, context_len, hidden_size*2)

    if self.FLAGS.use_bidaf:
        attn_layer = BiDAF(self.keep_prob)
        context_att, question_att = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
        blended_reps = tf.concat([context_hiddens, context_att,
                                  context_hiddens * context_att,
                                  context_hiddens * question_att], axis=2)
    else:
        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output, context_hiddens * attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

    if self.FLAGS.modeling_layer_uses_rnn:
        modelling_encoder = RNNEncoder(self.FLAGS.h_model_size, self.keep_prob,
                                       num_layers=self.FLAGS.h_model_layers,
                                       combiner=self.FLAGS.h_combiner,
                                       cell_type=self.FLAGS.h_cell_type,
                                       scope='blended_reps_scope')
        blended_reps_final, model_states_fw, model_states_bw = modelling_encoder.build_graph(blended_reps, self.context_mask)
    else:
        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.h_hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        if self.FLAGS.use_rnn_for_ends:
            end_encoder = RNNEncoder(self.FLAGS.h_model_size, self.keep_prob,
                                     num_layers=self.FLAGS.h_model_layers,
                                     combiner=self.FLAGS.h_combiner,
                                     cell_type=self.FLAGS.h_cell_type,
                                     scope='blended_reps_final')
            blended_reps_combined = tf.concat([blended_reps_final, tf.expand_dims(self.probdist_start, 2)], 2)
            blended_reps_final, _, _ = end_encoder.build_graph(blended_reps_combined, self.context_mask,
                                                               initial_states_fw=model_states_fw,
                                                               initial_states_bw=model_states_bw)
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here each input gets its own RNNEncoder (separate variable scopes).
    with vs.variable_scope("e1c"):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    with vs.variable_scope("e1q"):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # con_qn_hiddens = encoder.build_graph(self.con_qn_embs, self.con_qn_mask)
    # context_hiddens = con_qn_hiddens[:, :self.FLAGS.context_len, :]
    # question_hiddens = con_qn_hiddens[:, self.FLAGS.context_len:, :]

    # with vs.variable_scope("e2"):
    #     encoder1 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    #     context_hiddens = encoder1.build_graph(context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    #     question_hiddens = encoder1.build_graph(question_hiddens, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    # attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
    # _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

    with vs.variable_scope("a1"):
        attn_layer = BidirectionalAttnNew(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # attn_output is shape (batch_size, context_len, hidden_size*2)

    # Concat attn_output to context_hiddens to get blended_reps
    # blended_reps_c = tf.concat([context_hiddens, attn_output_val], axis=2)  # (batch_size, context_len, hidden_size*4)
    # blended_reps_q = tf.concat([question_hiddens, attn_output_key], axis=2)

    with vs.variable_scope("e2_1c"):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens_f = encoder.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    # with vs.variable_scope("a2"):
    #     attn_layer1 = BidirectionalAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
    #     _, _, attn_output_val, attn_output_key = attn_layer1.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)

    blended_reps_st = tf.concat([context_hiddens_f, attn_output], axis=2)

    with vs.variable_scope("e3c"):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens_f_end = encoder.build_graph(context_hiddens_f, self.context_mask)

    blended_reps_end = tf.concat([context_hiddens_f_end, attn_output], axis=2)

    # with vs.variable_scope("AnsPoiStRNN"):
    #     encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    #     start_hidden = encoder.build_graph(blended_reps, self.context_mask)
    #     print "OK1"
    # with vs.variable_scope("AnsPoiStATT"):
    #     attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
    #     start_att_dis, start_att_out = attn_layer.build_graph(question_hiddens, self.qn_mask, start_hidden)
    #     print start_att_dis.shape, start_att_out.shape
    #     print "OK2"
    # with vs.variable_scope("AnsPoiEnRNN"):
    #     encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    #     end_hidden = encoder.build_graph(start_att_out, self.context_mask)
    #     print "OK3"
    # with vs.variable_scope("AnsPoiStATT"):
    #     attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
    #     end_att_dis, _ = attn_layer.build_graph(end_hidden, self.context_mask, question_hiddens)
    #     print "OK4"

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final_st = tf.contrib.layers.fully_connected(blended_reps_st, num_outputs=self.FLAGS.hidden_size)  # (batch_size, context_len, hidden_size)
    blended_reps_final_end = tf.contrib.layers.fully_connected(blended_reps_end, num_outputs=self.FLAGS.hidden_size)
    # print "###", blended_reps_final.shape
    # print start_att_dis.shape, end_att_dis.shape

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final_st, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final_end, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed
        into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    if self.FLAGS.model == "baseline":
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    elif self.FLAGS.model in ("bidaf", "bidaf_dynamic", "bidaf_self_attn", "bidaf_dynamic_self_attn"):
        print("INSIDE the BIDAF model")
        encoder = RNNEncoder_LSTM(self.FLAGS.hidden_size, self.keep_prob)
    elif self.FLAGS.model in ("coatt", "coatt_dynamic", "coatt_dynamic_self_attn"):
        encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)

    if self.FLAGS.model not in ("coatt", "coatt_dynamic", "coatt_dynamic_self_attn"):
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Attention model
    # Use context hidden states to attend to question hidden states
    if self.FLAGS.model == "baseline":
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)

    # Attention model
    # Use context hidden states to attend to question hidden states
    if self.FLAGS.model == "coatt":
        # context_hiddens = encoder.build_graph(self.context_embs, self.context_mask, "context")  # (batch_size, context_len, hidden_size*2)
        # question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask, "question")  # (batch_size, question_len, hidden_size*2)
        context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

        attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
        blended_reps_final = attn_output
        # blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
        # blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            contextLen = tf.reduce_sum(self.context_mask, axis=1)
            cell = tf.contrib.rnn.LSTMBlockCell(2 * self.FLAGS.hidden_size)
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, attn_output, contextLen, dtype=tf.float32)
            U_1 = tf.concat([fw_out, bw_out], axis=2)
            out = tf.nn.dropout(U_1, self.keep_prob)
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(out, self.context_mask)

    elif self.FLAGS.model == "bidaf" or self.FLAGS.model == "bidaf_self_attn":
        attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # attn_output is shape (batch_size, context_len, hidden_size*8)
        # Set of vectors which produces a set of query-aware feature vectors for each word in the context
        # blended_reps = attn_output  # (batch_size, num_keys, 4*value_vec_size)

        if self.FLAGS.model == "bidaf_self_attn":
            self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
            _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
            attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2)  # (batch_size, context_len, 16*hidden_size)
        else:
            attn_output = attn_output_tmp

        # In BiDAF the attention output is fed to a modeling layer
        # The modeling layer is a 2-layer LSTM
        mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
        mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        blended_reps_start = tf.concat([attn_output, mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_start, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            # Concatenate the start logits with the modeling layer output to get the input to the end-word LSTM
            # self.logits_start has a shape of (batch_size, context_len)
            logits_start_expand = tf.expand_dims(self.logits_start, axis=2)  # (batch_size, context_len, 1)
            end_lstm_input = tf.concat([logits_start_expand, mod_layer_out], axis=2)  # (batch_size, context_len, 1 + hidden_size*2)

            # LSTM
            end_layer = END_WORD_LAYER(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_end = end_layer.build_graph(end_lstm_input, self.context_mask)
            blended_reps_end_final = tf.concat([attn_output, blended_reps_end], axis=2)

            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_end_final, self.context_mask)

    elif self.FLAGS.model == "bidaf_dynamic" or self.FLAGS.model == "bidaf_dynamic_self_attn":
        attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # attn_output is shape (batch_size, context_len, hidden_size*8)

        if self.FLAGS.model == "bidaf_dynamic_self_attn":
            self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
            _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
            attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2)  # (batch_size, context_len, 16*hidden_size)
        else:
            attn_output = attn_output_tmp

        # Set of vectors which produces a set of query-aware feature vectors for each word in the context
        # blended_reps = attn_output  # (batch_size, num_keys, 4*value_vec_size)

        # In BiDAF the attention output is fed to a modeling layer
        # The modeling layer is a 2-layer LSTM
        mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
        mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        blended_reps_start = tf.concat([attn_output, mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)

        # We now feed this to the dynamic decoder module coded in ANSWER_DECODER.
        # The outputs of the decoder are start, end, alpha_logits and beta_logits:
        # start and end have shape (batch_size, num_iterations);
        # alpha_logits and beta_logits have shape (batch_size, num_iterations, input_dim)
        decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations,
                                 self.FLAGS.max_pool, self.FLAGS.batch_size)
        u_s_init = mod_layer_out[:, 0, :]
        u_e_init = mod_layer_out[:, 0, :]
        start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(mod_layer_out, self.context_mask, u_s_init, u_e_init)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            # softmax_layer_start = SimpleSoftmaxLayer()
            logits_start_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in alpha_logits]
            self.alpha_logits, alpha_logits_probs = zip(*logits_start_tmp)
            self.logits_start, self.probdist_start = (self.alpha_logits[self.FLAGS.num_iterations - 1],
                                                      alpha_logits_probs[self.FLAGS.num_iterations - 1])

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            logits_end_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in beta_logits]
            self.beta_logits, beta_logits_probs = zip(*logits_end_tmp)
            self.logits_end, self.probdist_end = (self.beta_logits[self.FLAGS.num_iterations - 1],
                                                  beta_logits_probs[self.FLAGS.num_iterations - 1])

    elif self.FLAGS.model == "coatt_dynamic" or self.FLAGS.model == "coatt_dynamic_self_attn":
        context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

        attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        if self.FLAGS.model == "coatt_dynamic_self_attn":
            CoATT = attn_layer.build_graph1(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
            self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
            _, self_attn_output = self_attn_layer.build_graph(CoATT, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
            attn_output = tf.concat([CoATT, self_attn_output], axis=2)  # (batch_size, context_len, 16*hidden_size)
        else:
            U = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
            attn_output = U
        # blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

        # Feed the attention output to the dynamic answer decoder
        decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations,
                                 self.FLAGS.max_pool, self.FLAGS.batch_size)
        u_s_init = attn_output[:, 0, :]
        u_e_init = attn_output[:, 0, :]
        start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(attn_output, self.context_mask, u_s_init, u_e_init)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            # softmax_layer_start = SimpleSoftmaxLayer()
            logits_start_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in alpha_logits]
            self.alpha_logits, alpha_logits_probs = zip(*logits_start_tmp)
            self.logits_start, self.probdist_start = (self.alpha_logits[self.FLAGS.num_iterations - 1],
                                                      alpha_logits_probs[self.FLAGS.num_iterations - 1])

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            logits_end_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in beta_logits]
            self.beta_logits, beta_logits_probs = zip(*logits_end_tmp)
            self.logits_end, self.probdist_end = (self.beta_logits[self.FLAGS.num_iterations - 1],
                                                  beta_logits_probs[self.FLAGS.num_iterations - 1])
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_embs = self.context_embs qn_embs = self.qn_embs if self.FLAGS.enable_cnn: context_embs = tf.concat( [self.context_embs, self.context_char_embs], axis=2) qn_embs = tf.concat([self.qn_embs, self.qn_char_embs], axis=2) context_hiddens = encoder.build_graph( context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Encode query-aware representations of the context words bidaf_attn_layer = BidafAttn(self.keep_prob, self.FLAGS.context_len, self.FLAGS.hidden_size * 2) bidaf_out = bidaf_attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*8) # Condense the information: hidden_size*8 --> hidden_size*2 bidaf_out = tf.contrib.layers.fully_connected( bidaf_out, num_outputs=self.FLAGS.hidden_size * 2, normalizer_fn=tf.contrib.layers.batch_norm ) # (batch_size, context_len, hidden_size*2) # Co-attention co_attn_layer = CoAttnLite(self.keep_prob, self.FLAGS.hidden_size, self.FLAGS.hidden_size * 2) co_out = co_attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*2) bico_out = tf.concat([bidaf_out, co_out], 2) # (batch_size, context_len, hidden_size*4) # Capture interactions among context words conditioned on the query. gru_layer1 = RNNEncoder( self.FLAGS.hidden_size, self.keep_prob ) # params: (hidden_size*4 + hidden_size) * hidden_size * 2 * 3 model_reps1 = gru_layer1.build_graph( bico_out, self.context_mask, variable_scope='ModelGRU1' ) # (batch_size, context_len, hidden_size*2) gru_layer2 = RNNEncoder( self.FLAGS.hidden_size, self.keep_prob ) # params: (2*hidden_size + hidden_size) * hidden_size * 2 * 3 model_reps2 = gru_layer2.build_graph( model_reps1, self.context_mask, variable_scope='ModelGRU2' ) # (batch_size, context_len, hidden_size*2) # Self Attention & GRU layer parallel to GRU layer2. 
with tf.variable_scope('SelfAttnGRU'): self_attn_layer = MulAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) se_attn = self_attn_layer.build_graph( model_reps1, self.context_mask, model_reps1, self.context_mask) # (batch_size, context_len, hidden_size*2) se_gru_layer = RNNEncoder( self.FLAGS.hidden_size, self.keep_prob ) # params: (2*hidden_size + hidden_size) * hidden_size * 2 * 3 se_out = se_gru_layer.build_graph( se_attn, self.context_mask, variable_scope='SelfGRU' ) # (batch_size, context_len, hidden_size*2) model_reps = tf.concat([model_reps2, se_out], 2) # (batch_size, context_len, hidden_size*4) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): start_reps = tf.concat( [bico_out, model_reps], 2) # (batch_size, context_len, hidden_size*10) softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( start_reps, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): gru_end_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) model_end_reps = gru_end_layer.build_graph( model_reps, self.context_mask, variable_scope='EndGRU' ) # (batch_size, context_len, hidden_size*2) end_reps = tf.concat( [bico_out, model_end_reps], 2) # (batch_size, context_len, hidden_size*10) softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( end_reps, self.context_mask) for variable in tf.trainable_variables(): tf.summary.histogram(variable.name.replace(':', '/'), variable)
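# SimpleSoftmaxLayer is used by almost every variant in this excerpt but never defined here.
# A minimal sketch, assuming it is the usual "one logit per position, then masked softmax"
# output layer (masked_softmax as sketched earlier):
import tensorflow as tf

class SimpleSoftmaxLayer(object):
    """Collapses each position's hidden vector to a single logit, then applies a
    masked softmax over the context length."""

    def build_graph(self, inputs, masks):
        # inputs: (batch_size, context_len, hidden_dim); masks: (batch_size, context_len)
        logits = tf.contrib.layers.fully_connected(inputs, num_outputs=1, activation_fn=None)
        logits = tf.squeeze(logits, axis=[2])  # (batch_size, context_len)
        masked_logits, prob_dist = masked_softmax(logits, masks, 1)
        return masked_logits, prob_dist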
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # character-level CNN to get hybrid word embeddings charCnn = CharCNN(self.FLAGS.word_len, self.FLAGS.char_embedding_size, self.FLAGS.num_filters, self.FLAGS.kernel_size) # (batch_size, context_len, num_filters) char_context_hiddens = charCnn.build_graph(self.char_context_embs, self.char_context_mask, self.FLAGS.context_len) # (batch_size, question_len, num_filters) char_qn_hiddens = charCnn.build_graph(self.char_qn_embs, self.char_qn_mask, self.FLAGS.question_len) # hybrid word embeddings hybrid_context_embs = tf.concat( [self.context_embs, char_context_hiddens], axis=-1) # (batch_size, context_len, emb_size+char_emb_size) hybrid_qn_embs = tf.concat( [self.qn_embs, char_qn_hiddens], axis=-1) # (batch_size, question_len, emb_size+char_emb_size) # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, "GRU") context_hiddens = encoder.build_graph( hybrid_context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( hybrid_qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # coattention has been the best attention model I've found attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) u = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens) # shape (batch_size, context_len, 8*hidden_size) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected( u, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask)
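# The hybrid-embedding model above relies on a CharCNN class that is not shown. A minimal
# sketch of one plausible implementation (names and details are assumptions): a 1-D
# convolution over the characters of each word followed by max-pooling over the word.
import tensorflow as tf

class CharCNN(object):
    """Hypothetical character CNN: conv over characters, max-pool to one vector per word."""

    def __init__(self, word_len, char_emb_size, num_filters, kernel_size):
        self.word_len = word_len
        self.char_emb_size = char_emb_size
        self.num_filters = num_filters
        self.kernel_size = kernel_size

    def build_graph(self, char_embs, char_mask, seq_len):
        # char_embs: (batch_size, seq_len, word_len, char_emb_size); char_mask unused here
        # (a real implementation would zero out padded characters before pooling).
        # To share weights between context and question, wrap the two calls in one
        # variable scope with reuse.
        flat = tf.reshape(char_embs, [-1, self.word_len, self.char_emb_size])
        conv = tf.layers.conv1d(flat, filters=self.num_filters, kernel_size=self.kernel_size,
                                padding='same', activation=tf.nn.relu)
        pooled = tf.reduce_max(conv, axis=1)  # (batch_size*seq_len, num_filters)
        return tf.reshape(pooled, [-1, seq_len, self.num_filters])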
def build_graph(self): """ Builds the main part of the graph for the model Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # NOTE CHANGE: concantanate glove and elmo embedding # How to handle elmo context_len and glove context_len mismatch? # Just make the context_ids no max context_len context_embs_concat = tf.concat( [self.elmo_context_input, self.context_embs], 2) #(batch_size, qn_len, 1024+self.FLAGS.embedding_size) qn_embs_concat = tf.concat( [self.elmo_question_input, self.qn_embs], 2) #(batch_size, qn_len, 1024+self.FLAGS.embedding_size) #set shape so that it can pass to dynamic lstm context_embs_concat.set_shape( (None, None, 1024 + self.FLAGS.embedding_size)) qn_embs_concat.set_shape( (None, None, 1024 + self.FLAGS.embedding_size)) self.qn_mask.set_shape((None, None)) self.context_mask.set_shape((None, None)) with tf.variable_scope("biLSTM"): Encoder = RNNEncoder(self.FLAGS.hidden_size, keep_prob=self.keep_prob, cell_type="lstm", input_size=1024 + self.FLAGS.embedding_size) #shared weights (same scope) context_hiddens = Encoder.build_graph( context_embs_concat, self.context_mask, scope="context_question_encoder" ) #(batch_size, context_len, hidden_size*2) question_hiddens = Encoder.build_graph( qn_embs_concat, self.qn_mask, scope="context_question_encoder" ) #(batch_size, question_len, hidden_size*2) with tf.variable_scope("bidaf"): bidaf_object = Bidaf(self.FLAGS.hidden_size * 2, self.keep_prob) b = bidaf_object.build_graph( context_hiddens, question_hiddens, self.context_mask, self.qn_mask) #(batch_size, context_len, hidden_size*8) with tf.variable_scope("self_attn_layer"): SelfAttn_object = SelfAttn(self.FLAGS.hidden_size, self.FLAGS.hidden_size * 2, self.keep_prob, input_size=self.FLAGS.hidden_size * 2) M = SelfAttn_object.build_graph( b, self.context_mask, cell_type="lstm") #(batch_size, context_len, hidden_size*2) #Make prediction with tf.variable_scope('prediction_layer'): #Encode the self-attended context first with tf.variable_scope("final_lstm_layer"): final_lstm_object = RNNEncoder( self.FLAGS.hidden_size, keep_prob=self.keep_prob, cell_type="lstm", input_size=self.FLAGS.hidden_size * 2) M_prime = final_lstm_object.build_graph( M, self.context_mask, scope="final_lstm") #(batch_size, context_len, h*2) #Get start distribution with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( M_prime, self.context_mask) #both are (batch_size, context_len) with tf.variable_scope("EndDist"): logit_start_expand = tf.expand_dims( self.logits_start, axis=2) #(batch_size, context_len, 1) blended_end_rnn_input = tf.concat( [logit_start_expand, M_prime], axis=2) #(batch_size, context_len, hidden_size*2) end_dist_rnn = RNNEncoder(self.FLAGS.hidden_size, keep_prob=self.keep_prob, direction="unidirectional") end_rnn_output = end_dist_rnn.build_graph( blended_end_rnn_input, self.context_mask, scope="end_dist_rnn") # Get the end dist softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( end_rnn_output, 
self.context_mask)
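# Several of these models wrap BiDAF attention in a layer (Bidaf, BidafAttn, BDAttn, ...)
# whose internals are not included in this excerpt. A sketch of the standard similarity-matrix
# / context-to-question / question-to-context computation from the BiDAF paper, with the same
# masking convention used elsewhere in this excerpt (all names here are illustrative):
import tensorflow as tf

def bidaf_attention(c, q, c_mask, q_mask):
    """c: (B, N, 2h) context hiddens, q: (B, M, 2h) question hiddens.
    Returns c2q (B, N, 2h) and q2c (B, 2h)."""
    h = c.get_shape().as_list()[-1]
    w_c = tf.get_variable("w_c", [h])
    w_q = tf.get_variable("w_q", [h])
    w_cq = tf.get_variable("w_cq", [h])
    # Similarity S_ij = w_c . c_i + w_q . q_j + w_cq . (c_i * q_j)   -> (B, N, M)
    s = (tf.expand_dims(tf.einsum('bnh,h->bn', c, w_c), 2)
         + tf.expand_dims(tf.einsum('bmh,h->bm', q, w_q), 1)
         + tf.einsum('bnh,bmh->bnm', c * w_cq, q))
    q_pad = (1.0 - tf.cast(q_mask, tf.float32)) * (-1e30)
    s = s + tf.expand_dims(q_pad, 1)                        # mask padded question positions
    c2q = tf.matmul(tf.nn.softmax(s, dim=-1), q)            # (B, N, 2h)
    m = tf.reduce_max(s, axis=2)                             # (B, N)
    m = m + (1.0 - tf.cast(c_mask, tf.float32)) * (-1e30)    # mask padded context positions
    q2c = tf.einsum('bn,bnh->bh', tf.nn.softmax(m, dim=-1), c)  # (B, 2h)
    return c2q, q2c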
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. if self.FLAGS.self_attention: encoder = RNNEncoder(self.FLAGS.hidden_size_encoder, self.keep_prob) else: encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states if self.FLAGS.simple_attention: attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _, attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens ) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected( blended_reps, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) if self.FLAGS.co_attention: #This step sends the question embeddings through a fully-connected-layer to allow for variation between question_embedding and document embedding space question_hiddens_t = tf.transpose( question_hiddens, perm=[0, 2, 1]) #(batch_size,hidden_size*2,question_len) trans_question_hiddens_t = tf.contrib.layers.fully_connected( question_hiddens_t, num_outputs=self.FLAGS.question_len, activation_fn=tf.nn.tanh ) #(batch_size,hidden_size*2,question_len) trans_question_hiddens = tf.transpose( trans_question_hiddens_t, perm=[0, 2, 1]) #(batch_size,question_len,hidden_size*2) #Computing the coattention context co_attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) co_attn_output = co_attn_layer.build_graph( trans_question_hiddens, self.qn_mask, self.context_mask, context_hiddens) #(batch_size,context_len,6*hidden_size) # performing the fusion of temporal information to the coattention context via a bidirectional GRU with tf.variable_scope("co-attn-encoder"): co_attn_encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob) blended_reps_final = co_attn_encoder.build_graph( co_attn_output, self.context_mask) if self.FLAGS.self_attention: # implemrntation of self attention of the rnet paper self_attention_encoder = SelfAttn(self.FLAGS.hidden_size_encoder, self.FLAGS.hidden_size_qp, self.FLAGS.hidden_size_pp, self.keep_prob) v_p = 
self_attention_encoder.build_graph_qp( context_hiddens, question_hiddens, self.context_mask, self.qn_mask, self.FLAGS.context_len, self.FLAGS.question_len) h_p = self_attention_encoder.build_graph_pp( context_hiddens, question_hiddens, self.context_mask, self.qn_mask, v_p, self.FLAGS.context_len, self.FLAGS.question_len) blended_reps_final = tf.concat( [context_hiddens, v_p, h_p], axis=2) #(batch_size,context_len,5*hidden_size) if self.FLAGS.answer_pointer: #implementation of answer pointer as used in R-Net paper if self.FLAGS.co_attention: hidden_size_attn = self.FLAGS.hidden_size * 2 elif self.FLAGS.self_attention: hidden_size_attn = 2 * self.FLAGS.hidden_size_encoder + self.FLAGS.hidden_size_qp + 2 * self.FLAGS.hidden_size_pp else: hidden_size_attn = self.FLAGS.hidden_size answer_decoder = AnswerPointer(self.FLAGS.hidden_size_encoder, hidden_size_attn, self.FLAGS.question_len, self.keep_prob) p, logits = answer_decoder.build_graph_answer_pointer( question_hiddens, context_hiddens, blended_reps_final, self.FLAGS.question_len, self.FLAGS.context_len, self.qn_mask, self.context_mask) self.logits_start = logits[0] self.probdist_start = p[0] self.logits_end = logits[1] self.probdist_end = p[1] if self.FLAGS.simple_softmax: # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask)
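# The answer_pointer branch above delegates to an AnswerPointer class that is not part of this
# excerpt. A rough sketch of a two-step pointer network in the spirit of R-Net (everything
# below is an assumption about that class, not its actual code); it reuses the masked_softmax
# helper sketched earlier:
import tensorflow as tf

def answer_pointer(blended_reps, context_mask, question_summary, hidden_size):
    """blended_reps: (B, N, d) question-aware context reps.
    question_summary: (B, hidden_size) pooled question vector used as the initial state.
    Returns [start_logits, end_logits], each (B, N)."""
    cell = tf.nn.rnn_cell.GRUCell(hidden_size)
    state = question_summary
    all_logits = []
    for step in range(2):  # step 0 predicts the start, step 1 the end
        with tf.variable_scope("pointer", reuse=(step > 0)):
            n = tf.shape(blended_reps)[1]
            tiled_state = tf.tile(tf.expand_dims(state, 1), [1, n, 1])
            hidden = tf.contrib.layers.fully_connected(
                tf.concat([blended_reps, tiled_state], axis=2),
                num_outputs=hidden_size, activation_fn=tf.tanh)
            scores = tf.squeeze(tf.contrib.layers.fully_connected(
                hidden, num_outputs=1, activation_fn=None), axis=[2])  # (B, N)
            logits, probs = masked_softmax(scores, context_mask, 1)
            all_logits.append(logits)
            # The attended context vector drives the GRU, so the end step is
            # conditioned on the start prediction.
            attended = tf.reduce_sum(tf.expand_dims(probs, 2) * blended_reps, axis=1)
            _, state = cell(attended, state)
    return all_logits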
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ with tf.variable_scope('context_conv1') as scope: context_conv1_filter = truncated_normal_var( name='context_conv1_filter', shape=[1, 3, 50, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] context_conv1 = tf.nn.conv2d(self.context_character_embs, context_conv1_filter, strides, padding='SAME') context_conv1_bias = zero_var(name='context_conv1_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) context_conv1_add_bias = tf.nn.bias_add(context_conv1, context_conv1_bias) context_relu_conv1 = tf.nn.relu(context_conv1_add_bias) pool_size = [1, 1, 2, 1] context_pool1 = tf.nn.max_pool(context_relu_conv1, ksize=pool_size, strides=pool_size, padding='SAME', name='context_pool_layer1') with tf.variable_scope('context_conv2') as scope: context_conv2_filter = truncated_normal_var( name='context_conv2_filter', shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] context_conv2 = tf.nn.conv2d(context_pool1, context_conv2_filter, strides, padding='SAME') context_conv2_bias = zero_var(name='context_conv2_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) context_conv2_add_bias = tf.nn.bias_add(context_conv2, context_conv2_bias) context_relu_conv2 = tf.nn.relu(context_conv2_add_bias) pool_size = [1, 1, 3, 1] context_pool2 = tf.nn.max_pool(context_relu_conv2, ksize=pool_size, strides=pool_size, padding='SAME', name='context_pool_layer2') with tf.variable_scope('context_conv3') as scope: context_conv3_filter = truncated_normal_var( name='context_conv3_filter', shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] context_conv3 = tf.nn.conv2d(context_pool2, context_conv3_filter, strides, padding='SAME') context_conv3_bias = zero_var(name='context_conv3_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) context_conv3_add_bias = tf.nn.bias_add(context_conv3, context_conv3_bias) context_relu_conv3 = tf.nn.relu(context_conv3_add_bias) pool_size = [1, 1, 4, 1] context_pool3 = tf.nn.max_pool(context_relu_conv3, ksize=pool_size, strides=pool_size, padding='SAME', name='context_pool_layer3') context_flattened_layer = tf.reshape( context_pool3, [-1, self.FLAGS.context_len, 2 * self.FLAGS.CONV_SHAPE ]) #batch,300,192 context_final = tf.concat([self.context_embs, context_flattened_layer], axis=2) with tf.variable_scope('qn_conv1') as scope: qn_conv1_filter = truncated_normal_var( name='qn_conv1_filter', shape=[1, 3, 50, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] qn_conv1 = tf.nn.conv2d(self.qn_character_embs, qn_conv1_filter, strides, padding='SAME') qn_conv1_bias = zero_var(name='qn_conv1_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) qn_conv1_add_bias = tf.nn.bias_add(qn_conv1, qn_conv1_bias) qn_relu_conv1 = tf.nn.relu(qn_conv1_add_bias) pool_size = [1, 1, 2, 1] qn_pool1 = tf.nn.max_pool(qn_relu_conv1, ksize=pool_size, 
strides=pool_size, padding='SAME', name='qn_pool_layer1') with tf.variable_scope('qn_conv2') as scope: qn_conv2_filter = truncated_normal_var( name='qn_conv2_filter', shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] qn_conv2 = tf.nn.conv2d(qn_pool1, qn_conv2_filter, strides, padding='SAME') qn_conv2_bias = zero_var(name='qn_conv2_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) qn_conv2_add_bias = tf.nn.bias_add(qn_conv2, qn_conv2_bias) qn_relu_conv2 = tf.nn.relu(qn_conv2_add_bias) pool_size = [1, 1, 3, 1] qn_pool2 = tf.nn.max_pool(qn_relu_conv2, ksize=pool_size, strides=pool_size, padding='SAME', name='qn_pool_layer2') with tf.variable_scope('qn_conv3') as scope: qn_conv3_filter = truncated_normal_var( name='qn_conv3_filter', shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] qn_conv3 = tf.nn.conv2d(qn_pool2, qn_conv3_filter, strides, padding='SAME') qn_conv3_bias = zero_var(name='qn_conv3_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) qn_conv3_add_bias = tf.nn.bias_add(qn_conv3, qn_conv3_bias) qn_relu_conv3 = tf.nn.relu(qn_conv3_add_bias) pool_size = [1, 1, 3, 1] qn_pool3 = tf.nn.max_pool(qn_relu_conv3, ksize=pool_size, strides=pool_size, padding='SAME', name='qn_pool_layer3') qn_flattened_layer = tf.reshape( qn_pool3, [-1, self.FLAGS.question_len, 2 * self.FLAGS.CONV_SHAPE ]) #batch,30,128 qn_final = tf.concat([self.qn_embs, qn_flattened_layer], axis=2) encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) print("context_final final shape %s" % (context_final.get_shape())) print("context_mask final shape %s" % (self.context_mask.get_shape())) print("qn_final final shape %s" % (qn_final.get_shape())) print("qn_mask final shape %s" % (self.qn_mask.get_shape())) context_hiddens = encoder.build_graph( context_final, self.context_mask) # (batch_size, context_len, hidden_size*2+192) question_hiddens = encoder.build_graph( qn_final, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. 
#encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) #context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) #_, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens,self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) #blended_reps=attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens,self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8) #print("blended_reps shape %s" % (blended_reps.get_shape())) #model_encoder_1 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob,'model_layer_1') #model_layer_1=model_encoder_1.build_graph(blended_reps,self.qn_mask) #model_encoder_2= RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, 'model_layer_2') #model_layer_2=model_encoder_2.build_graph(model_layer_1,self.context_mask) attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask ) # attn_output is shape (batch_size, context_len, hidden_size*8) blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*8) model_encoder1 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, model_name="RNNModelEncoder1") blended_reps_thro_model_layer1 = model_encoder1.build_graph( blended_reps, self.context_mask) # (batch_size, context_len, hidden_size*2) model_encoder2 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, model_name="RNNModelEncoder2") blended_reps_thro_model_layer2 = model_encoder2.build_graph( blended_reps_thro_model_layer1, self.context_mask) # (batch_size, context_len, hidden_size*2) model_encoder3 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, model_name="RNNModelEncoder3") blended_reps_thro_model_layer3 = model_encoder3.build_graph( blended_reps_thro_model_layer2, self.context_mask) # (batch_size, context_len, hidden_size*2) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) blended_reps_final = tf.contrib.layers.fully_connected( blended_reps_thro_model_layer3, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) #blended_reps_final = tf.contrib.layers.fully_connected(model_layer_1,num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute 
probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask)
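# The hand-rolled conv/pool blocks above (context_conv1-3, qn_conv1-3) all follow the same
# pattern: 1x3 conv, bias, ReLU, max-pool over the character dimension. A sketch of the same
# block using tf.layers, equivalent up to weight initialization (helper name is hypothetical):
import tensorflow as tf

def char_conv_block(x, filters, pool_width, name):
    """One conv + ReLU + max-pool block over a (batch, seq_len, width, channels) tensor."""
    with tf.variable_scope(name):
        conv = tf.layers.conv2d(x, filters=filters, kernel_size=[1, 3], strides=[1, 1],
                                padding='same', activation=tf.nn.relu)
        return tf.layers.max_pooling2d(conv, pool_size=[1, pool_width],
                                       strides=[1, pool_width], padding='same')

# e.g. the three context blocks would collapse to:
#   x = char_conv_block(self.context_character_embs, self.FLAGS.CONV_SHAPE, 2, 'context_conv1')
#   x = char_conv_block(x, self.FLAGS.CONV_SHAPE, 3, 'context_conv2')
#   x = char_conv_block(x, self.FLAGS.CONV_SHAPE, 4, 'context_conv3')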
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) question_variation = tf.layers.dense(question_hiddens, question_hiddens.get_shape()[2], activation=tf.tanh) # In[] #question_length = tf.placeholder(tf.int32, (None,), name='question_length') #document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length') unmasked_affinity = tf.einsum( 'ndh,nqh->ndq', context_hiddens, question_variation) # [N, D, Q] or [N, 1+D, 1+Q] if sentinel affinity = maybe_mask_affinity(unmasked_affinity, self.document_length) attention_p = tf.nn.softmax(affinity, dim=1) unmasked_affinity_t = tf.transpose( unmasked_affinity, [0, 2, 1]) # [N, Q, D] or [N, 1+Q, 1+D] if sentinel affinity_t = maybe_mask_affinity(unmasked_affinity_t, self.question_length) attention_q = tf.nn.softmax(affinity_t, dim=1) summary_q = tf.einsum( 'ndh,ndq->nqh', context_hiddens, attention_p) # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel summary_d = tf.einsum( 'nqh,nqd->ndh', question_variation, attention_q) # [N, D, 2H] or [N, 1+D, 2H] if sentinel coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q) encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob) context2 = encoder1.build_graph( summary_d, self.context_mask) # (batch_size, context_len, hidden_size*2) question2 = encoder1.build_graph( summary_q, self.qn_mask) # (batch_size, question_len, hidden_size*2) unmasked_affinity1 = tf.einsum( 'ndh,nqh->ndq', context2, question2) # [N, D, Q] or [N, 1+D, 1+Q] if sentinel affinity1 = maybe_mask_affinity(unmasked_affinity1, self.document_length) attention_p1 = tf.nn.softmax(affinity1, dim=1) unmasked_affinity_t1 = tf.transpose( unmasked_affinity1, [0, 2, 1]) # [N, Q, D] or [N, 1+Q, 1+D] if sentinel affinity_t1 = maybe_mask_affinity(unmasked_affinity_t1, self.question_length) attention_q1 = tf.nn.softmax(affinity_t1, dim=1) summary_q1 = tf.einsum( 'ndh,ndq->nqh', context2, attention_p1) # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel summary_d1 = tf.einsum( 'nqh,nqd->ndh', question2, attention_q1) # [N, D, 2H] or [N, 1+D, 2H] if sentinel coattention_d1 = tf.einsum('nqh,nqd->ndh', summary_q1, attention_q1) # In[] document_representations = [ context_hiddens, # E^D_1 context2, # E^D_2 summary_d, # S^D_1 summary_d1, # S^D_2 coattention_d, # C^D_1 coattention_d1, # C^D_2 ] document_representation = tf.concat(document_representations, 2) encoder2 = RNNEncoder2(self.FLAGS.hidden_size, self.keep_prob) U = encoder2.build_graph(document_representation, self.context_mask) # Concat attn_output 
(here the final coattention encoding U plays the role of attn_output) to context_hiddens to get blended_reps blended_reps = tf.concat( [context_hiddens, U], axis=2) # (batch_size, context_len, hidden_size*4) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default blended_reps_final = tf.contrib.layers.fully_connected( blended_reps, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask)
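# maybe_mask_affinity is called on both affinity matrices above but is not defined in this
# excerpt. A minimal sketch, assuming it pushes rows beyond each example's true length to a
# very negative value so they vanish after the softmax over axis 1:
import tensorflow as tf

def maybe_mask_affinity(affinity, sequence_length, mask_value=-1e30):
    """affinity: (B, D, Q); sequence_length: (B,) true lengths along axis 1 (or None)."""
    if sequence_length is None:
        return affinity
    score_mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(affinity)[1])  # (B, D)
    score_mask = tf.tile(tf.expand_dims(score_mask, 2), [1, 1, tf.shape(affinity)[2]])
    return tf.where(score_mask, affinity, mask_value * tf.ones_like(affinity))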
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ if self.FLAGS.use_char_cnn: with vs.variable_scope('char_encoding'): self.context_char_encodings, self.cnn_filters1 = char_encoder2( self.context_char_embs, self.FLAGS.context_len, self.FLAGS.word_len, self.FLAGS.cnn_filter_width, self.FLAGS.char_embedding_size, self.FLAGS.n_cnn_filters) #self.context_char_encodings = tf.nn.dropout(self.context_char_encodings, self.keep_prob) tf.get_variable_scope().reuse_variables() self.qn_char_encodings, self.cnn_filters2 = char_encoder2( self.qn_char_embs, self.FLAGS.question_len, self.FLAGS.word_len, self.FLAGS.cnn_filter_width, self.FLAGS.char_embedding_size, self.FLAGS.n_cnn_filters) #self.qn_char_encodings = tf.nn.dropout(self.qn_char_encodings, self.keep_prob) joined_context_embs = tf.concat( [self.context_embs, self.context_char_encodings], axis=2) joined_qn_embs = tf.concat([self.qn_embs, self.qn_char_encodings], axis=2) assert joined_context_embs.shape[ 2] == self.FLAGS.embedding_size + self.FLAGS.n_cnn_filters assert joined_qn_embs.shape[ 2] == self.FLAGS.embedding_size + self.FLAGS.n_cnn_filters else: joined_context_embs = self.context_embs joined_qn_embs = self.qn_embs # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. 
with vs.variable_scope('embedding_layer'): encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph( joined_context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( joined_qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) attn_layer = BDAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask, q2c=self.FLAGS.use_q2c_attention ) # attn_output is shape (batch_size, context_len, hidden_size*6) blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*8) # Use context hidden states to attend to question hidden states #attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) #_, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2) with vs.variable_scope('layer1'): encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) layer1_reps = encoder.build_graph(blended_reps, self.context_mask) with vs.variable_scope('layer2'): encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) layer2_reps = encoder.build_graph(layer1_reps, self.context_mask) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default final_reps = tf.contrib.layers.fully_connected( layer2_reps, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) #final_reps = layer2_reps # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_start.build_graph( final_reps, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): #encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) #end_hiddens = encoder.build_graph(final_reps, self.context_mask) # (batch_size, context_len, hidden_size*2) softmax_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_end.build_graph( final_reps, self.context_mask)
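# The BDAttn layer above returns a hidden_size*6 tensor when q2c attention is enabled and a
# smaller one when it is not. One common way to assemble that output (a guess at what the
# layer does internally, using c2q/q2c as produced by a BiDAF attention like the sketch
# earlier in this excerpt):
import tensorflow as tf

def combine_bidaf(c, c2q, q2c, use_q2c=True):
    """c, c2q: (B, N, 2h); q2c: (B, 2h). Returns (B, N, 6h) with q2c, else (B, N, 4h)."""
    parts = [c2q, c * c2q]
    if use_q2c:
        parts.append(c * tf.expand_dims(q2c, 1))  # broadcast q2c across context positions
    return tf.concat(parts, axis=2)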
def build_graph_middle(self,new_attn,attn_output,context_hiddens,question_hiddens): matrix_dimensions_answer = context_hiddens.get_shape().as_list() batch_size_answer,matrix_size_answer,hidden_size_answer = matrix_dimensions_answer[0],matrix_dimensions_answer[1],matrix_dimensions_answer[2] matrix_dimensions_question = question_hiddens.get_shape().as_list() batch_size_question,matrix_size_question,hidden_size_question = matrix_dimensions_question[0],matrix_dimensions_question[1],matrix_dimensions_question[2] print(matrix_dimensions_answer,matrix_dimensions_question) ##time.sleep(100) #Add attention over attention code print("question",question_hiddens.get_shape().as_list()) print("pargraph",context_hiddens.get_shape().as_list()) print("attention matrix",new_attn.get_shape().as_list()) P2Q = tf.nn.softmax(new_attn,1) #(batch,paragraph,questions) QTilda = tf.matmul(P2Q,question_hiddens) #(batch,paragraph,hidden*2) same as paragraph Q2P = tf.nn.softmax(new_attn,2) Q2PTranspose = tf.transpose(Q2P,perm=[0,2,1]) PTilda = tf.matmul(Q2PTranspose,context_hiddens) #(batch,question,hidden*2) same as question print("P2Q",P2Q.get_shape().as_list()) print("QTilda",QTilda.get_shape().as_list()) print("Q2P",Q2P.get_shape().as_list()) print("PTilda",PTilda.get_shape().as_list()) #Fusion layer below #variable_temp = self.Fuse(QTilda,context_hiddens,"paragraphGate","paragraphMatch",context_hiddens) #print(variable_temp.get_shape().as_list()) print("AAA") ##time.sleep(100) paragraphNew = self.Fuse(QTilda,context_hiddens,"paragraphGate","paragraphMatchYOYO",context_hiddens) #(batch,paragraph,hidden) paragraphNew.set_shape([None,matrix_size_answer,hidden_size_answer]) questionNew = self.Fuse(PTilda,question_hiddens,"questionGate","questionMatch",question_hiddens) #(batch,question,hidden) questionNew.set_shape([None,matrix_size_question,hidden_size_question]) ##time.sleep(100) #paragraphNew = tf.Print(paragraphNew,[tf.shape(paragraphNew)]) #questionNew = tf.Print(questionNew,[tf.shape(questionNew)]) print(paragraphNew) print(questionNew) ##time.sleep(100) #paragraphNewMask = tf.placeholder(tf.int32, shape=[None, 1]) #questionNewMask = tf.placeholder(tf.int32, shape=[None, 1]) encoder2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) encoder2Q = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens_new = encoder2.build_graph(paragraphNew, self.context_mask,"rnnencoder2") #(batch,paragraph,context_len) question_hiddens_new = encoder2Q.build_graph(questionNew, self.qn_mask,"rnnencoder2Q") #(batch,question,context_len) #context_hiddens_new = paragraphNew #question_hiddens_new = questionNew #context_hiddens_new = tf.Print(context_hiddens_new,[tf.shape(context_hiddens_new)]) #question_hiddens_new = tf.Print(question_hiddens_new,[tf.shape(question_hiddens_new)]) print(context_hiddens_new.get_shape().as_list()) print("****") ####time.sleep(100) matrix_dimensions = tf.shape(context_hiddens) batch_size,matrix_size,hidden_size = matrix_dimensions[0],matrix_dimensions[1],matrix_dimensions[2] #Second fusing layer and softmax layer #New learnable matrix W1 = tf.get_variable("W1",shape=[matrix_size_answer,matrix_size_answer],trainable=True) #(matrix_size,matrix_size) #paragraphNewReshape = tf.reshape(context_hiddens_new,[batch_size*matrix_size,hidden_size]) paragraphNewTranspose = tf.transpose(context_hiddens_new,perm=[0,2,1]) paragraphNewReshape = tf.reshape(paragraphNewTranspose,[batch_size*hidden_size,matrix_size]) #(B*H,P) paragraphTempRep = tf.matmul(paragraphNewReshape,W1) #(B*H,P) paragraphTempRep2 = 
tf.reshape(paragraphTempRep,[batch_size,hidden_size,matrix_size]) paragraphTempRep3 = tf.matmul(paragraphTempRep2,context_hiddens_new) paragraphTempSoftmax = tf.nn.softmax(paragraphTempRep3) #(batch,hidden_size,hidden_size) paragraphSelfAllign = tf.matmul(paragraphTempSoftmax,tf.transpose(context_hiddens_new,perm=[0,2,1])) paragraphContextual = self.Fuse(tf.transpose(paragraphSelfAllign,perm=[0,2,1]),context_hiddens_new,"paragraphGate2","paragraphMatch2",context_hiddens) #(batch,pargraph,hidden) print(paragraphContextual.get_shape().as_list()) #time.sleep(100) #paragraphContextual = tf.Print(paragraphContextual,[tf.shape(paragraphContextual)]) ''' batch_size2,matrix_size2,hidden_size2 = matrix_dimensions2[0],matrix_dimensions2[1],matrix_dimensions2[2] matrix_dimensions2 = tf.shape(context_hiddens_new) questionNewReshape = tf.reshape(question_hiddens_new,[batch_size2*matrix_size2,hidden_size2]) questionTempRep = tf.matmul(tf.matmul(questionNewReshape,W1)) questionTempRep2 = tf.reshape(questionTempRep,[batch_size2,matrix_size2,hidden_size2]) questionTempRep3 = tf.matmul(questionTempRep2,tf.transpose(question_hiddens_new,dim=[0,2,1])) questionTempSoftmax = tf.nn.softmax(questionTempRep3) questionSelfAllign = tf.matmul(questionTempSoftmax,question_hiddens_new) ''' encoder3 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) #pargraphContextualMask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.context_len]) paragraphContextual.set_shape([batch_size_answer,matrix_size_answer,hidden_size_answer]) print(batch_size_answer,matrix_size_answer,hidden_size_answer) print(self.context_mask.get_shape().as_list()) #time.sleep(100) paragraphContextual = paragraphContextual #paragraphContextual=encoder3.build_graph(paragraphContextual, self.context_mask,"rnnencoder3") #(batch,paragraph,context_len) #Code to represent question matrix_dimensions2 = tf.shape(question_hiddens) batch_size2,matrix_size2,hidden_size2 = matrix_dimensions2[0],matrix_dimensions2[1],matrix_dimensions2[2] #encoder4 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) #questionSelfAllignMask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.question_len]) questionSelfAllign = question_hiddens_new #encoder4.build_graph(question_hiddens_new, self.qn_mask,"rnnencoder4") #(batch,question,H) Wq = tf.get_variable("Wq",shape=[1,question_hiddens.get_shape().as_list()[2]],trainable=True) #(1,h) questionSelfAllignTranspose = tf.transpose(questionSelfAllign,perm=[2,0,1]) questionSelfAllignReshape = tf.reshape(questionSelfAllignTranspose,[hidden_size2,matrix_size2*batch_size2]) GammaTemp = tf.matmul(Wq,questionSelfAllignReshape) GammaTemp2 = tf.reshape(GammaTemp,[batch_size2,1,matrix_size2]) Gamma = tf.nn.softmax(GammaTemp2) #(batch,1,question) questionContextual = tf.matmul(Gamma,questionSelfAllign) #(batch,1,hidden) print(questionContextual.get_shape().as_list()) ###time.sleep(100) #For start point of answer WeightSoftmaxStart = tf.get_variable("WeightSoftmaxStart",[question_hiddens.get_shape().as_list()[2],question_hiddens.get_shape().as_list()[2]],trainable=True) questionTranspose = tf.transpose(questionContextual,perm=[0,2,1]) questionContextualReshape = tf.reshape(questionTranspose,[batch_size,hidden_size]) tempMatrixMult1 = tf.matmul(questionContextualReshape,WeightSoftmaxStart) tempMatrixMult1Reshape = tf.reshape(tempMatrixMult1,[batch_size,1,hidden_size]) probStartMatrix = tf.matmul(tempMatrixMult1Reshape,tf.transpose(paragraphContextual,perm=[0,2,1])) #(b,1,n) ''' paragraphContextualTranspose = 
tf.reshape(paragraphContextual,[batch_size*matrix_size,hidden_size]) tempMatrixMult1 = tf.matmul(paragraphContextualTranspose,WeightSoftmaxStart) tempMatrixMult1Reshape = tf.reshape(tempMatrixMult1,[batch_size,matrix_size,1]) probStartMatrix = tf.matmul(tempMatrixMult1Reshape,questionContextual) #(batch,paragraph,context) ''' #For end point of answer WeightSoftmaxEnd = tf.get_variable("WeightSoftmaxEnd",[question_hiddens.get_shape().as_list()[2],question_hiddens.get_shape().as_list()[2]],trainable=True) #questionTranspose = tf.transpose(questionContextual,perm=[0,2,1]) #questionContextualReshape = tf.reshape(questionTranspose,[batch_size,hidden_size]) tempMatrixMult2 = tf.matmul(questionContextualReshape,WeightSoftmaxEnd) tempMatrixMult1Reshape2 = tf.reshape(tempMatrixMult2,[batch_size,1,hidden_size]) probEndMatrix = tf.matmul(tempMatrixMult1Reshape2,tf.transpose(paragraphContextual,perm=[0,2,1])) #(b,1,n) print(probStartMatrix.get_shape().as_list()) print(probEndMatrix.get_shape().as_list()) print("**************") probStartMatrix = tf.reshape(probStartMatrix,[batch_size,matrix_size]) probEndMatrix = tf.reshape(probEndMatrix,[batch_size,matrix_size]) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) return probStartMatrix,probEndMatrix,blended_reps_final
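# build_graph_middle calls self.Fuse(...) several times, but that method is not included in
# this excerpt. A heavily hypothetical sketch of a fusion gate in the spirit of those calls:
# a "match" projection of the aligned and original representations mixed with a residual
# input through a learned sigmoid gate.
import tensorflow as tf

def fuse(aligned, original, gate_scope, match_scope, residual):
    """aligned, original, residual: (B, L, d). Returns (B, L, d)."""
    d = original.get_shape().as_list()[-1]
    features = tf.concat([original, aligned, original * aligned, original - aligned], axis=2)
    with tf.variable_scope(match_scope):
        matched = tf.contrib.layers.fully_connected(features, num_outputs=d,
                                                    activation_fn=tf.tanh)
    with tf.variable_scope(gate_scope):
        gate = tf.contrib.layers.fully_connected(features, num_outputs=d,
                                                 activation_fn=tf.sigmoid)
    return gate * matched + (1.0 - gate) * residual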
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ print("Building Pointer Model") # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, mode=self.FLAGS.rnn_cell) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states # BIDAG LAYER bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size) _, _, bidaf_output = bidaf_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*6) bidaf_output = tf.concat([context_hiddens, bidaf_output], axis=2) # bs, c_l, 8h #SELF ATTENTION LAYER self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size, self.FLAGS.selfattn_size) _, self_attn_output = self_attn_layer.build_graph( bidaf_output, self.context_mask) # batch_size, context_len, 8 * hidden_size # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat( [bidaf_output, self_attn_output], axis=2) # (batch_size, context_len, hidden_size*16) self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="AttentionEncoder") blended_reps = self_attention_encoder.build_graph( blended_reps, self.context_mask) # batch_size, context_len, hidden_size * 2 # MODELING LAYER modeling_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="ModelingEncoder") modeling_output = modeling_encoder.build_graph(blended_reps, self.context_mask) modeling_encoder_two = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="ModelingEncoder2") modeling_output_two = modeling_encoder_two.build_graph( modeling_output, self.context_mask) total_reps_start = tf.concat([blended_reps, modeling_output], axis=2) total_reps_end = tf.concat([blended_reps, modeling_output_two], axis=2) # OUTPUT LAYER with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( total_reps_start, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( total_reps_end, self.context_mask)
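# The SelfAttn layer used in the pointer model above (and in several other variants) is not
# defined in this excerpt. A minimal sketch of masked multiplicative self-attention over the
# context, returning the (distribution, output) pair that the callers unpack:
import tensorflow as tf

class SelfAttn(object):
    """Hypothetical self-attention: each context position attends over all others."""

    def __init__(self, keep_prob, value_size, attn_size):
        self.keep_prob = keep_prob
        self.value_size = value_size
        self.attn_size = attn_size

    def build_graph(self, values, values_mask):
        # values: (B, N, value_size); values_mask: (B, N)
        w1 = tf.get_variable("W1_selfattn", [self.value_size, self.attn_size])
        w2 = tf.get_variable("W2_selfattn", [self.value_size, self.attn_size])
        queries = tf.tensordot(values, w1, axes=1)            # (B, N, attn_size)
        keys = tf.tensordot(values, w2, axes=1)                # (B, N, attn_size)
        scores = tf.matmul(queries, keys, transpose_b=True)    # (B, N, N)
        pad = (1.0 - tf.cast(tf.expand_dims(values_mask, 1), tf.float32)) * (-1e30)
        dist = tf.nn.softmax(scores + pad, dim=-1)
        output = tf.nn.dropout(tf.matmul(dist, values), self.keep_prob)  # (B, N, value_size)
        return dist, output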
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. context_input_lens = tf.reshape( tf.reduce_sum(tf.cast(tf.cast(self.context_char_ids, tf.bool), tf.int32), axis=2), [-1]) qn_input_lens = tf.reshape( tf.reduce_sum(tf.cast(tf.cast(self.qn_char_ids, tf.bool), tf.int32), axis=2), [-1]) cell_fw = rnn_cell.GRUCell(self.FLAGS.hidden_size) cell_bw = rnn_cell.GRUCell(self.FLAGS.hidden_size) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.context_char_embs, context_input_lens, dtype=tf.float32) ch_emb = tf.reshape( tf.concat([state_fw, state_bw], axis=1), [-1, self.FLAGS.context_len, 2 * self.FLAGS.hidden_size]) self.context_embs = tf.concat([self.context_embs, ch_emb], axis=2) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.qn_char_embs, qn_input_lens, dtype=tf.float32) qh_emb = tf.reshape( tf.concat([state_fw, state_bw], axis=1), [-1, self.FLAGS.question_len, 2 * self.FLAGS.hidden_size]) self.qn_embs = tf.concat([self.qn_embs, qh_emb], axis=2) # ToDo Deep encoder encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _, attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens ) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected( blended_reps, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with 
vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask)
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. if self.FLAGS.cell_type in ['rnn_gru', 'rnn_lstm']: encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, cell_type=self.FLAGS.cell_type) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) elif self.FLAGS.cell_type == 'qanet': encoder = QAEncoder(num_blocks=self.FLAGS.emb_num_blocks, num_layers=self.FLAGS.emb_num_layers, \ num_heads=self.FLAGS.emb_num_heads, \ filters=self.FLAGS.hidden_size, kernel_size=self.FLAGS.emb_kernel_size, \ keep_prob=self.keep_prob, input_mapping=True) context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) if self.FLAGS.attention == 'basic': # Use context hidden states to attend to question hidden states attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _, attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens ) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) elif self.FLAGS.attention == 'bidaf': attn_layer = BiDAFAttn(self.keep_prob) blended_reps = attn_layer.build_graph(context_hiddens, self.context_mask, question_hiddens, self.qn_mask) if self.FLAGS.modeling_layer == 'basic': # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected( blended_reps, num_outputs=self.FLAGS.hidden_size, weights_initializer=initializer_relu() ) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with tf.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, 
self.context_mask) elif self.FLAGS.modeling_layer == 'rnn': encoder_start = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, \ cell_type=self.FLAGS.cell_type, name='m1') m1 = encoder_start.build_graph(blended_reps, self.context_mask) encoder_end = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, \ cell_type=self.FLAGS.cell_type, name='m2') m2 = encoder_end.build_graph(m1, self.context_mask) with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( tf.concat([blended_reps, m1], -1), self.context_mask) with tf.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( tf.concat([blended_reps, m2], -1), self.context_mask) elif self.FLAGS.modeling_layer == 'qanet': modeling_encoder = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \ num_layers=self.FLAGS.model_num_layers, \ num_heads=self.FLAGS.model_num_heads, \ filters=self.FLAGS.hidden_size, \ kernel_size=self.FLAGS.model_kernel_size, \ keep_prob=self.keep_prob, input_mapping=False, \ name='modeling_encoder') m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size, \ kernel_size=1, padding='SAME', name='attn_mapping') m1 = modeling_encoder.build_graph(m0, self.context_mask) m2 = modeling_encoder.build_graph(m1, self.context_mask) m3 = modeling_encoder.build_graph(m2, self.context_mask) with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( tf.concat([m1, m2], -1), self.context_mask) with tf.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( tf.concat([m1, m3], -1), self.context_mask) elif self.FLAGS.modeling_layer == 'qanet2': modeling_encoder1 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \ num_layers=self.FLAGS.model_num_layers, \ num_heads=self.FLAGS.model_num_heads, \ filters=self.FLAGS.hidden_size, \ kernel_size=self.FLAGS.model_kernel_size, \ keep_prob=self.keep_prob, input_mapping=False, \ name='modeling_encoder1') ''' modeling_encoder2 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \ num_layers=self.FLAGS.model_num_layers, \ num_heads=self.FLAGS.model_num_heads, \ filters=self.FLAGS.hidden_size, \ kernel_size=self.FLAGS.model_kernel_size, \ keep_prob=self.keep_prob, input_mapping=False, \ name='modeling_encoder2') ''' m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size, \ kernel_size=1, padding='SAME', name='attn_mapping') m1 = modeling_encoder1.build_graph(m0, self.context_mask) m2 = modeling_encoder1.build_graph(m1, self.context_mask) with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( tf.concat([m0, m1], -1), self.context_mask) with tf.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( tf.concat([m0, m2], -1), self.context_mask)
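# The qanet options above rely on a QAEncoder whose definition is not part of this excerpt.
# A simplified sketch of one QANet-style encoder block (per the QANet paper: convolution
# sub-layers, a self-attention sub-layer and a feed-forward sub-layer, each with layer norm
# and a residual connection). Positional encodings, depthwise-separable convolutions and
# multi-head attention are omitted; the input is assumed to already have `filters` channels
# (the input_mapping flag above):
import tensorflow as tf

def qanet_block(x, mask, num_layers, filters, kernel_size, keep_prob, name):
    with tf.variable_scope(name):
        for i in range(num_layers):  # convolution sub-layers
            res = x
            x = tf.contrib.layers.layer_norm(x, scope='ln_conv_%d' % i)
            x = tf.layers.conv1d(x, filters=filters, kernel_size=kernel_size, padding='same',
                                 activation=tf.nn.relu, name='conv_%d' % i)
            x = tf.nn.dropout(x, keep_prob) + res
        res = x  # single-head self-attention sub-layer
        x = tf.contrib.layers.layer_norm(x, scope='ln_attn')
        scores = tf.matmul(x, x, transpose_b=True) / tf.sqrt(tf.cast(filters, tf.float32))
        scores += tf.expand_dims((1.0 - tf.cast(mask, tf.float32)) * (-1e30), 1)
        x = tf.matmul(tf.nn.softmax(scores), x) + res
        res = x  # feed-forward sub-layer
        x = tf.contrib.layers.layer_norm(x, scope='ln_ffn')
        x = tf.layers.conv1d(x, filters=filters, kernel_size=1, padding='same',
                             activation=tf.nn.relu, name='ffn')
        return x + res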
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # R-NET: gated attention layer
    #gated_attn_layer = GatedAttn(self.keep_prob)
    #context_hiddens = gated_attn_layer.build_graph(question_hiddens, context_hiddens, self.qn_mask)

    # Use context hidden states to attend to question hidden states
    #attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
    #_, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)

    # Concat attn_output to context_hiddens to get blended_reps
    #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

    # Use bidirectional attention flow
    biDAF_layer = BidirectionalAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
    C2Q, Q2C = biDAF_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)

    # Concat output_l, output_r to context_hiddens to get blended_reps
    #blended_reps = tf.concat([context_hiddens, C2Q, Q2C], axis=2) # (batch_size, context_len, hidden_size*6)
    blended_reps = tf.concat(
        [context_hiddens, C2Q, context_hiddens * C2Q, context_hiddens * Q2C],
        axis=2)  # (batch_size, context_len, hidden_size*8)

    # Modeling layer for BiDAF
    biDAF_modeling_layer = ModelingForBiDAF(self.FLAGS.hidden_size, self.keep_prob)
    modeling_output = biDAF_modeling_layer.build_graph(blended_reps, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    #encoder = RNNEncoderV2(self.FLAGS.hidden_size, self.keep_prob)
    #context_fw, context_bw = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
    #question_fw, question_bw = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

    #mpm_layer = BiMPM(self.FLAGS.hidden_size, self.keep_prob)
    #mpm_reps, _ = mpm_layer.build_graph(context_fw, context_bw, question_fw, question_bw, self.context_mask, self.qn_mask)
    #self.matching_scores = mpm_reps # (batch_size, context_len, 8*MP_dim)

    #mpm_layer = BiMPM(self.FLAGS.hidden_size*2, self.keep_prob)
    #mpm_reps, _ = mpm_layer.build_graph(context_hiddens, question_hiddens, self.context_mask, self.qn_mask)
    #self.matching_scores = mpm_reps # (batch_size, context_len, 8*MP_dim)

    # Modeling layer for mpm
    #mpm_modeling_layer = ModelingForBiDAF(self.FLAGS.hidden_size, self.keep_prob)
    #modeling_output = mpm_modeling_layer.build_graph(mpm_reps, self.context_mask) # (batch_size, context_len, hidden_size*2)

    # Coattention:
    #attn_layer = CoAttn(self.FLAGS.hidden_size*2, self.keep_prob)
    #coattn_output = attn_layer.build_graph(question_hiddens, context_hiddens, self.qn_mask, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)

    # R-NET: self attention layer
    #self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size*4)
    #self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size*8)
    #context_hiddens = self_attn_layer.build_graph(blended_reps, self.context_mask)
    #context_hiddens = self_attn_layer.build_graph(coattn_output, self.context_mask)

    # R-NET: pointer network output layer
    #ptr_net = PointerNetwork()
    #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = \
    #    ptr_net.build_graph(question_hiddens, context_hiddens, self.qn_mask, self.context_mask)
    #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = \
    #    ptr_net.build_graph(question_hiddens, coattn_output, self.qn_mask, self.context_mask)

    # Dynamic Pointing Decoder output layer
    dp_decoder = DynamicPointingDecoder(self.keep_prob)
    #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = dp_decoder.build_graph(coattn_output, self.context_mask)
    #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = dp_decoder.build_graph(blended_reps, self.context_mask)
    self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = dp_decoder.build_graph(
        modeling_output, self.context_mask)
    #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = dp_decoder.build_graph(mpm_reps, self.context_mask)
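# The BidirectionalAttn layer used above is not shown in this file. For orientation, a
# standard BiDAF-style computation of the C2Q and Q2C tensors it returns might look
# roughly like the sketch below. The trilinear similarity, masking, and tiling follow the
# original BiDAF paper and are assumptions about the layer, not this project's exact code.
import tensorflow as tf

def bidaf_attention(context_hiddens, context_mask, question_hiddens, qn_mask):
    """Sketch of BiDAF context-to-question (C2Q) and question-to-context (Q2C) attention.

    context_hiddens: (batch_size, N, 2h); question_hiddens: (batch_size, M, 2h).
    context_mask: (batch_size, N); qn_mask: (batch_size, M), 1s for real tokens, 0s for pads.
    Returns C2Q and Q2C, both of shape (batch_size, N, 2h).
    """
    N = tf.shape(context_hiddens)[1]
    M = tf.shape(question_hiddens)[1]
    c = tf.expand_dims(context_hiddens, 2)   # (bs, N, 1, 2h)
    q = tf.expand_dims(question_hiddens, 1)  # (bs, 1, M, 2h)

    # Trilinear similarity S_ij = w^T [c_i; q_j; c_i * q_j], shape (bs, N, M)
    sim_input = tf.concat([tf.tile(c, [1, 1, M, 1]),
                           tf.tile(q, [1, N, 1, 1]),
                           c * q], axis=3)
    S = tf.squeeze(tf.layers.dense(sim_input, 1, use_bias=False), axis=3)

    # C2Q: mask pad question positions, softmax over the question dimension
    qn_mask_exp = tf.expand_dims(qn_mask, 1)  # (bs, 1, M)
    S_masked = S + (1 - tf.cast(qn_mask_exp, tf.float32)) * (-1e30)
    alpha = tf.nn.softmax(S_masked)           # (bs, N, M), softmax over last dim
    C2Q = tf.matmul(alpha, question_hiddens)  # (bs, N, 2h)

    # Q2C: max over question positions, softmax over (masked) context positions
    m = tf.reduce_max(S_masked, axis=2)                            # (bs, N)
    m = m + (1 - tf.cast(context_mask, tf.float32)) * (-1e30)
    beta = tf.nn.softmax(m)                                        # (bs, N)
    q2c_vec = tf.matmul(tf.expand_dims(beta, 1), context_hiddens)  # (bs, 1, 2h)
    Q2C = tf.tile(q2c_vec, [1, N, 1])                              # (bs, N, 2h)
    return C2Q, Q2C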
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    with vs.variable_scope("EmbedLayer"):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
    _, a, c = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # a is shape (batch_size, context_len, hidden_size*2)

    # Concat the attention outputs to context_hiddens to get blended_reps
    c = tf.expand_dims(c, 1)  # (batch_size, 1, hidden_size*2), broadcast over context positions
    blended_reps = tf.concat(
        [context_hiddens, a, context_hiddens * a, context_hiddens * c],
        axis=2)  # (batch_size, context_len, hidden_size*8)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

    with vs.variable_scope("startModelLayer"):
        modeling_layer_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        m1 = modeling_layer_encoder.build_graph(blended_reps, self.context_mask)

    with vs.variable_scope("endModelLayer"):
        modeling_layer_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        m2 = modeling_layer_encoder.build_graph(blended_reps, self.context_mask)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        blended_reps_final_start = tf.concat([m1, blended_reps], axis=2)
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final_start, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        blended_reps_final_end = tf.concat([m2, blended_reps], axis=2)
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final_end, self.context_mask)
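# At inference time, probdist_start and probdist_end are typically combined into a single
# answer span. The sketch below shows one common heuristic: maximize
# p_start[i] * p_end[j] subject to i <= j < i + max_answer_len. The function name and the
# 30-token cap are assumptions for illustration, not this project's decoding code.
import numpy as np

def get_best_span(probdist_start, probdist_end, max_answer_len=30):
    """Selects a span from the start/end distributions of a single example.

    probdist_start, probdist_end: numpy arrays of shape (context_len,), each summing to 1.
    Returns (start_idx, end_idx) of the highest-probability span within max_answer_len tokens.
    """
    best_prob, best_span = 0.0, (0, 0)
    for i, p_start in enumerate(probdist_start):
        # Best end position for this start, restricted to a window of max_answer_len tokens
        j_max = min(len(probdist_end), i + max_answer_len)
        j = i + int(np.argmax(probdist_end[i:j_max]))
        prob = p_start * probdist_end[j]
        if prob > best_prob:
            best_prob, best_span = prob, (i, j)
    return best_span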
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """
    print("Running Attention Model with... %s" % self.FLAGS.attention)

    if self.FLAGS.attention == "BiDAF":
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        bidaf_attn_layer = BiDirectionalAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                             self.FLAGS.hidden_size * 2, self.FLAGS.question_len,
                                             self.FLAGS.context_len)
        _, context_to_question, _, question_to_context = bidaf_attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask)

        # Combine attention vectors and hidden context vector
        context_c2q = tf.multiply(context_hiddens, context_to_question)
        context_q2c = tf.multiply(context_hiddens, question_to_context)
        blended_reps = tf.concat(
            [context_hiddens, context_to_question, context_c2q, context_q2c],
            axis=2)  # (batch_size, context_len, hidden_size*8)

        # Modeling layers (2 layers of bidirectional LSTM) encode the query-aware representations of context words.
        modeling_layer = BiRNN(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps_1 = modeling_layer.build_graph(blended_reps, self.context_mask)  # (batch_size, context_len, hidden_size*2)

        modeling_layer_2 = BiRNN2(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps_final = modeling_layer_2.build_graph(blended_reps_1, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    else:  # Default: self.FLAGS.attention == "BasicAttn"
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
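# As the docstring notes, the start/end logits are kept at -large in pad positions so they
# can be fed straight into a cross-entropy loss. A minimal sketch of that loss is below;
# the helper name and the ans_span tensor holding gold (start, end) indices are assumptions
# about the surrounding training code, not necessarily this file's exact definitions.
import tensorflow as tf

def add_span_loss(logits_start, logits_end, ans_span):
    """Cross-entropy loss over answer-span start and end positions.

    logits_start, logits_end: (batch_size, context_len), already -large at pad positions.
    ans_span: (batch_size, 2) int tensor of gold (start, end) indices into the context.
    Returns a scalar loss: mean start loss plus mean end loss.
    """
    loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_start, labels=ans_span[:, 0])  # (batch_size,)
    loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_end, labels=ans_span[:, 1])    # (batch_size,)
    return tf.reduce_mean(loss_start) + tf.reduce_mean(loss_end)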