def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) if self.FLAGS.attention_type == 'dot_product': print("<<<<<<<< Adding dot_poduct attention >>>") attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) elif self.FLAGS.attention_type == 'self_attention': print("<<<<<<<<< Adding Self attention over basic attention >>>>>>>") basic_attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) _, basic_attn_output = basic_attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2) self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.self_attn_zsize, self.FLAGS.hidden_size*2) _, self_attn_output = self_attn_layer.build_graph(basic_attn_output, self.context_mask) concated_basic_self = tf.concat([basic_attn_output,self_attn_output], axis=2) #(bs,N,4h) self_attn_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) blended_reps = self_attn_encoder.build_graph(concated_basic_self, self.context_mask, scope_name="self_attn_encoder") # (batch_size, N, hidden_size*2) elif self.FLAGS.attention_type == 'bidaf': print("<<<<<<<<< Adding BIDAF attention >>>>>>>") attn_layer = BidafAttn(self.keep_prob, self.FLAGS.hidden_size*2) c2q_attention, q2c_attention = attn_layer.build_graph(context_hiddens, question_hiddens, self.qn_mask, self.context_mask) # Combined tensors o get final output..... 
body_c2q_attention_mult = context_hiddens*c2q_attention # (batch_size, num_keys(N), 2h) q2c_expanded = tf.expand_dims(q2c_attention, 1) #(bs,1,2h) body_q2c_attention_mult = context_hiddens*q2c_expanded # (batch_size, num_keys(N), 2h) blended_reps = tf.concat([c2q_attention, body_c2q_attention_mult, body_q2c_attention_mult], axis=2) #(bs,N,6h) # context_hiddens removed blended_reps = tf.nn.dropout(blended_reps, self.keep_prob) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) with vs.variable_scope("ClassProb"): softmax_layer_class = CustomSimpleSoftmaxLayer() #Both have dimesions: shape (batch_size, 4) self.logits_class, self.probdist_class = softmax_layer_class.build_graph(blended_reps_final, self.context_mask, self.FLAGS.reduction_type)
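# The docstrings in this file describe logits that are "-large in the pad locations"
# and probability distributions obtained by a masked softmax; the dynamic-decoder
# branches below call a masked_softmax helper directly. The following is a minimal
# sketch of such a helper, assuming the usual (logits, mask, dim) signature used in
# those branches; the project's actual implementation may differ in detail.
def masked_softmax_sketch(logits, mask, dim):
    """mask has 1s for real tokens and 0s for padding.
    Returns (masked_logits, prob_dist) as described in the docstrings above."""
    exp_mask = (1 - tf.cast(mask, 'float')) * (-1e30)  # -large where mask is 0
    masked_logits = tf.add(logits, exp_mask)           # pad positions become -large
    prob_dist = tf.nn.softmax(masked_logits, dim)      # pads get ~0 probability
    return masked_logits, prob_dist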
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ print("Building SelfAttention Model") # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, mode=self.FLAGS.rnn_cell) context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size) self.c2q_attn_dist, self.q2c_attn_dist, bidaf_output = bidaf_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*6) bidaf_output = tf.concat([context_hiddens, bidaf_output], axis=2) # bs, c_l, 8h self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size, self.FLAGS.selfattn_size) self.self_attn_dist, self_attn_output = self_attn_layer.build_graph(bidaf_output, self.context_mask) # batch_size, context_len, 2 * hidden_size # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([bidaf_output, self_attn_output], axis=2) # (batch_size, context_len, hidden_size*10) self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers= 2 * self.FLAGS.num_layers, name="AttentionEncoder") blended_reps = self_attention_encoder.build_graph(blended_reps, self.context_mask) # batch_size, context_len, hidden_size * 2 # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default # blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps, self.context_mask) end_pointer = tf.concat([tf.expand_dims(self.probdist_start, -1), blended_reps], axis=2) # end_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers= self.FLAGS.num_layers, name="EndEncoder") # end_pointer = end_encoder.build_graph(end_pointer, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with 
vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(end_pointer, self.context_mask)
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ print("Building Pointer Model") # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, mode=self.FLAGS.rnn_cell) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states # BIDAG LAYER bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size) _, _, bidaf_output = bidaf_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*6) bidaf_output = tf.concat([context_hiddens, bidaf_output], axis=2) # bs, c_l, 8h #SELF ATTENTION LAYER self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size, self.FLAGS.selfattn_size) _, self_attn_output = self_attn_layer.build_graph( bidaf_output, self.context_mask) # batch_size, context_len, 8 * hidden_size # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat( [bidaf_output, self_attn_output], axis=2) # (batch_size, context_len, hidden_size*16) self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="AttentionEncoder") blended_reps = self_attention_encoder.build_graph( blended_reps, self.context_mask) # batch_size, context_len, hidden_size * 2 # MODELING LAYER modeling_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="ModelingEncoder") modeling_output = modeling_encoder.build_graph(blended_reps, self.context_mask) modeling_encoder_two = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="ModelingEncoder2") modeling_output_two = modeling_encoder_two.build_graph( modeling_output, self.context_mask) total_reps_start = tf.concat([blended_reps, modeling_output], axis=2) total_reps_end = tf.concat([blended_reps, modeling_output_two], axis=2) # OUTPUT LAYER with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( total_reps_start, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( total_reps_end, self.context_mask)
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _, attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens ) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) #Applying Self attention to blended_reps self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 4) self_attn_output = self_attn_layer.build_graph(blended_reps, self.context_mask) # Concat self_attn_output to blended_reps to get attn_reps attn_reps = tf.concat([blended_reps, self_attn_output], axis=2) #Feeding attn_reps to a Bidirectional GRU to get h encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, "RNNEncoder2") h = encoder.build_graph(attn_reps, self.context_mask) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected( h, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask)
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. if self.FLAGS.model == "baseline" : encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) elif self.FLAGS.model == "bidaf" or self.FLAGS.model == "bidaf_dynamic" or self.FLAGS.model=="bidaf_self_attn" or self.FLAGS.model=="bidaf_dynamic_self_attn": print("INSIDE the BIDAF model") encoder = RNNEncoder_LSTM(self.FLAGS.hidden_size, self.keep_prob) elif self.FLAGS.model == "coatt" or self.FLAGS.model == "coatt_dynamic" or self.FLAGS.model=="coatt_dynamic_self_attn": encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob) if self.FLAGS.model != "coatt" and self.FLAGS.model != "coatt_dynamic" and self.FLAGS.model!="coatt_dynamic_self_attn": context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Attention model # Use context hidden states to attend to question hidden states if self.FLAGS.model == "baseline" : attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _,attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final,self.context_mask) # Attention model # Use context hidden states to attend to question hidden states if self.FLAGS.model == "coatt" : #context_hiddens = encoder.build_graph(self.context_embs, 
self.context_mask, "context") # (batch_size, context_len, hidden_size*2) #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask, "question") # (batch_size, question_len, hidden_size*2) context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask) attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) blended_reps_final = attn_output #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): contextLen = tf.reduce_sum(self.context_mask, axis=1) cell = tf.contrib.rnn.LSTMBlockCell(2 * self.FLAGS.hidden_size) (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, attn_output, contextLen, dtype = tf.float32) U_1 = tf.concat([fw_out, bw_out], axis=2) out = tf.nn.dropout(U_1, self.keep_prob) softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(out,self.context_mask) elif self.FLAGS.model =="bidaf" or self.FLAGS.model=="bidaf_self_attn": attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8) # Set of vectors which produces a set of query aware feature vectors for each word in the context #blended_reps = attn_output #(batch_size, num_keys, 4*value_vec_size) if self.FLAGS.model == "bidaf_self_attn": self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8) _,self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask) #(batch_size, conetx_len, 8*hidden_size) attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size) else: attn_output = attn_output_tmp # In BIDAF the attention output is feed to a modeling layer # The Modeling layer is a 2 layer lstm mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob) mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask) # (batch_size, context_len, hidden_size*2) blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2) # (batch_size, context_len, hidden_size*10) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with 
vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_start, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): # Concatenate the start logits with the modelling layer output to get the input to the # end word lstm #self.logits_start has a shape of #(batch_size, context_len) logits_start_expand = tf.expand_dims(self.logits_start, axis=2) #(batch_size, context_len, 1) end_lstm_input = tf.concat([logits_start_expand, mod_layer_out], axis=2) #(batch_size, context_len, 1 + hidden_size*2) # LSTM end_layer = END_WORD_LAYER(self.FLAGS.hidden_size, self.keep_prob) blended_reps_end = end_layer.build_graph(end_lstm_input, self.context_mask) blended_reps_end_final = tf.concat([attn_output, blended_reps_end], axis=2) softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_end_final, self.context_mask) elif self.FLAGS.model =="bidaf_dynamic" or self.FLAGS.model =="bidaf_dynamic_self_attn": attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8) if self.FLAGS.model == "bidaf_dynamic_self_attn": self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8) _,self_attn_output = self_attn_layer.build_graph(attn_output_tmp,self.context_mask) # (batch_size, conetx_len, 8*hidden_size) attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size) else: attn_output = attn_output_tmp # Set of vectors which produces a set of query aware feature vectors for each word in the context #blended_reps = attn_output #(batch_size, num_keys, 4*value_vec_size) # In BIDAF the attention output is feed to a modeling layer # The Modeling layer is a 2 layer lstm mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob) mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask) # (batch_size, context_len, hidden_size*2) blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2) # (batch_size, context_len, hidden_size*10) # We now feed this to dynamic decoder module coded in Answer decoder # the output of the decoder are start, end, alpha_logits and beta_logits # start and end have a shape of (batch_size, num_iterations) #alpha_logits and beta_logits have a shape of (batch_size, num_iterations, inpit_dim) decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size) u_s_init = mod_layer_out[:,0,:] u_e_init = mod_layer_out[:,0,:] start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(mod_layer_out, self.context_mask, u_s_init, u_e_init) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): #softmax_layer_start = SimpleSoftmaxLayer() logits_start_tmp = [masked_softmax(logits, self.context_mask,1) for logits in alpha_logits] self.alpha_logits , alpha_logits_probs = zip(*logits_start_tmp) 
self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations -1], alpha_logits_probs[self.FLAGS.num_iterations -1] # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): logits_end_tmp = [masked_softmax(logits, self.context_mask,1) for logits in beta_logits] self.beta_logits , beta_logits_probs = zip(*logits_end_tmp) self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations -1], beta_logits_probs[self.FLAGS.num_iterations -1] elif self.FLAGS.model =="coatt_dynamic" or self.FLAGS.model == "coatt_dynamic_self_attn": context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask) attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) if self.FLAGS.model == "coatt_dynamic_self_attn": CoATT = attn_layer.build_graph1(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8) _, self_attn_output = self_attn_layer.build_graph(CoATT, self.context_mask) # (batch_size, conetx_len, 8*hidden_size) attn_output = tf.concat([CoATT, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size) else: U = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) attn_output = U #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size) u_s_init = attn_output[:,0,:] u_e_init = attn_output[:,0,:] start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(attn_output, self.context_mask, u_s_init, u_e_init) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): #softmax_layer_start = SimpleSoftmaxLayer() logits_start_tmp = [masked_softmax(logits, self.context_mask,1) for logits in alpha_logits] self.alpha_logits , alpha_logits_probs = zip(*logits_start_tmp) self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations -1], alpha_logits_probs[self.FLAGS.num_iterations -1] # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): logits_end_tmp = [masked_softmax(logits, self.context_mask,1) for logits in beta_logits] self.beta_logits , beta_logits_probs = zip(*logits_end_tmp) self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations -1], beta_logits_probs[self.FLAGS.num_iterations -1]
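# The dynamic-decoder branches above keep the per-iteration logits in
# self.alpha_logits / self.beta_logits but only predict from the final iteration.
# A natural companion (not shown in this file) is a loss summed over all decoder
# iterations, as in Dynamic Coattention Networks. The sketch below assumes
# hypothetical span labels `ans_span` of shape (batch_size, 2); the real label
# tensors and loss wiring live elsewhere in the project.
def cumulative_decoder_loss_sketch(alpha_logits, beta_logits, ans_span):
    losses = []
    for alpha, beta in zip(alpha_logits, beta_logits):  # one (start, end) pair per iteration
        loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=alpha, labels=ans_span[:, 0])  # (batch_size,)
        loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=beta, labels=ans_span[:, 1])
        losses.append(tf.reduce_mean(loss_start) + tf.reduce_mean(loss_end))
    return tf.add_n(losses)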
def build_graph(self): """ Builds the main part of the graph for the model Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # NOTE CHANGE: concantanate glove and elmo embedding # How to handle elmo context_len and glove context_len mismatch? # Just make the context_ids no max context_len context_embs_concat = tf.concat( [self.elmo_context_input, self.context_embs], 2) #(batch_size, qn_len, 1024+self.FLAGS.embedding_size) qn_embs_concat = tf.concat( [self.elmo_question_input, self.qn_embs], 2) #(batch_size, qn_len, 1024+self.FLAGS.embedding_size) #set shape so that it can pass to dynamic lstm context_embs_concat.set_shape( (None, None, 1024 + self.FLAGS.embedding_size)) qn_embs_concat.set_shape( (None, None, 1024 + self.FLAGS.embedding_size)) self.qn_mask.set_shape((None, None)) self.context_mask.set_shape((None, None)) with tf.variable_scope("biLSTM"): Encoder = RNNEncoder(self.FLAGS.hidden_size, keep_prob=self.keep_prob, cell_type="lstm", input_size=1024 + self.FLAGS.embedding_size) #shared weights (same scope) context_hiddens = Encoder.build_graph( context_embs_concat, self.context_mask, scope="context_question_encoder" ) #(batch_size, context_len, hidden_size*2) question_hiddens = Encoder.build_graph( qn_embs_concat, self.qn_mask, scope="context_question_encoder" ) #(batch_size, question_len, hidden_size*2) with tf.variable_scope("bidaf"): bidaf_object = Bidaf(self.FLAGS.hidden_size * 2, self.keep_prob) b = bidaf_object.build_graph( context_hiddens, question_hiddens, self.context_mask, self.qn_mask) #(batch_size, context_len, hidden_size*8) with tf.variable_scope("self_attn_layer"): SelfAttn_object = SelfAttn(self.FLAGS.hidden_size, self.FLAGS.hidden_size * 2, self.keep_prob, input_size=self.FLAGS.hidden_size * 2) M = SelfAttn_object.build_graph( b, self.context_mask, cell_type="lstm") #(batch_size, context_len, hidden_size*2) #Make prediction with tf.variable_scope('prediction_layer'): #Encode the self-attended context first with tf.variable_scope("final_lstm_layer"): final_lstm_object = RNNEncoder( self.FLAGS.hidden_size, keep_prob=self.keep_prob, cell_type="lstm", input_size=self.FLAGS.hidden_size * 2) M_prime = final_lstm_object.build_graph( M, self.context_mask, scope="final_lstm") #(batch_size, context_len, h*2) #Get start distribution with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( M_prime, self.context_mask) #both are (batch_size, context_len) with tf.variable_scope("EndDist"): logit_start_expand = tf.expand_dims( self.logits_start, axis=2) #(batch_size, context_len, 1) blended_end_rnn_input = tf.concat( [logit_start_expand, M_prime], axis=2) #(batch_size, context_len, hidden_size*2) end_dist_rnn = RNNEncoder(self.FLAGS.hidden_size, keep_prob=self.keep_prob, direction="unidirectional") end_rnn_output = end_dist_rnn.build_graph( blended_end_rnn_input, self.context_mask, scope="end_dist_rnn") # Get the end dist softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( end_rnn_output, 
self.context_mask)
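# The "shared weights (same scope)" comment above relies on the encoder reusing its
# variables when build_graph is called a second time with the same scope name. A
# minimal sketch of that pattern is shown below; it assumes a hypothetical pair of
# LSTM cells, and the project's RNNEncoder may implement reuse differently.
def shared_bilstm_sketch(inputs, masks, hidden_size, scope):
    seq_len = tf.reduce_sum(masks, axis=1)               # lengths from the 1/0 mask
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):  # a second call reuses the same weights
        cell_fw = tf.nn.rnn_cell.LSTMCell(hidden_size)
        cell_bw = tf.nn.rnn_cell.LSTMCell(hidden_size)
        (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, inputs, sequence_length=seq_len, dtype=tf.float32)
    return tf.concat([fw_out, bw_out], 2)                # (batch_size, seq_len, hidden_size*2)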
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. with vs.variable_scope("encoder_layer1", reuse=tf.AUTO_REUSE): if self.FLAGS.use_stacked_encoder: encoder = StackedRNNEncoder(self.FLAGS.hidden_size, self.FLAGS.num_encoding_layers, self.keep_prob) else: encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) if self.FLAGS.num_encoding_layers > 1: with vs.variable_scope("encoder_layer2", reuse=tf.AUTO_REUSE): encoder2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder2.build_graph(context_hiddens, self.context_mask) question_hiddens = encoder2.build_graph(question_hiddens, self.qn_mask) # Use context hidden states to attend to question hidden states if self.FLAGS.bidaf: attn_layer = BiDirAttnFlow(self.keep_prob, self.FLAGS.hidden_size*2) blended_reps = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*8) else: attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) # Self-attention layer if self.FLAGS.self_attend: self_attn_layer = SelfAttn(self.keep_prob, blended_reps.shape[-1], self.FLAGS.self_attend_hidden_sz) blended_reps = self_attn_layer.build_graph(blended_reps, self.context_mask) # (batch_size, context_len, 2*self_attend_hidden_sz) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # TODO: Modeling layer from BiDAF. We can add another RNN (two stacked # from BiDAF paper) to the hidden states from the attention layer. 
if self.FLAGS.modeling_layer: with vs.variable_scope("Model_Layer", reuse=tf.AUTO_REUSE): model_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) blended_reps_final = model_layer.build_graph(blended_reps_final, self.context_mask) if self.FLAGS.modeling_layer and self.FLAGS.num_model_rnn_layers > 1: with vs.variable_scope("Model_layer2", reuse=tf.AUTO_REUSE): model_layer2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) blended_reps_final = model_layer2.build_graph(blended_reps_final, self.context_mask) # modeling_layer = StackedRNNEncoder(blended_reps_final.shape[-1], self.FLAGS.num_model_rnn_layers, self.keep_prob) # blended_reps_final = modeling_layer.build_graph(blended_reps_final, self.context_mask) if self.FLAGS.pointer_network: #TODO: define flag with vs.variable_scope("OutputLayer", reuse=tf.AUTO_REUSE): pointer_network = PointerNetwork(self.keep_prob, blended_reps_final.shape[-1].value, self.FLAGS.hidden_size) (self.logits_start, self.probdist_start, _, self.logits_end, self.probdist_end, _) = \ pointer_network.build_graph(blended_reps_final, self.context_mask) else: # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states if self.FLAGS.model == 'bidaf': bidaf_layer = BiDAF(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) g_m, g_m2 = bidaf_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*10) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_start, self.probdist_start = softmax_layer_start.build_graph( g_m, self.context_mask) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_end, self.probdist_end = softmax_layer_end.build_graph( g_m2, self.context_mask) elif self.FLAGS.model == 'bicoattn': bicoattn_layer = BiCoattn(self.FLAGS.batch_size, self.FLAGS.context_len, self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) g_m, g_m2, self.attn_output = bicoattn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*14) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_start, self.probdist_start = softmax_layer_start.build_graph( g_m, self.context_mask) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_end, self.probdist_end = softmax_layer_end.build_graph( g_m2, self.context_mask) elif self.FLAGS.model == 'transformernetwork': transformernetwork_layer = TransformerNetwork( self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2, 3, 8) g_m, g_m2 = transformernetwork_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask, self.FLAGS.is_training ) # (batch_size, context_len, hidden_size*2) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_start, self.probdist_start = softmax_layer_start.build_graph( g_m, self.context_mask) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_end, self.probdist_end = softmax_layer_end.build_graph( g_m2, self.context_mask) elif self.FLAGS.model == 'bctn': bctn_layer = BCTN(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size 
* 2, 2, 8) g_m, g_m2 = bctn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask, self.FLAGS.is_training ) # (batch_size, context_len, hidden_size*2) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_start, self.probdist_start = softmax_layer_start.build_graph( g_m, self.context_mask) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_end, self.probdist_end = softmax_layer_end.build_graph( g_m2, self.context_mask) elif self.FLAGS.model == 'rnet': with vs.variable_scope("Contextual"): encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) print "GatedAttn" with vs.variable_scope("GatedAttn"): attn_layer_gated = GatedAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size) context_hiddens_gated, self.a_t = attn_layer_gated.build_graph( question_hiddens, self.qn_mask, context_hiddens) # (batch_size, context_len, hidden_size) print "SelfAttn" with vs.variable_scope("SelfAttn"): attn_layer_self = SelfAttn(self.keep_prob, self.FLAGS.hidden_size, self.FLAGS.hidden_size) attn_output_self, self.attn_output = attn_layer_self.build_graph( context_hiddens_gated, self.context_mask ) # (batch_size, context_len, hidden_size * 2) print "Output" with vs.variable_scope("Output"): output_layer = Output_Rnet(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size) self.logits_start, self.probdist_start, self.logits_end, self.probdist_end, self.a = output_layer.build_graph( attn_output_self, question_hiddens, self.context_mask, self.qn_mask) elif self.FLAGS.model == 'basicattnplusone': attn_layer = BasicAttnPlusOne(self.FLAGS.batch_size, self.FLAGS.context_len, self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _, attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask ) # attn_output is shape (batch_size, context_len, hidden_size*4) blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) blended_reps_final = tf.contrib.layers.fully_connected( blended_reps, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask) elif self.FLAGS.model == 'basicattnplustwo': attn_layer = BasicAttnPlusTwo(self.FLAGS.batch_size, self.FLAGS.context_len, self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) g_m, g_m2 = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask ) # attn_output is shape (batch_size, context_len, hidden_size*4) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_start, self.probdist_start = softmax_layer_start.build_graph( g_m, self.context_mask) with vs.variable_scope("EndDist"): 
softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_end, self.probdist_end = softmax_layer_end.build_graph( g_m2, self.context_mask) else: attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _, attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens ) # attn_output is shape (batch_size, context_len, hidden_size*4) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected( blended_reps, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout) self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask)
def build_graph(self):
    self._define_embedding()

    self.inputs = tf.placeholder(tf.int32, [None, self.max_length])
    if self.char_embed:
        self.char_inputs = tf.placeholder(tf.int32, [None, self.max_length, self.word_length])
    self.ex_lengths = tf.placeholder(tf.int32, [None])

    # Outputs as usual:
    self.outputs = tf.placeholder(tf.float32, shape=[None, self.output_dim])

    # This converts the inputs to a list of lists of dense vector
    # representations:
    self.feats = tf.nn.embedding_lookup(self.embedding, self.inputs)

    if self.char_embed:
        # shape (?, max_len, word_len, char_embed_dim)
        self.char_feats = tf.nn.embedding_lookup(self.char_embedding, self.char_inputs)
        # shape (? * max_len, word_len, char_embed_dim)
        self.char_feats = tf.reshape(self.char_feats, (-1, self.word_length, self.char_embed_dim))
        self.char_feats = tf.layers.conv1d(self.char_feats, self.num_char_filters, self.char_kernel_size, padding="same")
        # shape (?, max_len, word_len, num_char_filters)
        self.char_feats = tf.reshape(self.char_feats, (-1, self.max_length, self.word_length, self.num_char_filters))
        # shape (?, max_len, num_char_filters)
        self.char_feats = tf.reduce_max(self.char_feats, axis=2)
        # Concatenate char embeddings with word embeddings to get the final representation
        self.feats = tf.concat([self.feats, self.char_feats], 2)

    # Defines the RNN structure:
    self.cell = self.cell_class(self.hidden_dim, activation=self.hidden_activation)
    self.cell = DropoutWrapper(self.cell, self.dropout)

    # If a bidirectional RNN is used, define a second RNN cell.
    # Alternatively we could share cells for fw/bw, but not for now.
    if self.bidir_rnn:
        self.bw_cell = self.cell_class(self.hidden_dim, activation=self.hidden_activation)
        self.bw_cell = DropoutWrapper(self.bw_cell, self.dropout)
    if self.stacked and self.bidir_rnn:
        self.cell2 = self.cell_class(self.hidden_dim, activation=self.hidden_activation)
        self.cell2 = DropoutWrapper(self.cell2, self.dropout)
        self.bw_cell2 = self.cell_class(self.hidden_dim, activation=self.hidden_activation)
        self.bw_cell2 = DropoutWrapper(self.bw_cell2, self.dropout)

    # Run the RNN:
    if self.bidir_rnn:
        with tf.variable_scope("lstm1", reuse=tf.AUTO_REUSE):
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
                self.cell, self.bw_cell, self.feats,
                dtype=tf.float32, sequence_length=self.ex_lengths)
            out = tf.concat(outputs, 1)
        if self.stacked:
            with tf.variable_scope("lstm2", reuse=tf.AUTO_REUSE):
                outputs2, output_states2 = tf.nn.bidirectional_dynamic_rnn(
                    self.cell2, self.bw_cell2, out,
                    dtype=tf.float32, sequence_length=self.ex_lengths)
        # Let the last state be the concatenation of the fw and bw
        # final ``outputs''. Note that output_states[0] is the FW
        # (c, h) pair and output_states[1] is the BW (c, h) pair, where
        # c is the cell state and h is the hidden state (the output).
        if self.stacked:
            self.last = tf.concat([output_states2[0][1], output_states2[1][1]], 1)
        else:
            if self.cell_class == tf.nn.rnn_cell.LSTMCell:
                self.last = tf.concat([output_states[0][1], output_states[1][1]], 1)
            elif self.cell_class == tf.nn.rnn_cell.GRUCell:
                self.last = tf.concat(output_states, 1)
    else:
        outputs, state = tf.nn.dynamic_rnn(
            self.cell, self.feats,
            dtype=tf.float32, sequence_length=self.ex_lengths)

    # Attention layer
    if self.self_attend:
        out = tf.concat(outputs, 1)
        print(out)
        self_attn_layer = SelfAttn(self.dropout, out.shape[-1], self.attention_dim)
        outputs = self_attn_layer.build_graph(out, self.ex_lengths)

    # How can I be sure that I have found the last true state? This
    # first option seems to work for all cell types but sometimes
    # leads to indexing errors and is in general pretty complex:
    #
    # self.last = self._get_last_non_masked(outputs, self.ex_lengths)
    #
    # This option is more reliable, but is it definitely getting
    # the final true state?
    #
    # Note that we set self.last above for the bidirectional RNN case.
    if not self.bidir_rnn:
        self.last = self._get_final_state(self.cell, state)

    # Softmax classifier on the final hidden state:
    if self.bidir_rnn:
        self.W_hy = self.weight_init(2 * self.hidden_dim, self.output_dim, 'W_hy')
    else:
        self.W_hy = self.weight_init(self.hidden_dim, self.output_dim, 'W_hy')
    self.b_y = self.bias_init(self.output_dim, 'b_y')
    self.model = tf.matmul(self.last, self.W_hy) + self.b_y
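# The comment above asks how to recover the last non-padded output directly from
# `outputs`. One common way (a sketch, not necessarily what _get_last_non_masked
# does) is to gather the output at index length-1 for each example:
def last_relevant_output_sketch(outputs, lengths):
    """outputs: (batch_size, max_len, dim); lengths: (batch_size,) int32 true lengths."""
    batch_size = tf.shape(outputs)[0]
    indices = tf.stack([tf.range(batch_size), lengths - 1], axis=1)  # (batch_size, 2)
    return tf.gather_nd(outputs, indices)                            # (batch_size, dim)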
def build_graph():

    def bilm_build_graph(options_file, weight_file):
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file, weight_file)

        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(context_elmo)
        question_embeddings_op = bilm(question_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers).
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)['weighted_op']
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_input = weight_layers('input', question_embeddings_op, l2_coef=0.0)['weighted_op']

        """
        elmo_context_output = weight_layers(
            'output', context_embeddings_op, l2_coef=0.0)['weighted_op']
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_output = weight_layers(
                'output', question_embeddings_op, l2_coef=0.0)
        """

        return elmo_context_input, elmo_question_input

    def add_embedding_layer(emb_matrix):
        with tf.variable_scope("embeddings"):
            # Set to constant so it is untrainable
            embedding_matrix = tf.constant(emb_matrix, dtype=tf.float32, name="emb_matrix")  # shape (400002, embedding_size)

            # Get the word embeddings for the context and question
            context_embs = tf.nn.embedding_lookup(embedding_matrix, context_ids)
            qn_embs = tf.nn.embedding_lookup(embedding_matrix, qn_ids)
        return context_embs, qn_embs

    # Start the main graph
    context_embs, qn_embs = add_embedding_layer(emb_matrix)
    elmo_context_input, elmo_question_input = bilm_build_graph(options_file, weight_file)

    context_embs_concat = tf.concat([elmo_context_input, context_embs], 2)  # (2, context_len, 1074)
    qn_embs_concat = tf.concat([elmo_question_input, qn_embs], 2)  # (2, question_len, 1074)

    context_embs_concat.set_shape((None, None, 1074))
    qn_embs_concat.set_shape((None, None, 1074))
    qn_mask.set_shape((None, None))
    context_mask.set_shape((None, None))

    with tf.variable_scope("biLSTM"):
        print("Starting biLSTM...")
        LSTMencoder_context = RNNEncoder(hidden_size, keep_prob=keep_prob, cell_type="lstm", input_size=1074)
        LSTMencoder_question = RNNEncoder(hidden_size, keep_prob=keep_prob, cell_type="lstm", input_size=1074)
        # Shared weights
        context_hiddens = LSTMencoder_context.build_graph(context_embs_concat, context_mask, scope="context_question_encoder", reuse=False)
        question_hiddens = LSTMencoder_question.build_graph(qn_embs_concat, qn_mask, scope="context_question_encoder", reuse=True)

    with tf.variable_scope("bidaf_layer"):
        print("Starting bidaf...")
        bidaf_object = Bidaf(hidden_size * 2, keep_prob)
        b = bidaf_object.build_graph(context_hiddens, question_hiddens, context_mask, qn_mask)

    with tf.variable_scope("self_attn_layer"):
        SelfAttn_object = SelfAttn(hidden_size, hidden_size * 2, keep_prob, input_size=hidden_size * 2)
        M = SelfAttn_object.build_graph(b, context_mask, cell_type="lstm")  # (batch_size, context_len, hidden_size*2)

    with tf.variable_scope("final_lstm_layer"):
        final_lstm_object = RNNEncoder(hidden_size, keep_prob=keep_prob, cell_type="lstm", input_size=hidden_size * 2)
        M_prime = final_lstm_object.build_graph(M, context_mask, scope="final_lstm", reuse=False)

    with tf.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        logits_start, probdist_start = softmax_layer_start.build_graph(M_prime, context_mask)

    with tf.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        logits_end, probdist_end = softmax_layer_end.build_graph(M_prime, context_mask)

    return logits_start, probdist_start, logits_end, probdist_end
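# bilm_build_graph above assumes `context_elmo` and `question_elmo` already exist as
# character-id placeholders. A sketch of how they might be created and fed using the
# AllenNLP bilm-tf helpers is shown below; the vocab file path and the 50-character
# limit are placeholders here, not values taken from this project.
from bilm import Batcher

context_elmo = tf.placeholder('int32', shape=(None, None, 50))   # (batch, num_tokens, max_chars_per_token)
question_elmo = tf.placeholder('int32', shape=(None, None, 50))

batcher = Batcher('vocab_file.txt', 50)  # hypothetical vocab file, 50 chars per token
# Example feed_dict:
# feed = {context_elmo: batcher.batch_sentences(tokenized_contexts),
#         question_elmo: batcher.batch_sentences(tokenized_questions)}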