def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states if self.attention =='Baseline': attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) if self.attention=='BiDAF': attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) _, attn_output_c2q, attn_output_q2c= attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output_c2q is shape (batch_size, context_len, hidden_size*2) attn_output_q2c is shape (batch_size, 1,hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([context_hiddens, attn_output_c2q,context_hiddens*attn_output_c2q,context_hiddens*attn_output_q2c], axis=2) # (batch_size, context_len, hidden_size*8) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into cross entropy function. self.pdist_start, self.pdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Apply EncoderBlock for the stacked embedding encoder layer with tf.variable_scope("StackedEmbeddingEncoder"): emb_encoder = EncoderBlock(self.flags.num_blocks_enc, self.keep_prob, self.flags.kernel_size_enc, self.flags.d_model, self.flags.num_conv_enc, self.flags.num_heads, self.flags.d_ff, l2_lambda=self.flags.l2_lambda) c_enc = emb_encoder.build_graph(self.c_embs, self.c_longest, self.c_mask, reduce_input_dim=True, reuse=None) q_enc = emb_encoder.build_graph(self.q_embs, self.q_longest, self.q_mask, reduce_input_dim=True, reuse=True) # Apply bidirectional attention for the context-query attention layer with tf.variable_scope("ContextQueryAttention"): bidaf = BiDAFAttn(self.keep_prob, l2_lambda=self.flags.l2_lambda) # Shape: [batch_size, context_len, vec_size*8]. attn_outputs = bidaf.build_graph(c_enc, self.c_mask, self.c_longest, q_enc, self.q_mask, self.q_longest) # Apply EncoderBlock x3 for the modeling layer with tf.variable_scope("ModelEncoder"): model_encoder = EncoderBlock(self.flags.num_blocks_mod, self.keep_prob, self.flags.kernel_size_mod, self.flags.d_model, self.flags.num_conv_mod, self.flags.num_heads, self.flags.d_ff, l2_lambda=self.flags.l2_lambda) model_1 = model_encoder.build_graph(attn_outputs, self.c_longest, self.c_mask, reduce_input_dim=True) model_2 = model_encoder.build_graph(model_1, self.c_longest, self.c_mask, reuse=True) model_3 = model_encoder.build_graph(model_2, self.c_longest, self.c_mask, reuse=True) # Use a simple softmax output layer to compute start and end probability distributions with tf.variable_scope("Output"): with tf.variable_scope("StartDistribution"): start_inputs = tf.concat([model_1, model_2], axis=-1) softmax_layer_start = SimpleSoftmaxLayer( l2_lambda=self.flags.l2_lambda) self.logits_start, self.pdist_start = softmax_layer_start.build_graph( start_inputs, self.c_mask) with tf.variable_scope("EndDistribution"): end_inputs = tf.concat([model_1, model_3], axis=-1) softmax_layer_end = SimpleSoftmaxLayer( l2_lambda=self.flags.l2_lambda) self.logits_end, self.pdist_end = softmax_layer_end.build_graph( end_inputs, self.c_mask)
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. if self.FLAGS.cell_type in ['rnn_gru', 'rnn_lstm']: encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, cell_type=self.FLAGS.cell_type) context_hiddens = encoder.build_graph( self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph( self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) elif self.FLAGS.cell_type == 'qanet': encoder = QAEncoder(num_blocks=self.FLAGS.emb_num_blocks, num_layers=self.FLAGS.emb_num_layers, \ num_heads=self.FLAGS.emb_num_heads, \ filters=self.FLAGS.hidden_size, kernel_size=self.FLAGS.emb_kernel_size, \ keep_prob=self.keep_prob, input_mapping=True) context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) if self.FLAGS.attention == 'basic': # Use context hidden states to attend to question hidden states attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _, attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens ) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) elif self.FLAGS.attention == 'bidaf': attn_layer = BiDAFAttn(self.keep_prob) blended_reps = attn_layer.build_graph(context_hiddens, self.context_mask, question_hiddens, self.qn_mask) if self.FLAGS.modeling_layer == 'basic': # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected( blended_reps, num_outputs=self.FLAGS.hidden_size, weights_initializer=initializer_relu() ) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with tf.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, 
self.context_mask) elif self.FLAGS.modeling_layer == 'rnn': encoder_start = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, \ cell_type=self.FLAGS.cell_type, name='m1') m1 = encoder_start.build_graph(blended_reps, self.context_mask) encoder_end = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, \ cell_type=self.FLAGS.cell_type, name='m2') m2 = encoder_end.build_graph(m1, self.context_mask) with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( tf.concat([blended_reps, m1], -1), self.context_mask) with tf.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( tf.concat([blended_reps, m2], -1), self.context_mask) elif self.FLAGS.modeling_layer == 'qanet': modeling_encoder = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \ num_layers=self.FLAGS.model_num_layers, \ num_heads=self.FLAGS.model_num_heads, \ filters=self.FLAGS.hidden_size, \ kernel_size=self.FLAGS.model_kernel_size, \ keep_prob=self.keep_prob, input_mapping=False, \ name='modeling_encoder') m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size, \ kernel_size=1, padding='SAME', name='attn_mapping') m1 = modeling_encoder.build_graph(m0, self.context_mask) m2 = modeling_encoder.build_graph(m1, self.context_mask) m3 = modeling_encoder.build_graph(m2, self.context_mask) with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( tf.concat([m1, m2], -1), self.context_mask) with tf.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( tf.concat([m1, m3], -1), self.context_mask) elif self.FLAGS.modeling_layer == 'qanet2': modeling_encoder1 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \ num_layers=self.FLAGS.model_num_layers, \ num_heads=self.FLAGS.model_num_heads, \ filters=self.FLAGS.hidden_size, \ kernel_size=self.FLAGS.model_kernel_size, \ keep_prob=self.keep_prob, input_mapping=False, \ name='modeling_encoder1') ''' modeling_encoder2 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \ num_layers=self.FLAGS.model_num_layers, \ num_heads=self.FLAGS.model_num_heads, \ filters=self.FLAGS.hidden_size, \ kernel_size=self.FLAGS.model_kernel_size, \ keep_prob=self.keep_prob, input_mapping=False, \ name='modeling_encoder2') ''' m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size, \ kernel_size=1, padding='SAME', name='attn_mapping') m1 = modeling_encoder1.build_graph(m0, self.context_mask) m2 = modeling_encoder1.build_graph(m1, self.context_mask) with tf.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( tf.concat([m0, m1], -1), self.context_mask) with tf.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( tf.concat([m0, m2], -1), self.context_mask)
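# Hedged sketch: the kernel_size=1 conv as a per-position projection. The
# attn_mapping conv1d above uses kernel_size=1, which is just a dense
# projection applied independently at every context position (here from the
# wide attention output down to hidden_size). A NumPy check with toy shapes
# (all sizes below are illustrative, not the model's):
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 5, 8))  # (batch, context_len, in_dim), toy sizes
w = rng.standard_normal((8, 4))     # a kernel_size=1 filter == a dense weight

conv1x1 = np.einsum('bld,dh->blh', x, w)         # what conv1d(kernel_size=1) computes
dense = (x.reshape(-1, 8) @ w).reshape(2, 5, 4)  # the same per-position matmul
assert np.allclose(conv1x1, dense)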
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ with tf.variable_scope('context_conv1') as scope: context_conv1_filter = truncated_normal_var( name='context_conv1_filter', shape=[1, 3, 50, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] context_conv1 = tf.nn.conv2d(self.context_character_embs, context_conv1_filter, strides, padding='SAME') context_conv1_bias = zero_var(name='context_conv1_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) context_conv1_add_bias = tf.nn.bias_add(context_conv1, context_conv1_bias) context_relu_conv1 = tf.nn.relu(context_conv1_add_bias) pool_size = [1, 1, 2, 1] context_pool1 = tf.nn.max_pool(context_relu_conv1, ksize=pool_size, strides=pool_size, padding='SAME', name='context_pool_layer1') with tf.variable_scope('context_conv2') as scope: context_conv2_filter = truncated_normal_var( name='context_conv2_filter', shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] context_conv2 = tf.nn.conv2d(context_pool1, context_conv2_filter, strides, padding='SAME') context_conv2_bias = zero_var(name='context_conv2_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) context_conv2_add_bias = tf.nn.bias_add(context_conv2, context_conv2_bias) context_relu_conv2 = tf.nn.relu(context_conv2_add_bias) pool_size = [1, 1, 3, 1] context_pool2 = tf.nn.max_pool(context_relu_conv2, ksize=pool_size, strides=pool_size, padding='SAME', name='context_pool_layer2') with tf.variable_scope('context_conv3') as scope: context_conv3_filter = truncated_normal_var( name='context_conv3_filter', shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] context_conv3 = tf.nn.conv2d(context_pool2, context_conv3_filter, strides, padding='SAME') context_conv3_bias = zero_var(name='context_conv3_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) context_conv3_add_bias = tf.nn.bias_add(context_conv3, context_conv3_bias) context_relu_conv3 = tf.nn.relu(context_conv3_add_bias) pool_size = [1, 1, 4, 1] context_pool3 = tf.nn.max_pool(context_relu_conv3, ksize=pool_size, strides=pool_size, padding='SAME', name='context_pool_layer3') context_flattened_layer = tf.reshape( context_pool3, [-1, self.FLAGS.context_len, 2 * self.FLAGS.CONV_SHAPE ]) #batch,300,192 context_final = tf.concat([self.context_embs, context_flattened_layer], axis=2) with tf.variable_scope('qn_conv1') as scope: qn_conv1_filter = truncated_normal_var( name='qn_conv1_filter', shape=[1, 3, 50, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] qn_conv1 = tf.nn.conv2d(self.qn_character_embs, qn_conv1_filter, strides, padding='SAME') qn_conv1_bias = zero_var(name='qn_conv1_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) qn_conv1_add_bias = tf.nn.bias_add(qn_conv1, qn_conv1_bias) qn_relu_conv1 = tf.nn.relu(qn_conv1_add_bias) pool_size = [1, 1, 2, 1] qn_pool1 = tf.nn.max_pool(qn_relu_conv1, ksize=pool_size, 
strides=pool_size, padding='SAME', name='qn_pool_layer1') with tf.variable_scope('qn_conv2') as scope: qn_conv2_filter = truncated_normal_var( name='qn_conv2_filter', shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] qn_conv2 = tf.nn.conv2d(qn_pool1, qn_conv2_filter, strides, padding='SAME') qn_conv2_bias = zero_var(name='qn_conv2_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) qn_conv2_add_bias = tf.nn.bias_add(qn_conv2, qn_conv2_bias) qn_relu_conv2 = tf.nn.relu(qn_conv2_add_bias) pool_size = [1, 1, 3, 1] qn_pool2 = tf.nn.max_pool(qn_relu_conv2, ksize=pool_size, strides=pool_size, padding='SAME', name='qn_pool_layer2') with tf.variable_scope('qn_conv3') as scope: qn_conv3_filter = truncated_normal_var( name='qn_conv3_filter', shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE], dtype=tf.float32) strides = [1, 1, 1, 1] qn_conv3 = tf.nn.conv2d(qn_pool2, qn_conv3_filter, strides, padding='SAME') qn_conv3_bias = zero_var(name='qn_conv3_bias', shape=[self.FLAGS.CONV_SHAPE], dtype=tf.float32) qn_conv3_add_bias = tf.nn.bias_add(qn_conv3, qn_conv3_bias) qn_relu_conv3 = tf.nn.relu(qn_conv3_add_bias) pool_size = [1, 1, 3, 1] qn_pool3 = tf.nn.max_pool(qn_relu_conv3, ksize=pool_size, strides=pool_size, padding='SAME', name='qn_pool_layer3') qn_flattened_layer = tf.reshape( qn_pool3, [-1, self.FLAGS.question_len, 2 * self.FLAGS.CONV_SHAPE ]) #batch,30,128 qn_final = tf.concat([self.qn_embs, qn_flattened_layer], axis=2) encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) print("context_final final shape %s" % (context_final.get_shape())) print("context_mask final shape %s" % (self.context_mask.get_shape())) print("qn_final final shape %s" % (qn_final.get_shape())) print("qn_mask final shape %s" % (self.qn_mask.get_shape())) context_hiddens = encoder.build_graph( context_final, self.context_mask) # (batch_size, context_len, hidden_size*2+192) question_hiddens = encoder.build_graph( qn_final, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. 
#encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) #context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Use context hidden states to attend to question hidden states attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) #_, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens,self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) #blended_reps=attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens,self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8) #print("blended_reps shape %s" % (blended_reps.get_shape())) #model_encoder_1 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob,'model_layer_1') #model_layer_1=model_encoder_1.build_graph(blended_reps,self.qn_mask) #model_encoder_2= RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, 'model_layer_2') #model_layer_2=model_encoder_2.build_graph(model_layer_1,self.context_mask) attn_output = attn_layer.build_graph( question_hiddens, self.qn_mask, context_hiddens, self.context_mask ) # attn_output is shape (batch_size, context_len, hidden_size*8) blended_reps = tf.concat( [context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*8) model_encoder1 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, model_name="RNNModelEncoder1") blended_reps_thro_model_layer1 = model_encoder1.build_graph( blended_reps, self.context_mask) # (batch_size, context_len, hidden_size*2) model_encoder2 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, model_name="RNNModelEncoder2") blended_reps_thro_model_layer2 = model_encoder2.build_graph( blended_reps_thro_model_layer1, self.context_mask) # (batch_size, context_len, hidden_size*2) model_encoder3 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, model_name="RNNModelEncoder3") blended_reps_thro_model_layer3 = model_encoder3.build_graph( blended_reps_thro_model_layer2, self.context_mask) # (batch_size, context_len, hidden_size*2) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) blended_reps_final = tf.contrib.layers.fully_connected( blended_reps_thro_model_layer3, num_outputs=self.FLAGS.hidden_size ) # blended_reps_final is shape (batch_size, context_len, hidden_size) #blended_reps_final = tf.contrib.layers.fully_connected(model_layer_1,num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph( blended_reps_final, self.context_mask) # Use softmax layer to compute 
probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph( blended_reps_final, self.context_mask)
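# Hedged sketch: checking the reshape after the char-CNN pools. The
# tf.reshape calls above flatten the pooled character axis into
# 2 * CONV_SHAPE channels, which only works if the SAME-padded max-pools
# (strides 2, 3, 4 for the context; 2, 3, 3 for the question) leave exactly
# two positions on that axis. A standalone check (the word_len values here
# are hypothetical, not taken from the FLAGS):
import math

def pooled_len(word_len, pool_factors):
    # SAME padding with stride f maps a length-n axis to ceil(n / f)
    n = word_len
    for f in pool_factors:
        n = math.ceil(n / f)
    return n

for word_len in (16, 25, 36, 48):
    print(word_len, pooled_len(word_len, (2, 3, 4)), pooled_len(word_len, (2, 3, 3)))
# 25 and 36 leave 2 positions in both branches, so the reshapes line up there;
# 16 collapses to 1, and 48 leaves 3 in the question branch, which would
# break the corresponding reshape.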
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # *********************** # *** Highway network *** # *********************** highway_net = HighWayNetwork() self.context_embs = highway_net.build_graph(self.context_embs) self.qn_embs = highway_net.build_graph(self.qn_embs) # ********************************** # *** Contextual Embedding layer *** # ********************************** # Use a biLSTM to get hidden states for the context and the question # Note: here the biLSTMEncoder is shared (i.e. the weights are the same) between the context and the question. # biLSTM encoding utilizes contextual clues from surrounding words to refine the embeddings. encoder = biLSTMEncoder(self.FLAGS.hidden_size, self.keep_prob) context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # **************************** # *** Attention Flow layer *** # **************************** # Couples query and context vectors and produces a set of query-aware feature vectors for ea. word in the document attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) _, c2q_attn_output, _, q2c_attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # c2q_attn_output is shape (batch_size, context_len, 2h), q2c_attn_output is (batch_size, 1, 2h) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([context_hiddens, c2q_attn_output, tf.multiply(context_hiddens, c2q_attn_output), tf.multiply(context_hiddens, q2c_attn_output)], axis=2, name="blended_reps") # (batch_size, context_len, hidden_size*8) # ********************** # *** Modeling layer *** # ********************** # Scans the context Modeling_layer = Modeling_layer_biLSTM_Encoder(self.FLAGS.hidden_size, self.keep_prob) blended_reps_final = Modeling_layer.build_graph(blended_reps) # ******************** # *** Output layer *** # ******************** # Provide an answer to the query # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(tf.concat([blended_reps, blended_reps_final], 2), self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) encoder_out = Output_layer_biLSTM_Encoder(self.FLAGS.hidden_size, self.keep_prob) blended_reps_final_hiddens = encoder_out.build_graph(blended_reps_final) # (batch_size, context_len, hidden_size*2) with vs.variable_scope("EndDist"): 
softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(tf.concat([blended_reps, blended_reps_final_hiddens], 2), self.context_mask)
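# Hedged sketch: consuming the logits in the loss. The docstring's note that
# pad logits are -large matters because the logits are fed straight into a
# span cross-entropy. A sketch of how a companion add_loss method might
# consume them (self.ans_span holding [start, end] indices is an assumption;
# it is not shown in this section):
def add_loss(self):
    with vs.variable_scope("loss"):
        loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits_start, labels=self.ans_span[:, 0])
        loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits_end, labels=self.ans_span[:, 1])
        # Pad positions carry ~0 probability mass, so they cannot be predicted
        self.loss = tf.reduce_mean(loss_start) + tf.reduce_mean(loss_end)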
def build_graph(self, multi_lstm=False, bidaf=False):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function)
        for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we
        feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len).
        Each row sums to 1. These are the result of taking (masked) softmax of
        logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question.
    # Note: here the encoder is shared (i.e. the weights are the same)
    # between the context and the question.
    if multi_lstm is False:
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    else:
        encoder = MultiLSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    if bidaf is False:
        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # (batch_size, context_len, hidden_size*2)
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)
    else:
        attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        c2q_attn, q2c_attn = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
        # Broadcast the (batch_size, 1, 2h) q2c vector across the context axis
        q2c_attn = q2c_attn + tf.zeros(shape=[1, c2q_attn.shape[1], c2q_attn.shape[2]])
        context_c2q = tf.multiply(context_hiddens, c2q_attn)
        context_q2c = tf.multiply(context_hiddens, q2c_attn)
        blended_reps = tf.concat(
            [context_hiddens, c2q_attn, context_c2q, context_q2c],
            axis=2)  # (batch_size, context_len, hidden_size*8)

    # Apply fully connected layer to each blended representation.
    # Note: blended_reps_final corresponds to b' in the handout.
    # Note: tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default.
    blended_reps_final = tf.contrib.layers.fully_connected(
        blended_reps, num_outputs=self.FLAGS.hidden_size)  # (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location.
    # Note this produces self.logits_start and self.probdist_start, both of shape (batch_size, context_len).
    if self.FLAGS.start_lstm_decode is False:
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)
    else:
        with vs.variable_scope("StartDist"):
            start_decode_layer = StartDecodeLayer(self.FLAGS.hidden_size, self.keep_prob)
            self.logits_start, self.probdist_start = start_decode_layer.build_graph(
                blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location.
    # Note this produces self.logits_end and self.probdist_end, both of shape (batch_size, context_len).
    if self.FLAGS.cond_pred is False:
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
    else:
        # Condition the end prediction on the start prediction by broadcasting
        # the start logits and concatenating them onto the blended representations.
        logits_start_float32 = tf.expand_dims(tf.cast(self.logits_start, dtype=tf.float32), axis=2)
        logits_start_float32 = logits_start_float32 + tf.zeros(
            shape=(1, blended_reps_final.shape[1], blended_reps_final.shape[2]),
            dtype=tf.float32)
        comb_blended_reps = tf.concat([blended_reps_final, logits_start_float32], axis=2)
        with vs.variable_scope("EndDist"):
            conditional_output_layer = ConditionalOutputLayer(self.FLAGS.hidden_size, self.keep_prob)
            self.logits_end, self.probdist_end = conditional_output_layer.build_graph(
                comb_blended_reps, self.context_mask)