def context_layer(in_question_repres, in_passage_repres, question_lengths, passage_lengths,
                  question_mask, passage_mask, input_dim, is_training, options=None):
    with tf.variable_scope('context_layer'):
        for i in range(options.context_layer_num):  # support multiple context layers
            with tf.variable_scope('layer-{}'.format(i)):
                # contextual lstm for both passage and question
                in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
                in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
                (question_context_representation_fw, question_context_representation_bw,
                 in_question_repres) = layer_utils.my_lstm_layer(
                    in_question_repres, options.context_lstm_dim, input_lengths=question_lengths,
                    scope_name="context_represent", reuse=False, is_training=is_training,
                    dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
                (passage_context_representation_fw, passage_context_representation_bw,
                 in_passage_repres) = layer_utils.my_lstm_layer(
                    in_passage_repres, options.context_lstm_dim, input_lengths=passage_lengths,
                    scope_name="context_represent", reuse=True, is_training=is_training,
                    dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
                question_context_representation = tf.concat(
                    axis=2, values=[question_context_representation_fw, question_context_representation_bw])
                passage_context_representation = tf.concat(
                    axis=2, values=[passage_context_representation_fw, passage_context_representation_bw])
    return (question_context_representation, passage_context_representation)
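
# layer_utils.my_lstm_layer is called throughout this section but its definition is not shown here. As a
# rough guide to the interface that the calls above imply (a bi-directional LSTM returning forward outputs,
# backward outputs, and their concatenation), here is a minimal, assumed sketch; the real helper also takes
# is_training, dropout_rate and use_cudnn, which are omitted in this sketch.
import tensorflow as tf

def my_lstm_layer_sketch(input_reps, lstm_dim, input_lengths=None, scope_name=None, reuse=False):
    """Assumed interface: returns (fw_outputs, bw_outputs, concat_outputs)."""
    with tf.variable_scope(scope_name or "lstm", reuse=reuse):
        fw_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim)
        bw_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim)
        (fw_outputs, bw_outputs), _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, input_reps, sequence_length=input_lengths, dtype=tf.float32)
        return fw_outputs, bw_outputs, tf.concat([fw_outputs, bw_outputs], axis=2)
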
def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
    options = self.options
    # ======word representation layer======
    in_question_repres = []  # word and char
    in_passage_repres = []  # word and char
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)
        in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words)  # [batch_size, question_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)
        input_shape = tf.shape(self.in_question_words)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    if options.with_char and char_vocab is not None:
        input_shape = tf.shape(self.in_question_chars)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        q_char_len = input_shape[2]
        input_shape = tf.shape(self.in_passage_chars)
        passage_len = input_shape[1]
        p_char_len = input_shape[2]
        char_dim = char_vocab.word_dim
        self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)

        in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars)  # [batch_size, question_len, q_char_len, char_dim]
        in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
        question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
        question_char_mask = tf.sequence_mask(question_char_lengths, q_char_len, dtype=tf.float32)  # [batch_size*question_len, q_char_len]
        in_question_char_repres = tf.multiply(in_question_char_repres, tf.expand_dims(question_char_mask, axis=-1))

        in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars)  # [batch_size, passage_len, p_char_len, char_dim]
        in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
        passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
        passage_char_mask = tf.sequence_mask(passage_char_lengths, p_char_len, dtype=tf.float32)  # [batch_size*passage_len, p_char_len]
        in_passage_char_repres = tf.multiply(in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1))

        (question_char_outputs_fw, question_char_outputs_bw, _) = layer_utils.my_lstm_layer(
            in_question_char_repres, options.char_lstm_dim, input_lengths=question_char_lengths,
            scope_name="char_lstm", reuse=False, is_training=is_training,
            dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
        question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(question_char_outputs_fw, question_char_lengths - 1)
        question_char_outputs_bw = question_char_outputs_bw[:, 0, :]
        question_char_outputs = tf.concat(axis=1, values=[question_char_outputs_fw, question_char_outputs_bw])
        question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, 2 * options.char_lstm_dim])

        (passage_char_outputs_fw, passage_char_outputs_bw, _) = layer_utils.my_lstm_layer(
            in_passage_char_repres, options.char_lstm_dim, input_lengths=passage_char_lengths,
            scope_name="char_lstm", reuse=True, is_training=is_training,
            dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
        passage_char_outputs_fw = layer_utils.collect_final_step_of_lstm(passage_char_outputs_fw, passage_char_lengths - 1)
        passage_char_outputs_bw = passage_char_outputs_bw[:, 0, :]
        passage_char_outputs = tf.concat(axis=1, values=[passage_char_outputs_fw, passage_char_outputs_bw])
        passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, 2 * options.char_lstm_dim])

        in_question_repres.append(question_char_outputs)
        in_passage_repres.append(passage_char_outputs)
        input_dim += 2 * options.char_lstm_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim], concat word and char
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim], concat word and char

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, options.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, options.highway_layer_num)

    # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
    # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

    # ========Bilateral Matching=====
    (match_representation, match_dim) = match_utils.bilateral_match_func(
        in_question_repres, in_passage_repres, self.question_lengths, self.passage_lengths,
        question_mask, mask, input_dim, is_training, options=options)

    # ========Prediction Layer=========
    # match_dim = 4 * self.options.aggregation_lstm_dim
    w_0 = tf.get_variable("w_0", [match_dim, match_dim // 2], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [match_dim // 2], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [match_dim // 2, num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.tanh(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
    self.predictions = tf.argmax(self.prob, 1)

    if not is_training:
        return

    tvars = tf.trainable_variables()
    if self.options.lambda_l2 > 0.0:
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + self.options.lambda_l2 * l2_loss

    if self.options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

    grads = layer_utils.compute_gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
    # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    if self.options.with_moving_average:
        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_ops = [self.train_op, variables_averages_op]
        self.train_op = tf.group(*train_ops)
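
# layer_utils.collect_final_step_of_lstm (used above to pick the last valid forward state of each padded
# sequence) is not defined in this section. A minimal sketch of the gather it presumably performs, assuming
# lstm_outputs has shape [batch_size, time, dim] and lengths - 1 is passed as the per-example index:
import tensorflow as tf

def collect_final_step_of_lstm_sketch(lstm_outputs, last_index):
    """Gather lstm_outputs[b, last_index[b], :] for every batch element b."""
    batch_size = tf.shape(lstm_outputs)[0]
    batch_indices = tf.range(batch_size)                      # [batch_size]
    indices = tf.stack([batch_indices, last_index], axis=1)   # [batch_size, 2]
    return tf.gather_nd(lstm_outputs, indices)                # [batch_size, dim]
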
def bilateral_match_func(self, in_question_repres, in_passage_repres, question_lengths, passage_lengths,
                         question_mask, passage_mask, input_dim):
    question_aware_representations = []
    question_aware_dim = 0
    passage_aware_representations = []
    passage_aware_dim = 0

    # ====word level matching======
    (match_reps, match_dim) = self.match_passage_with_question(
        in_passage_repres, in_question_repres, passage_mask, question_mask, passage_lengths, question_lengths,
        input_dim, scope="word_match_forward", with_full_match=False,
        with_maxpool_match=self.config.with_maxpool_match, with_attentive_match=self.config.with_attentive_match,
        with_max_attentive_match=self.config.with_max_attentive_match, dropout_rate=self.dropout_rate, forward=True)
    question_aware_representations.append(match_reps)
    question_aware_dim += match_dim

    (match_reps, match_dim) = self.match_passage_with_question(
        in_question_repres, in_passage_repres, question_mask, passage_mask, question_lengths, passage_lengths,
        input_dim, scope="word_match_backward", with_full_match=False,
        with_maxpool_match=self.config.with_maxpool_match, with_attentive_match=self.config.with_attentive_match,
        with_max_attentive_match=self.config.with_max_attentive_match, dropout_rate=self.dropout_rate, forward=False)
    passage_aware_representations.append(match_reps)
    passage_aware_dim += match_dim

    with tf.variable_scope('context_MP_matching'):
        for i in range(self.config.context_layer_num):  # support multiple context layers
            with tf.variable_scope('layer-{}'.format(i)):
                # contextual lstm for both passage and question
                in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
                in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
                (question_context_representation_fw, question_context_representation_bw,
                 in_question_repres) = layer_utils.my_lstm_layer(
                    in_question_repres, self.config.context_lstm_dim, input_lengths=question_lengths,
                    scope_name="context_represent", reuse=False, dropout_rate=self.dropout_rate,
                    use_cudnn=self.config.use_cudnn)
                (passage_context_representation_fw, passage_context_representation_bw,
                 in_passage_repres) = layer_utils.my_lstm_layer(
                    in_passage_repres, self.config.context_lstm_dim, input_lengths=passage_lengths,
                    scope_name="context_represent", reuse=True, dropout_rate=self.dropout_rate,
                    use_cudnn=self.config.use_cudnn)

                # Multi-perspective matching
                with tf.variable_scope('left_MP_matching'):
                    (match_reps, match_dim) = self.match_passage_with_question(
                        passage_context_representation_fw, question_context_representation_fw, passage_mask,
                        question_mask, passage_lengths, question_lengths, self.config.context_lstm_dim,
                        scope="forward_match", with_full_match=self.config.with_full_match,
                        with_maxpool_match=self.config.with_maxpool_match,
                        with_attentive_match=self.config.with_attentive_match,
                        with_max_attentive_match=self.config.with_max_attentive_match,
                        dropout_rate=self.dropout_rate, forward=True)
                    question_aware_representations.append(match_reps)
                    question_aware_dim += match_dim
                    (match_reps, match_dim) = self.match_passage_with_question(
                        passage_context_representation_bw, question_context_representation_bw, passage_mask,
                        question_mask, passage_lengths, question_lengths, self.config.context_lstm_dim,
                        scope="backward_match", with_full_match=self.config.with_full_match,
                        with_maxpool_match=self.config.with_maxpool_match,
                        with_attentive_match=self.config.with_attentive_match,
                        with_max_attentive_match=self.config.with_max_attentive_match,
                        dropout_rate=self.dropout_rate, forward=False)
                    question_aware_representations.append(match_reps)
                    question_aware_dim += match_dim

                with tf.variable_scope('right_MP_matching'):
                    (match_reps, match_dim) = self.match_passage_with_question(
                        question_context_representation_fw, passage_context_representation_fw, question_mask,
                        passage_mask, question_lengths, passage_lengths, self.config.context_lstm_dim,
                        scope="forward_match", with_full_match=self.config.with_full_match,
                        with_maxpool_match=self.config.with_maxpool_match,
                        with_attentive_match=self.config.with_attentive_match,
                        with_max_attentive_match=self.config.with_max_attentive_match,
                        dropout_rate=self.dropout_rate, forward=True)
                    passage_aware_representations.append(match_reps)
                    passage_aware_dim += match_dim
                    (match_reps, match_dim) = self.match_passage_with_question(
                        question_context_representation_bw, passage_context_representation_bw, question_mask,
                        passage_mask, question_lengths, passage_lengths, self.config.context_lstm_dim,
                        scope="backward_match", with_full_match=self.config.with_full_match,
                        with_maxpool_match=self.config.with_maxpool_match,
                        with_attentive_match=self.config.with_attentive_match,
                        with_max_attentive_match=self.config.with_max_attentive_match,
                        dropout_rate=self.dropout_rate, forward=False)
                    passage_aware_representations.append(match_reps)
                    passage_aware_dim += match_dim

    question_aware_representations = tf.concat(axis=2, values=question_aware_representations)  # [batch_size, passage_len, question_aware_dim]
    passage_aware_representations = tf.concat(axis=2, values=passage_aware_representations)  # [batch_size, question_len, passage_aware_dim]
    question_aware_representations = tf.nn.dropout(question_aware_representations, (1 - self.dropout_rate))
    passage_aware_representations = tf.nn.dropout(passage_aware_representations, (1 - self.dropout_rate))

    # ======Highway layer======
    if self.config.with_match_highway:
        with tf.variable_scope("left_matching_highway"):
            question_aware_representations = self.multi_highway_layer(
                question_aware_representations, question_aware_dim, self.config.highway_layer_num)
        with tf.variable_scope("right_matching_highway"):
            passage_aware_representations = self.multi_highway_layer(
                passage_aware_representations, passage_aware_dim, self.config.highway_layer_num)

    # ========Aggregation Layer======
    aggregation_representation = []
    aggregation_dim = 0
    qa_aggregation_input = question_aware_representations
    pa_aggregation_input = passage_aware_representations
    with tf.variable_scope('aggregation_layer'):
        for i in range(self.config.aggregation_layer_num):  # support multiple aggregation layers
            qa_aggregation_input = tf.multiply(qa_aggregation_input, tf.expand_dims(passage_mask, axis=-1))
            (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
                qa_aggregation_input, self.config.aggregation_lstm_dim, input_lengths=passage_lengths,
                scope_name='left_layer-{}'.format(i), reuse=False, dropout_rate=self.dropout_rate,
                use_cudnn=self.config.use_cudnn)
            fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, passage_lengths - 1)
            bw_rep = bw_rep[:, 0, :]
            aggregation_representation.append(fw_rep)
            aggregation_representation.append(bw_rep)
            aggregation_dim += 2 * self.config.aggregation_lstm_dim
            qa_aggregation_input = cur_aggregation_representation  # [batch_size, passage_len, 2*aggregation_lstm_dim]

            pa_aggregation_input = tf.multiply(pa_aggregation_input, tf.expand_dims(question_mask, axis=-1))
            (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
                pa_aggregation_input, self.config.aggregation_lstm_dim, input_lengths=question_lengths,
                scope_name='right_layer-{}'.format(i), reuse=False, dropout_rate=self.dropout_rate,
                use_cudnn=self.config.use_cudnn)
            fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, question_lengths - 1)
            bw_rep = bw_rep[:, 0, :]
            aggregation_representation.append(fw_rep)
            aggregation_representation.append(bw_rep)
            aggregation_dim += 2 * self.config.aggregation_lstm_dim
            pa_aggregation_input = cur_aggregation_representation  # [batch_size, question_len, 2*aggregation_lstm_dim]

    aggregation_representation = tf.concat(axis=1, values=aggregation_representation)  # [batch_size, 4*aggregation_lstm_dim*aggregation_layer_num]

    # ======Highway layer======
    if self.config.with_aggregation_highway:
        with tf.variable_scope("aggregation_highway"):
            agg_shape = tf.shape(aggregation_representation)
            batch_size = agg_shape[0]
            aggregation_representation = tf.reshape(aggregation_representation, [1, batch_size, aggregation_dim])
            aggregation_representation = self.multi_highway_layer(
                aggregation_representation, aggregation_dim, self.config.highway_layer_num)
            aggregation_representation = tf.reshape(aggregation_representation, [batch_size, aggregation_dim])

    return (aggregation_representation, aggregation_dim)
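
# multi_highway_layer / match_utils.multi_highway_layer is referenced in several of these functions but is
# not shown in this section. A minimal sketch of a standard highway stack (transform branch gated against a
# carry of the input), assuming the input's last dimension is input_dim; the real helper may differ in
# variable naming and activation choices.
import tensorflow as tf

def multi_highway_layer_sketch(inputs, input_dim, num_layers):
    outputs = inputs
    for i in range(num_layers):
        with tf.variable_scope("highway_layer_{}".format(i)):
            transform = tf.layers.dense(outputs, input_dim, activation=tf.nn.relu, name="transform")
            gate = tf.layers.dense(outputs, input_dim, activation=tf.nn.sigmoid, name="gate")
            outputs = gate * transform + (1.0 - gate) * outputs  # gated mix of transform and carry paths
    return outputs
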
def Matching_Model(c_emb, q_emb, passage_lengths, question_lengths, c_mask, q_mask, is_training, dropout_rate, options):
    with tf.variable_scope("Embedding_Encoder_Layer"):
        q_emb = tf.multiply(q_emb, tf.expand_dims(q_mask, axis=-1))
        c_emb = tf.multiply(c_emb, tf.expand_dims(c_mask, axis=-1))
        (q_fw, q_bw, q) = layer_utils.my_lstm_layer(
            q_emb, options.context_lstm_dim, input_lengths=question_lengths, scope_name="context_represent",
            reuse=False, is_training=is_training, dropout_rate=dropout_rate, use_cudnn=options.use_cudnn)
        (c_fw, c_bw, c) = layer_utils.my_lstm_layer(
            c_emb, options.context_lstm_dim, input_lengths=passage_lengths, scope_name="context_represent",
            reuse=True, is_training=is_training, dropout_rate=dropout_rate, use_cudnn=options.use_cudnn)
        q = tf.multiply(q, tf.expand_dims(q_mask, axis=-1))
        c = tf.multiply(c, tf.expand_dims(c_mask, axis=-1))

    with tf.variable_scope("Co-attention_Layer"):
        c2q, q2c = dot_attention(q, c, q_mask, c_mask)

    with tf.variable_scope("Model_Encoder_Layer"):
        passage_inputs = tf.concat([c2q, c, c2q * c, c - c2q], axis=2)
        question_inputs = tf.concat([q2c, q, q2c * q, q - q2c], axis=2)
        passage_inputs = tf.layers.dense(inputs=passage_inputs, units=2 * options.context_lstm_dim,
                                         activation=tf.nn.relu, use_bias=True, name='pro', reuse=False)
        question_inputs = tf.layers.dense(inputs=question_inputs, units=2 * options.context_lstm_dim,
                                          activation=tf.nn.relu, use_bias=True, name='pro', reuse=True)
        question_inputs = tf.multiply(question_inputs, tf.expand_dims(q_mask, axis=-1))
        passage_inputs = tf.multiply(passage_inputs, tf.expand_dims(c_mask, axis=-1))
        (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
            question_inputs, options.aggregation_lstm_dim, input_lengths=question_lengths,
            scope_name='aggregate_layer', reuse=False, is_training=is_training,
            dropout_rate=dropout_rate, use_cudnn=options.use_cudnn)
        question_inputs = cur_aggregation_representation
        (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
            passage_inputs, options.aggregation_lstm_dim, input_lengths=passage_lengths,
            scope_name='aggregate_layer', reuse=True, is_training=is_training,
            dropout_rate=dropout_rate, use_cudnn=options.use_cudnn)
        passage_inputs = cur_aggregation_representation  # [batch_size, passage_len, 2*aggregation_lstm_dim]
        # if is_training:
        #     question_inputs = tf.nn.dropout(question_inputs, (1 - options.dropout_rate))
        #     passage_inputs = tf.nn.dropout(passage_inputs, (1 - options.dropout_rate))
        question_inputs = tf.multiply(question_inputs, tf.expand_dims(q_mask, axis=-1))
        passage_inputs = tf.multiply(passage_inputs, tf.expand_dims(c_mask, axis=-1))
        passage_outputs_mean = tf.div(tf.reduce_sum(passage_inputs, 1),
                                      tf.expand_dims(tf.cast(passage_lengths, tf.float32), -1))
        question_outputs_mean = tf.div(tf.reduce_sum(question_inputs, 1),
                                       tf.expand_dims(tf.cast(question_lengths, tf.float32), -1))
        passage_outputs_max = tf.reduce_max(passage_inputs, axis=1)
        question_outputs_max = tf.reduce_max(question_inputs, axis=1)
        input_dim = int(passage_inputs.shape[2])
        question_outputs = tf.concat([question_outputs_max, question_outputs_mean], axis=1)
        passage_outputs = tf.concat([passage_outputs_max, passage_outputs_mean], axis=1)

    match_representation = tf.concat(axis=1, values=[question_outputs, passage_outputs])

    # ========Prediction Layer=========
    if is_training:
        match_representation = tf.nn.dropout(match_representation, (1 - dropout_rate))
    return match_representation
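
# dot_attention(q, c, q_mask, c_mask) above returns (c2q, q2c) but is not defined in this section. Below is
# a minimal sketch of dot-product co-attention consistent with how the outputs are used (c2q aligns the
# question to each passage position, q2c aligns the passage to each question position). Assumptions: q is
# [batch, q_len, dim], c is [batch, c_len, dim], and the masks are float tensors of shape [batch, len].
import tensorflow as tf

def dot_attention_sketch(q, c, q_mask, c_mask):
    sim = tf.einsum("bcd,bqd->bcq", c, q)                         # [batch, c_len, q_len] similarity scores
    sim = sim + (1.0 - tf.expand_dims(q_mask, 1)) * (-1e30)        # suppress padded question positions
    sim = sim + (1.0 - tf.expand_dims(c_mask, 2)) * (-1e30)        # suppress padded passage positions
    c2q = tf.matmul(tf.nn.softmax(sim, axis=2), q)                 # [batch, c_len, dim] question summary per passage token
    q2c = tf.matmul(tf.nn.softmax(sim, axis=1), c, transpose_a=True)  # [batch, q_len, dim] passage summary per question token
    return c2q, q2c
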
def MCAN_match_func(in_question_repres, in_passage_repres, question_lengths, passage_lengths, question_mask,
                    passage_mask, input_dim, is_training, scope="default", options=None):
    question_reps = in_question_repres
    passage_reps = in_passage_repres
    total_match_dim = 0
    final_question_repres = question_reps
    final_passage_repres = passage_reps

    # ====word level matching======
    (match_reps, match_dim) = match_passage_with_question(
        in_passage_repres, in_question_repres, passage_mask, question_mask, passage_lengths, question_lengths,
        input_dim, scope="word_match_forward", with_full_match=False,
        with_maxpool_match=options.with_maxpool_match, with_attentive_match=options.with_attentive_match,
        with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
        dropout_rate=options.dropout_rate, forward=True)
    final_passage_repres = tf.concat([final_passage_repres, match_reps], axis=-1)
    total_match_dim += match_dim

    (match_reps, match_dim) = match_passage_with_question(
        in_question_repres, in_passage_repres, question_mask, passage_mask, question_lengths, passage_lengths,
        input_dim, scope="word_match_backward", with_full_match=False,
        with_maxpool_match=options.with_maxpool_match, with_attentive_match=options.with_attentive_match,
        with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
        dropout_rate=options.dropout_rate, forward=False)
    final_question_repres = tf.concat([final_question_repres, match_reps], axis=-1)

    # self-attention (disabled)
    # relevancy_matrix3 = cal_relevancy_matrix(question_reps, question_reps)
    # relevancy_matrix3 = mask_relevancy_matrix(relevancy_matrix3, question_mask, question_mask)
    # relevancy_matrix3 = tf.nn.softmax(relevancy_matrix3, axis=-1)
    # relevancy_matrix3 = mask_relevancy_matrix(relevancy_matrix3, question_mask, question_mask)
    # attended_question = tf.matmul(relevancy_matrix3, question_reps)
    # final_question_repres = tf.concat([final_question_repres, tf.layers.dense(attended_question, units=5)], axis=-1)
    #
    # relevancy_matrix4 = cal_relevancy_matrix(passage_reps, passage_reps)
    # relevancy_matrix4 = mask_relevancy_matrix(relevancy_matrix4, passage_mask, passage_mask)
    # relevancy_matrix4 = tf.nn.softmax(relevancy_matrix4, axis=-1)
    # relevancy_matrix4 = mask_relevancy_matrix(relevancy_matrix4, passage_mask, passage_mask)
    # attended_passage = tf.matmul(relevancy_matrix4, passage_reps)
    # final_passage_repres = tf.concat([final_passage_repres, tf.layers.dense(attended_passage, units=5)], axis=-1)

    # LSTM-matching
    in_question_repres_masked = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
    in_passage_repres_masked = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
    (question_context_representation_fw, question_context_representation_bw,
     in_question_repres_masked) = layer_utils.my_lstm_layer(
        in_question_repres_masked, options.context_lstm_dim, input_lengths=question_lengths,
        scope_name="context_represent", reuse=False, is_training=is_training,
        dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
    (passage_context_representation_fw, passage_context_representation_bw,
     in_passage_repres_masked) = layer_utils.my_lstm_layer(
        in_passage_repres_masked, options.context_lstm_dim, input_lengths=passage_lengths,
        scope_name="context_represent", reuse=True, is_training=is_training,
        dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)

    # Multi-perspective matching
    with tf.variable_scope('left_MP_matching'):
        (match_reps, match_dim) = match_passage_with_question(
            passage_context_representation_fw, question_context_representation_fw, passage_mask, question_mask,
            passage_lengths, question_lengths, options.context_lstm_dim, scope="forward_match",
            with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match,
            with_attentive_match=options.with_attentive_match,
            with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
            dropout_rate=options.dropout_rate, forward=True)
        final_passage_repres = tf.concat([final_passage_repres, match_reps], axis=-1)
        total_match_dim += match_dim
        (match_reps, match_dim) = match_passage_with_question(
            passage_context_representation_bw, question_context_representation_bw, passage_mask, question_mask,
            passage_lengths, question_lengths, options.context_lstm_dim, scope="backward_match",
            with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match,
            with_attentive_match=options.with_attentive_match,
            with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
            dropout_rate=options.dropout_rate, forward=False)
        final_passage_repres = tf.concat([final_passage_repres, match_reps], axis=-1)
        total_match_dim += match_dim

    with tf.variable_scope('right_MP_matching'):
        (match_reps, match_dim) = match_passage_with_question(
            question_context_representation_fw, passage_context_representation_fw, question_mask, passage_mask,
            question_lengths, passage_lengths, options.context_lstm_dim, scope="forward_match",
            with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match,
            with_attentive_match=options.with_attentive_match,
            with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
            dropout_rate=options.dropout_rate, forward=True)
        final_question_repres = tf.concat([final_question_repres, match_reps], axis=-1)
        (match_reps, match_dim) = match_passage_with_question(
            question_context_representation_bw, passage_context_representation_bw, question_mask, passage_mask,
            question_lengths, passage_lengths, options.context_lstm_dim, scope="backward_match",
            with_full_match=options.with_full_match, with_maxpool_match=options.with_maxpool_match,
            with_attentive_match=options.with_attentive_match,
            with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
            dropout_rate=options.dropout_rate, forward=False)
        final_question_repres = tf.concat([final_question_repres, match_reps], axis=-1)

    if is_training:
        final_question_repres = tf.nn.dropout(final_question_repres, (1 - options.dropout_rate))
        final_passage_repres = tf.nn.dropout(final_passage_repres, (1 - options.dropout_rate))
    print(total_match_dim)

    # ======Highway layer====== (disabled)
    # if options.with_match_highway:
    #     with tf.variable_scope("left_matching_highway"):
    #         final_question_repres = multi_highway_layer(final_question_repres, total_match_dim, options.highway_layer_num)
    #     with tf.variable_scope("right_matching_highway"):
    #         final_passage_repres = multi_highway_layer(final_passage_repres, total_match_dim, options.highway_layer_num)

    # final encoder
    qa_aggregation_input = final_passage_repres
    pa_aggregation_input = final_question_repres
    aggregation_representation = []
    aggregation_dim = 0
    with tf.variable_scope('aggregation_layer'):
        for i in range(options.aggregation_layer_num):  # support multiple aggregation layers
            if passage_mask is not None:
                qa_aggregation_input = tf.multiply(qa_aggregation_input, tf.expand_dims(passage_mask, axis=-1))
            (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
                qa_aggregation_input, options.aggregation_lstm_dim, input_lengths=passage_lengths,
                scope_name=scope + '_left_layer-{}'.format(i), reuse=False, is_training=is_training,
                dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, passage_lengths - 1)
            bw_rep = bw_rep[:, 0, :]
            aggregation_representation.append(fw_rep)
            aggregation_representation.append(bw_rep)
            aggregation_dim += 2 * options.aggregation_lstm_dim
            qa_aggregation_input = cur_aggregation_representation  # [batch_size, passage_len, 2*aggregation_lstm_dim]

            if question_mask is not None:
                pa_aggregation_input = tf.multiply(pa_aggregation_input, tf.expand_dims(question_mask, axis=-1))
            (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
                pa_aggregation_input, options.aggregation_lstm_dim, input_lengths=question_lengths,
                scope_name=scope + '_right_layer-{}'.format(i), reuse=False, is_training=is_training,
                dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, question_lengths - 1)
            bw_rep = bw_rep[:, 0, :]
            aggregation_representation.append(fw_rep)
            aggregation_representation.append(bw_rep)
            aggregation_dim += 2 * options.aggregation_lstm_dim
            pa_aggregation_input = cur_aggregation_representation  # [batch_size, question_len, 2*aggregation_lstm_dim]

    aggregation_representation = tf.concat(axis=1, values=aggregation_representation)  # [batch_size, aggregation_dim]

    # ======Highway layer======
    if options.with_aggregation_highway:
        with tf.variable_scope(scope + "_aggregation_highway"):
            agg_shape = tf.shape(aggregation_representation)
            batch_size = agg_shape[0]
            aggregation_representation = tf.reshape(aggregation_representation, [1, batch_size, aggregation_dim])
            aggregation_representation = multi_highway_layer(aggregation_representation, aggregation_dim, options.highway_layer_num)
            aggregation_representation = tf.reshape(aggregation_representation, [batch_size, aggregation_dim])

    return (aggregation_representation, aggregation_dim)
def create_siameseLSTM_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
    options = self.options
    # ======word representation layer======
    in_question_repres = []
    in_passage_repres = []
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.embedding = tf.placeholder(tf.float32, shape=word_vocab.word_vecs.shape)
            self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=self.embedding, dtype=tf.float32)  # tf.constant(word_vocab.word_vecs)
        in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words)  # [batch_size, question_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)
        input_shape = tf.shape(self.in_question_words)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim]
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    passage_mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, options.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, options.highway_layer_num)

    # ======BiLSTM context layer======
    for i in range(options.context_layer_num):  # support multiple context layers
        with tf.variable_scope('bilstm-layer-{}'.format(i)):
            # contextual lstm for both passage and question
            in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
            (question_context_representation_fw, question_context_representation_bw,
             in_question_repres) = layer_utils.my_lstm_layer(
                in_question_repres, options.context_lstm_dim, input_lengths=self.question_lengths,
                scope_name="context_represent", reuse=False, is_training=is_training,
                dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            # Encode the second sentence, using the same LSTM weights.
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
            (passage_context_representation_fw, passage_context_representation_bw,
             in_passage_repres) = layer_utils.my_lstm_layer(
                in_passage_repres, options.context_lstm_dim, input_lengths=self.passage_lengths,
                scope_name="context_represent", reuse=True, is_training=is_training,
                dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)

    if options.lstm_out_type == 'mean':
        question_context_representation_fw = layer_utils.collect_mean_step_of_lstm(question_context_representation_fw)
        question_context_representation_bw = layer_utils.collect_mean_step_of_lstm(question_context_representation_bw)
        passage_context_representation_fw = layer_utils.collect_mean_step_of_lstm(passage_context_representation_fw)
        passage_context_representation_bw = layer_utils.collect_mean_step_of_lstm(passage_context_representation_bw)
    elif options.lstm_out_type == 'end':
        question_context_representation_fw = layer_utils.collect_final_step_of_lstm(
            question_context_representation_fw, self.question_lengths - 1)
        question_context_representation_bw = question_context_representation_bw[:, 0, :]
        passage_context_representation_fw = layer_utils.collect_final_step_of_lstm(
            passage_context_representation_fw, self.passage_lengths - 1)
        passage_context_representation_bw = passage_context_representation_bw[:, 0, :]

    question_context_outputs = tf.concat(axis=1, values=[question_context_representation_fw, question_context_representation_bw])
    passage_context_outputs = tf.concat(axis=1, values=[passage_context_representation_fw, passage_context_representation_bw])

    (match_representation, match_dim) = match_utils.siameseLSTM_match_func(
        question_context_outputs, passage_context_outputs, options.context_lstm_dim)

    # ========Prediction Layer=========
    w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.nn.relu(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    self.predictions = tf.argmax(self.prob, 1)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

    if not is_training:
        return

    tvars = tf.trainable_variables()
    if self.options.lambda_l1 > 0.0:
        l1_loss = tf.add_n([tf.contrib.layers.l1_regularizer(self.options.lambda_l1)(v)
                            for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + l1_loss
    if self.options.lambda_l2 > 0.0:
        # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        l2_loss = tf.add_n([tf.contrib.layers.l2_regularizer(self.options.lambda_l2)(v)
                            for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + l2_loss

    if self.options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

    grads = layer_utils.compute_gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
    # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    if self.options.with_moving_average:
        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_ops = [self.train_op, variables_averages_op]
        self.train_op = tf.group(*train_ops)
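
# match_utils.siameseLSTM_match_func, used above, is not included in this section. One common way such a
# siamese matching function is written is sketched below (concatenating the two sentence vectors with their
# element-wise difference and product); this is an assumption about its behaviour, not the author's actual
# definition.
import tensorflow as tf

def siameseLSTM_match_func_sketch(question_repres, passage_repres, context_lstm_dim):
    """question_repres / passage_repres: [batch_size, 2*context_lstm_dim] sentence encodings."""
    match_representation = tf.concat(
        [question_repres, passage_repres,
         tf.abs(question_repres - passage_repres),
         question_repres * passage_repres], axis=1)
    match_dim = 8 * context_lstm_dim  # four blocks, each of size 2*context_lstm_dim
    return match_representation, match_dim
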
def MCAN_match_func(in_question_repres, in_passage_repres, question_lengths, passage_lengths, question_mask,
                    passage_mask, input_dim, is_training, options=None):
    question_aware_representations = []
    question_aware_dim = 0
    passage_aware_representations = []
    passage_aware_dim = 0

    # ====word level matching======
    # Because with_full_match is always False here, the value of `forward` makes no difference.
    # match_passage_with_question(repres1, repres2, ...) matches each vector of repres1 against the whole of
    # repres2, so the returned match_reps has shape [batch_size, repres1_len, match_dim].

    # passage to question
    (match_reps, match_dim) = match_passage_with_question(
        in_passage_repres, in_question_repres, passage_mask, question_mask, passage_lengths, question_lengths,
        input_dim, scope="word_match_forward", with_full_match=False,
        with_maxpool_match=options.with_maxpool_match, with_attentive_match=options.with_attentive_match,
        with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
        dropout_rate=options.dropout_rate, forward=False)
    question_aware_representations.append(match_reps)
    question_aware_dim += match_dim

    # add passage to passage
    (match_reps, match_dim) = match_passage_with_question(
        in_passage_repres, in_passage_repres, passage_mask, passage_mask, passage_lengths, passage_lengths,
        input_dim, scope="word_match_passage", with_full_match=False,
        with_maxpool_match=options.with_maxpool_match, with_attentive_match=options.with_attentive_match,
        with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
        dropout_rate=options.dropout_rate, forward=False)
    question_aware_representations.append(match_reps)
    question_aware_dim += match_dim

    # question to passage
    (match_reps, match_dim) = match_passage_with_question(
        in_question_repres, in_passage_repres, question_mask, passage_mask, question_lengths, passage_lengths,
        input_dim, scope="word_match_backward", with_full_match=False,
        with_maxpool_match=options.with_maxpool_match, with_attentive_match=options.with_attentive_match,
        with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
        dropout_rate=options.dropout_rate, forward=False)
    passage_aware_representations.append(match_reps)
    passage_aware_dim += match_dim

    # add question to question
    (match_reps, match_dim) = match_passage_with_question(
        in_question_repres, in_question_repres, question_mask, question_mask, question_lengths, question_lengths,
        input_dim, scope="word_match_question", with_full_match=False,
        with_maxpool_match=options.with_maxpool_match, with_attentive_match=options.with_attentive_match,
        with_max_attentive_match=options.with_max_attentive_match, is_training=is_training, options=options,
        dropout_rate=options.dropout_rate, forward=False)
    passage_aware_representations.append(match_reps)
    passage_aware_dim += match_dim

    with tf.variable_scope('context_MP_matching'):
        for i in range(options.context_layer_num):  # support multiple context layers
            with tf.variable_scope('layer-{}'.format(i)):
                # contextual lstm for both passage and question
                in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
                in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
                (question_context_representation_fw, question_context_representation_bw,
                 in_question_repres) = layer_utils.my_lstm_layer(
                    in_question_repres, options.context_lstm_dim, input_lengths=question_lengths,
                    scope_name="context_represent", reuse=False, is_training=is_training,
                    dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
                (passage_context_representation_fw, passage_context_representation_bw,
                 in_passage_repres) = layer_utils.my_lstm_layer(
                    in_passage_repres, options.context_lstm_dim, input_lengths=passage_lengths,
                    scope_name="context_represent", reuse=True, is_training=is_training,
                    dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)

                # Multi-perspective matching
                with tf.variable_scope('left_MP_matching'):
                    (match_reps, match_dim) = match_passage_with_question(
                        passage_context_representation_fw, question_context_representation_fw, passage_mask,
                        question_mask, passage_lengths, question_lengths, options.context_lstm_dim,
                        scope="ques_forward_match", with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True)
                    question_aware_representations.append(match_reps)
                    question_aware_dim += match_dim
                    (match_reps, match_dim) = match_passage_with_question(
                        passage_context_representation_bw, question_context_representation_bw, passage_mask,
                        question_mask, passage_lengths, question_lengths, options.context_lstm_dim,
                        scope="ques_backward_match", with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False)
                    question_aware_representations.append(match_reps)
                    question_aware_dim += match_dim

                    # add passage to passage
                    (match_reps, match_dim) = match_passage_with_question(
                        passage_context_representation_fw, passage_context_representation_fw, passage_mask,
                        passage_mask, passage_lengths, passage_lengths, options.context_lstm_dim,
                        scope="pass_self_forward_match", with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True)
                    question_aware_representations.append(match_reps)
                    question_aware_dim += match_dim
                    (match_reps, match_dim) = match_passage_with_question(
                        passage_context_representation_bw, passage_context_representation_bw, passage_mask,
                        passage_mask, passage_lengths, passage_lengths, options.context_lstm_dim,
                        scope="pass_self_backward_match", with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False)
                    question_aware_representations.append(match_reps)
                    question_aware_dim += match_dim

                with tf.variable_scope('right_MP_matching'):
                    (match_reps, match_dim) = match_passage_with_question(
                        question_context_representation_fw, passage_context_representation_fw, question_mask,
                        passage_mask, question_lengths, passage_lengths, options.context_lstm_dim,
                        scope="pass_forward_match", with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True)
                    passage_aware_representations.append(match_reps)
                    passage_aware_dim += match_dim
                    (match_reps, match_dim) = match_passage_with_question(
                        question_context_representation_bw, passage_context_representation_bw, question_mask,
                        passage_mask, question_lengths, passage_lengths, options.context_lstm_dim,
                        scope="pass_backward_match", with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False)
                    passage_aware_representations.append(match_reps)
                    passage_aware_dim += match_dim

                    # add question to question
                    (match_reps, match_dim) = match_passage_with_question(
                        question_context_representation_fw, question_context_representation_fw, question_mask,
                        question_mask, question_lengths, question_lengths, options.context_lstm_dim,
                        scope="ques_self_forward_match", with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=True)
                    passage_aware_representations.append(match_reps)
                    passage_aware_dim += match_dim
                    (match_reps, match_dim) = match_passage_with_question(
                        question_context_representation_bw, question_context_representation_bw, question_mask,
                        question_mask, question_lengths, question_lengths, options.context_lstm_dim,
                        scope="ques_self_backward_match", with_full_match=options.with_full_match,
                        with_maxpool_match=options.with_maxpool_match,
                        with_attentive_match=options.with_attentive_match,
                        with_max_attentive_match=options.with_max_attentive_match,
                        is_training=is_training, options=options, dropout_rate=options.dropout_rate, forward=False)
                    passage_aware_representations.append(match_reps)
                    passage_aware_dim += match_dim

    question_aware_representations = tf.concat(axis=2, values=question_aware_representations)  # [batch_size, passage_len, question_aware_dim]
    passage_aware_representations = tf.concat(axis=2, values=passage_aware_representations)  # [batch_size, question_len, passage_aware_dim]

    if is_training:
        question_aware_representations = tf.nn.dropout(question_aware_representations, (1 - options.dropout_rate))
        passage_aware_representations = tf.nn.dropout(passage_aware_representations, (1 - options.dropout_rate))

    # ======Highway layer======
    if options.with_match_highway:
        with tf.variable_scope("left_matching_highway"):
            question_aware_representations = multi_highway_layer(
                question_aware_representations, question_aware_dim, options.highway_layer_num)
        with tf.variable_scope("right_matching_highway"):
            passage_aware_representations = multi_highway_layer(
                passage_aware_representations, passage_aware_dim, options.highway_layer_num)

    # ========Aggregation Layer======
    aggregation_representation = []
    aggregation_dim = 0
    qa_aggregation_input = question_aware_representations
    pa_aggregation_input = passage_aware_representations
    with tf.variable_scope('aggregation_layer'):
        for i in range(options.aggregation_layer_num):  # support multiple aggregation layers
            qa_aggregation_input = tf.multiply(qa_aggregation_input, tf.expand_dims(passage_mask, axis=-1))
            (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
                qa_aggregation_input, options.aggregation_lstm_dim, input_lengths=passage_lengths,
                scope_name='left_layer-{}'.format(i), reuse=False, is_training=is_training,
                dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, passage_lengths - 1)
            bw_rep = bw_rep[:, 0, :]
            aggregation_representation.append(fw_rep)
            aggregation_representation.append(bw_rep)
            aggregation_dim += 2 * options.aggregation_lstm_dim
            qa_aggregation_input = cur_aggregation_representation  # [batch_size, passage_len, 2*aggregation_lstm_dim]

            pa_aggregation_input = tf.multiply(pa_aggregation_input, tf.expand_dims(question_mask, axis=-1))
            (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
                pa_aggregation_input, options.aggregation_lstm_dim, input_lengths=question_lengths,
                scope_name='right_layer-{}'.format(i), reuse=False, is_training=is_training,
                dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            fw_rep = layer_utils.collect_final_step_of_lstm(fw_rep, question_lengths - 1)
            bw_rep = bw_rep[:, 0, :]
            aggregation_representation.append(fw_rep)
            aggregation_representation.append(bw_rep)
            aggregation_dim += 2 * options.aggregation_lstm_dim
            pa_aggregation_input = cur_aggregation_representation  # [batch_size, question_len, 2*aggregation_lstm_dim]

    aggregation_representation = tf.concat(axis=1, values=aggregation_representation)  # [batch_size, aggregation_dim]

    # ======Highway layer======
    if options.with_aggregation_highway:
        with tf.variable_scope("aggregation_highway"):
            agg_shape = tf.shape(aggregation_representation)
            batch_size = agg_shape[0]
            aggregation_representation = tf.reshape(aggregation_representation, [1, batch_size, aggregation_dim])
            aggregation_representation = multi_highway_layer(aggregation_representation, aggregation_dim, options.highway_layer_num)
            aggregation_representation = tf.reshape(aggregation_representation, [batch_size, aggregation_dim])

    return (aggregation_representation, aggregation_dim)
def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, lemma_vocab=None, is_training=True, global_step=None): options = self.options # ======word representation layer====== with tf.variable_scope("Input_Embedding_Layer"): if word_vocab is not None: word_vec_trainable = True cur_device = '/gpu:0' if options.fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.device(cur_device): self.word_embedding = tf.get_variable( "word_embedding", trainable=word_vec_trainable, initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) # self.kg_embedding = tf.get_variable("kg", trainable=True, regularizer=regularizer, # initializer=tf.constant(lemma_vocab.word_vecs), dtype=tf.float32) self.kg_embedding = tf.get_variable( "kg", shape=(lemma_vocab.word_vecs.shape[0], options.kg_dim), initializer=initializer, trainable=True, dtype=tf.float32) c_emb = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words) q_emb = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words) c_kg_emb = tf.nn.embedding_lookup(self.kg_embedding, self.in_passage_words_lemma) q_kg_emb = tf.nn.embedding_lookup(self.kg_embedding, self.in_question_words_lemma) if is_training: c_emb = tf.nn.dropout(c_emb, 1 - self.dropout) q_emb = tf.nn.dropout(q_emb, 1 - self.dropout) c_kg_emb = tf.nn.dropout(c_kg_emb, 1 - self.dropout) q_kg_emb = tf.nn.dropout(q_kg_emb, 1 - self.dropout) input_shape = tf.shape(self.in_question_words) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_words) passage_len = input_shape[1] if options.with_char and char_vocab is not None: input_shape = tf.shape(self.in_question_chars) batch_size = input_shape[0] q_char_len = input_shape[2] input_shape = tf.shape(self.in_passage_chars) p_char_len = input_shape[2] char_dim = char_vocab.word_dim self.char_embedding = tf.get_variable( "char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32) in_question_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_question_chars ) # [batch_size, question_len, q_char_len, char_dim] in_question_char_repres = tf.reshape( in_question_char_repres, shape=[-1, q_char_len, char_dim]) question_char_lengths = tf.reshape(self.question_char_lengths, [-1]) quesiton_char_mask = tf.sequence_mask( question_char_lengths, q_char_len, dtype=tf.float32) # [batch_size*question_len, q_char_len] in_question_char_repres = tf.multiply( in_question_char_repres, tf.expand_dims(quesiton_char_mask, axis=-1)) in_passage_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_passage_chars ) # [batch_size, passage_len, p_char_len, char_dim] in_passage_char_repres = tf.reshape( in_passage_char_repres, shape=[-1, p_char_len, char_dim]) passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1]) passage_char_mask = tf.sequence_mask( passage_char_lengths, p_char_len, dtype=tf.float32) # [batch_size*passage_len, p_char_len] in_passage_char_repres = tf.multiply( in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1)) question_char_outputs = conv(in_question_char_repres, self.options.char_lstm_dim, bias=True, activation=tf.nn.tanh, kernel_size=5, name="char_conv", reuse=False) question_char_outputs = tf.reduce_max(question_char_outputs, axis=1) question_char_outputs = tf.reshape( question_char_outputs, [batch_size, question_len, options.char_lstm_dim]) passage_char_outputs = conv(in_passage_char_repres, self.options.char_lstm_dim, bias=True, activation=tf.nn.tanh, kernel_size=5, 
name="char_conv", reuse=True) passage_char_outputs = tf.reduce_max(passage_char_outputs, axis=1) passage_char_outputs = tf.reshape( passage_char_outputs, [batch_size, passage_len, options.char_lstm_dim]) c_emb = tf.concat([c_emb, passage_char_outputs], axis=2) q_emb = tf.concat([q_emb, question_char_outputs], axis=2) c_mask = tf.sequence_mask( self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] q_mask = tf.sequence_mask( self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] with tf.variable_scope("Embedding_Encoder_Layer"): q_emb = tf.multiply(q_emb, tf.expand_dims(q_mask, axis=-1)) c_emb = tf.multiply(c_emb, tf.expand_dims(c_mask, axis=-1)) q_kg_emb = tf.multiply( q_kg_emb, tf.expand_dims(tf.cast(q_mask, tf.float32), axis=-1)) c_kg_emb = tf.multiply( c_kg_emb, tf.expand_dims(tf.cast(c_mask, tf.float32), axis=-1)) (q_fw, q_bw, q) = layer_utils.my_lstm_layer( q_emb, options.context_lstm_dim, input_lengths=self.question_lengths, scope_name="context_represent", reuse=False, is_training=is_training, dropout_rate=self.dropout, use_cudnn=options.use_cudnn) (c_fw, c_bw, c) = layer_utils.my_lstm_layer(c_emb, options.context_lstm_dim, input_lengths=self.passage_lengths, scope_name="context_represent", reuse=True, is_training=is_training, dropout_rate=self.dropout, use_cudnn=options.use_cudnn) q = tf.multiply(q, tf.expand_dims(q_mask, axis=-1)) c = tf.multiply(c, tf.expand_dims(c_mask, axis=-1)) if is_training: q = tf.nn.dropout(q, 1 - self.dropout) c = tf.nn.dropout(c, 1 - self.dropout) with tf.variable_scope('co-att', reuse=tf.AUTO_REUSE): s = tf.einsum("abd,acd->abc", c, q) # cRq, loss = Complex(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim, options.relation_dim, loss_type='factorization') # cRq, loss, r = Analogy(c_kg_emb, q_kg_emb, c_mask, q_mask, options.scalar_dim, # options.kg_dim, options.relation_dim, loss_type='factorization') # cRq, loss = DisMult(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim, options.relation_dim, loss_type='factorization') cRq, r = Rescal(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim, options.relation_dim) # if is_training: v = tf.get_variable("v", [1, 1, 1, options.relation_dim], dtype=tf.float32) score = tf.reduce_sum(cRq * v, axis=-1) s = s + options.lamda1 * score s = mask_relevancy_matrix(s, q_mask, c_mask) s_q = tf.nn.softmax(s, dim=1) self.v = v q2c = tf.einsum("abd,abc->acd", c, s_q) q2c_kg = tf.einsum("abd,abc->acd", c_kg_emb, s_q) q2c_kg_r = tf.einsum("abcr,abc->acr", cRq, s_q) s_c = tf.nn.softmax(s, dim=2) c2q = tf.einsum("abd,acb->acd", q, s_c) c2q_kg = tf.einsum("abd,acb->acd", q_kg_emb, s_c) c2q_kg_r = tf.einsum("abcr,abc->abr", cRq, s_c) with tf.variable_scope("Model_Encoder_Layer"): passage_inputs = tf.concat( [c2q, c, c2q * c, c - c2q, c_kg_emb, c2q_kg, c2q_kg_r], axis=2) question_inputs = tf.concat( [q2c, q, q2c * q, q - q2c, q_kg_emb, q2c_kg, q2c_kg_r], axis=2) passage_inputs = tf.layers.dense(inputs=passage_inputs, units=2 * options.context_lstm_dim, activation=tf.nn.relu, use_bias=True, name='pro', reuse=False) question_inputs = tf.layers.dense(inputs=question_inputs, units=2 * options.context_lstm_dim, activation=tf.nn.relu, use_bias=True, name='pro', reuse=True) question_inputs = tf.multiply(question_inputs, tf.expand_dims(q_mask, axis=-1)) passage_inputs = tf.multiply(passage_inputs, tf.expand_dims(c_mask, axis=-1)) (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer( question_inputs, options.aggregation_lstm_dim, 
input_lengths=self.question_lengths, scope_name='aggregate_layer', reuse=False, is_training=is_training, dropout_rate=self.dropout, use_cudnn=options.use_cudnn) question_inputs = cur_aggregation_representation # question_outputs_vec = tf.concat([fw_rep, bw_rep], axis=1) (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer( passage_inputs, options.aggregation_lstm_dim, input_lengths=self.passage_lengths, scope_name='aggregate_layer', reuse=True, is_training=is_training, dropout_rate=self.dropout, use_cudnn=options.use_cudnn) passage_inputs = cur_aggregation_representation question_inputs = tf.multiply(question_inputs, tf.expand_dims(q_mask, axis=-1)) passage_inputs = tf.multiply(passage_inputs, tf.expand_dims(c_mask, axis=-1)) if is_training: question_inputs = tf.nn.dropout(question_inputs, 1 - self.dropout) passage_inputs = tf.nn.dropout(passage_inputs, 1 - self.dropout) passage_outputs_mean = tf.div( tf.reduce_sum(passage_inputs, 1), tf.expand_dims(tf.cast(self.passage_lengths, tf.float32), -1)) question_outputs_mean = tf.div( tf.reduce_sum(question_inputs, 1), tf.expand_dims(tf.cast(self.question_lengths, tf.float32), -1)) passage_outputs_max = tf.reduce_max(passage_inputs, axis=1) question_outputs_max = tf.reduce_max(question_inputs, axis=1) passage_outputs_att = soft_attention_with_kg(passage_inputs, c_kg_emb, c2q_kg_r, c_mask, options.att_dim, scope="soft_att", reuse=False) question_outputs_att = soft_attention_with_kg(question_inputs, q_kg_emb, q2c_kg_r, q_mask, options.att_dim, scope="soft_att", reuse=True) question_outputs = tf.concat([ question_outputs_max, question_outputs_mean, question_outputs_att ], axis=1) passage_outputs = tf.concat([ passage_outputs_max, passage_outputs_mean, passage_outputs_att ], axis=1) match_representation = tf.concat( axis=1, values=[question_outputs, passage_outputs]) # ========Prediction Layer========= match_dim = int(match_representation.shape[1]) w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32) b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32) w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32) b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32) if is_training: match_representation = tf.nn.dropout(match_representation, (1 - self.dropout)) logits = tf.matmul(match_representation, w_0) + b_0 logits = tf.nn.relu(logits) if is_training: logits = tf.nn.dropout(logits, (1 - self.dropout)) logits = tf.matmul(logits, w_1) + b_1 self.prob = tf.nn.softmax(logits) self.predictions = tf.argmax(self.prob, 1) gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) correct = tf.nn.in_top_k(logits, self.truth, 1) self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) if not is_training: return if options.loss_type == 'logistic': matrix = self.matrix * 2 - 1 matrix = mask_relevancy_4dmatrix(matrix, q_mask, c_mask) score = -1 * tf.log(tf.nn.sigmoid(matrix * cRq)) else: score = self.matrix - cRq score = 0.5 * score * score # 0.5 avoids Python-2 integer division (1 / 2 == 0 would zero out the KGE loss) score = mask_relevancy_4dmatrix(score, q_mask, c_mask) KGE_loss = tf.reduce_sum(score, axis=-1) self.loss = tf.reduce_sum( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix)) self.loss = self.loss + options.lamda2 * tf.reduce_sum( tf.layers.flatten(KGE_loss)) tvars = tf.trainable_variables() if self.options.lambda_l2 > 0.0: l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tvars if 'embedding' not in v.name ]) self.loss = self.loss + self.options.lambda_l2 * l2_loss if
self.options.optimize_type == 'adadelta': optimizer = tf.train.AdadeltaOptimizer( learning_rate=self.options.learning_rate) elif self.options.optimize_type == 'adam': optimizer = tf.train.AdamOptimizer( learning_rate=self.options.learning_rate) elif self.options.optimize_type == 'adagard': optimizer = tf.train.AdagradOptimizer( learning_rate=self.options.learning_rate) grads = layer_utils.compute_gradients(self.loss, tvars) grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper) self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) # self.train_op = optimizer.apply_gradients(zip(grads, tvars)) if self.options.with_moving_average: # Track the moving averages of all trainable variables. MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average. variable_averages = tf.train.ExponentialMovingAverage( MOVING_AVERAGE_DECAY, global_step) variables_averages_op = variable_averages.apply( tf.trainable_variables()) train_ops = [self.train_op, variables_averages_op] self.train_op = tf.group(*train_ops)
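# --- Illustrative sketch (not part of the model graph) -----------------------
# create_model_graph above augments the plain co-attention logits
# s = tf.einsum("abd,acd->abc", c, q) with a knowledge-graph term: the Rescal(...)
# helper (defined elsewhere) is assumed to return bilinear scores
# cRq[b, i, j, r] = c_kg[b, i]^T W_r q_kg[b, j], which are mixed by the learned
# vector v and scaled by options.lamda1. The NumPy sketch below shows that
# combination for a single example; the name rescal_augmented_logits and the
# explicit W tensor are illustrative, not names from this codebase.
import numpy as np

def rescal_augmented_logits(c, q, c_kg, q_kg, W, v, lamda1):
    """c: [P, D], q: [Q, D] contextual vectors; c_kg: [P, Dk], q_kg: [Q, Dk]
    lemma/KG embeddings; W: [R, Dk, Dk] relation matrices; v: [R] relation weights."""
    dot = c @ q.T                                       # plain similarity, [P, Q]
    cRq = np.einsum('pd,rde,qe->pqr', c_kg, W, q_kg)    # RESCAL scores per relation, [P, Q, R]
    kg_score = (cRq * v).sum(axis=-1)                   # relation mixture, [P, Q]
    return dot + lamda1 * kg_score                      # logits fed to the row/column softmaxes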
def MCAN_match_func(in_question_repres, in_passage_repres, question_lengths, passage_lengths, question_mask, passage_mask, input_dim, is_training, scope="default", options=None): question_reps = in_question_repres passage_reps = in_passage_repres relevancy_matrix = cal_relevancy_matrix(question_reps, passage_reps) relevancy_matrix = mask_relevancy_matrix(relevancy_matrix, question_mask, passage_mask) in_passage_repres = tf.concat([ in_passage_repres, tf.reduce_max(relevancy_matrix, axis=2, keep_dims=True) ], axis=-1) in_passage_repres = tf.concat([ in_passage_repres, tf.reduce_mean(relevancy_matrix, axis=2, keep_dims=True) ], axis=-1) qa_aggregation_input = in_passage_repres pa_aggregation_input = in_question_repres aggregation_representation = [] aggregation_dim = 0 with tf.variable_scope('aggregation_layer'): for i in range(options.aggregation_layer_num ): # support multiple aggregation layer if passage_mask != None: qa_aggregation_input = tf.multiply( qa_aggregation_input, tf.expand_dims(passage_mask, axis=-1)) (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer( qa_aggregation_input, options.aggregation_lstm_dim, input_lengths=passage_lengths, scope_name=scope + '_left_layer-{}'.format(i), reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) fw_rep = layer_utils.collect_final_step_of_lstm( fw_rep, passage_lengths - 1) bw_rep = bw_rep[:, 0, :] aggregation_representation.append(fw_rep) aggregation_representation.append(bw_rep) aggregation_dim += 2 * options.aggregation_lstm_dim qa_aggregation_input = cur_aggregation_representation # [batch_size, passage_len, 2*aggregation_lstm_dim] if question_mask != None: pa_aggregation_input = tf.multiply( pa_aggregation_input, tf.expand_dims(question_mask, axis=-1)) (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer( pa_aggregation_input, options.aggregation_lstm_dim, input_lengths=question_lengths, scope_name=scope + '_right_layer-{}'.format(i), reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) fw_rep = layer_utils.collect_final_step_of_lstm( fw_rep, question_lengths - 1) bw_rep = bw_rep[:, 0, :] aggregation_representation.append(fw_rep) aggregation_representation.append(bw_rep) aggregation_dim += 2 * options.aggregation_lstm_dim pa_aggregation_input = cur_aggregation_representation # [batch_size, passage_len, 2*aggregation_lstm_dim] aggregation_representation = tf.concat( axis=1, values=aggregation_representation) # [batch_size, aggregation_dim] # ======Highway layer====== if options.with_aggregation_highway: with tf.variable_scope(scope + "_aggregation_highway"): agg_shape = tf.shape(aggregation_representation) batch_size = agg_shape[0] aggregation_representation = tf.reshape( aggregation_representation, [1, batch_size, aggregation_dim]) aggregation_representation = multi_highway_layer( aggregation_representation, aggregation_dim, options.highway_layer_num) aggregation_representation = tf.reshape( aggregation_representation, [batch_size, aggregation_dim]) return (aggregation_representation, aggregation_dim)
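# --- Illustrative sketch (not part of the model graph) -----------------------
# The aggregation loop in MCAN_match_func summarises each BiLSTM by taking the
# forward output at the last valid timestep (collect_final_step_of_lstm) and the
# backward output at t=0, then concatenating the two. A minimal NumPy version of
# that read-out, with hypothetical arrays standing in for the layer_utils outputs:
import numpy as np

def read_bilstm_summary(fw_outputs, bw_outputs, lengths):
    """fw_outputs, bw_outputs: [B, T, D]; lengths: [B] valid lengths per example."""
    batch_idx = np.arange(fw_outputs.shape[0])
    fw_last = fw_outputs[batch_idx, lengths - 1]   # forward state after the whole sequence
    bw_first = bw_outputs[:, 0]                    # backward state after the whole sequence (run right-to-left)
    return np.concatenate([fw_last, bw_first], axis=1)  # [B, 2D]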
def FusionNet_match_Amit(feature_dim, feature_each_dim, passage, question, passage_length, question_length, passage_mask, question_mask, onehot_binary=None, options=None, scope_name='FusionNet_Amit_match_layer', is_training=True, dropout_rate=0.2, reuse=False): batch_size = tf.shape(passage)[0] passage_len = tf.shape(passage)[1] question_len = tf.shape(question)[1] word_dim, char_dim, POS_dim, NER_dim, cove_dim, lm_dim = feature_each_dim with tf.variable_scope(scope_name, reuse=reuse): # Fully Aware MultiLevel Fusion (FAMF) Word Layer with tf.variable_scope('famf_word_layer'): famf_word_level_dim = word_dim # assuming famf_word_level_dim=dim-of-glove=300 p_wordlevel_input = tf.slice( passage, [0, 0, 0], [batch_size, passage_len, word_dim ]) # only use word embedding for word layer q_wordlevel_input = tf.slice(question, [0, 0, 0], [batch_size, question_len, word_dim]) alphas = layer_utils.calcuate_attention( p_wordlevel_input, q_wordlevel_input, famf_word_level_dim, famf_word_level_dim, scope_name="famf_word_layer_attention", att_type=options.att_type, mask1=passage_mask, mask2=question_mask, att_dim=250, is_training=is_training, dropout_rate=dropout_rate) # (in_value_1, in_value_2, feature_dim1, feature_dim2, scope_name='att', # att_type='symmetric', att_dim=20, remove_diagnoal=False, mask1=None, mask2=None, is_training=False, dropout_rate=0.2, # cosine_attention_scale=200) weighted_by_question_words = tf.matmul( alphas, layer_utils.dropout_layer(q_wordlevel_input, dropout_rate, is_training=is_training)) # Reading layer with tf.variable_scope('reading'): q_rep_reading_input = question # [glove, cove, NER, POS] p_rep_reading_input = tf.concat( axis=2, values=[passage, onehot_binary, weighted_by_question_words ]) # use all embeddings for reading and understanding. 
# [glove, cove, NER, POS, binary,famf_word_attention] with tf.variable_scope('reading_layer_1'): reading_layer_lstm_dim = 125 q_rep_reading_1_output = layer_utils.my_lstm_layer( q_rep_reading_input, reading_layer_lstm_dim, scope_name='bilstm_reading_1_q', reuse=False, is_training=is_training, dropout_rate=options.dropout_rate)[2] # [B, Q, 250 ] p_rep_reading_1_output = layer_utils.my_lstm_layer( p_rep_reading_input, reading_layer_lstm_dim, scope_name='bilstm_reading_1_p', reuse=False, is_training=is_training, dropout_rate=options.dropout_rate)[2] # [B, Q, 250 ] with tf.variable_scope('reading_layer_2'): q_rep_reading_2_output = layer_utils.my_lstm_layer( q_rep_reading_1_output, reading_layer_lstm_dim, scope_name='bilstm_reading_1_q', reuse=False, is_training=is_training, dropout_rate=options.dropout_rate)[2] # [B, Q, 250 ] p_rep_reading_2_output = layer_utils.my_lstm_layer( p_rep_reading_1_output, reading_layer_lstm_dim, scope_name='bilstm_reading_1_p', reuse=False, is_training=is_training, dropout_rate=options.dropout_rate)[2] # [B, Q, 250 ] # Understanding Layer with tf.variable_scope('question_understanding_layer'): q_rep_understanding_input = tf.concat( axis=2, values=(q_rep_reading_1_output, q_rep_reading_2_output)) U_q = layer_utils.my_lstm_layer( q_rep_understanding_input, reading_layer_lstm_dim, scope_name='bilstm_understanding_q', reuse=False, is_training=is_training, dropout_rate=options.dropout_rate)[2] # [B, Q, 250 ] U_q_dim = reading_layer_lstm_dim * 2 # FAMF : higher level with tf.variable_scope('famf_higher_layer'): famf_higher_layer_w_dim1 = 500 famf_higher_layer_w_dim2 = 250 famf_q_input = [] famf_p_input = [] # famf_p_input.append(in_passage_word_repres) famf_p_input.append(p_wordlevel_input) famf_higher_layer_w_dim1 += word_dim famf_p_input.append(p_rep_reading_1_output) famf_p_input.append(p_rep_reading_2_output) # famf_q_input.append(in_question_word_repres) famf_q_input.append(q_wordlevel_input) famf_q_input.append(q_rep_reading_1_output) famf_q_input.append(q_rep_reading_2_output) cove_dim_begin = word_dim + char_dim + POS_dim + NER_dim if cove_dim != 0: #cove_dim_begin = word_dim + char_dim + POS_dim + NER_dim p_cove_repres = tf.slice(passage, [0, 0, cove_dim_begin], [batch_size, passage_len, cove_dim]) q_cove_repres = tf.slice(question, [0, 0, cove_dim_begin], [batch_size, question_len, cove_dim]) famf_p_input.append(p_cove_repres) famf_q_input.append(q_cove_repres) famf_higher_layer_w_dim1 += cove_dim if lm_dim != 0: lm_dim_begin = cove_dim_begin + cove_dim p_lm_repres = tf.slice(passage, [0, 0, lm_dim_begin], [batch_size, passage_len, lm_dim]) q_lm_repres = tf.slice(question, [0, 0, lm_dim_begin], [batch_size, question_len, lm_dim]) famf_p_input.append(p_lm_repres) famf_q_input.append(q_lm_repres) famf_higher_layer_w_dim1 += lm_dim famf_p_input = tf.concat(axis=2, values=famf_p_input) # (B, P, D ) famf_q_input = tf.concat(axis=2, values=famf_q_input) # (B, Q, D ) alphas = layer_utils.calcuate_attention( famf_p_input, famf_q_input, famf_higher_layer_w_dim1, famf_higher_layer_w_dim1, scope_name="famf_high_lowlevel", att_type=options.att_type, mask1=passage_mask, mask2=question_mask, att_dim=famf_higher_layer_w_dim2, is_training=is_training, dropout_rate=dropout_rate) h_Cl = tf.matmul( alphas, layer_utils.dropout_layer(q_rep_reading_1_output, dropout_rate, is_training=is_training)) alphas = layer_utils.calcuate_attention( famf_p_input, famf_q_input, famf_higher_layer_w_dim1, famf_higher_layer_w_dim1, scope_name="famf_high_highlevel", att_type=options.att_type, 
mask1=passage_mask, mask2=question_mask, att_dim=famf_higher_layer_w_dim2, is_training=is_training, dropout_rate=dropout_rate) h_Ch = tf.matmul( alphas, layer_utils.dropout_layer(q_rep_reading_2_output, dropout_rate, is_training=is_training)) alphas = layer_utils.calcuate_attention( famf_p_input, famf_q_input, famf_higher_layer_w_dim1, famf_higher_layer_w_dim1, scope_name="famf_high_understandinglevel", att_type=options.att_type, mask1=passage_mask, mask2=question_mask, att_dim=famf_higher_layer_w_dim2, is_training=is_training, dropout_rate=dropout_rate) u_C = tf.matmul( alphas, layer_utils.dropout_layer(U_q, dropout_rate, is_training=is_training)) with tf.variable_scope('famf_higher_layer_passage_lstm'): p_rep_highlayer_input = [] p_rep_highlayer_input.append(p_rep_reading_1_output) p_rep_highlayer_input.append(p_rep_reading_2_output) p_rep_highlayer_input.append(h_Cl) p_rep_highlayer_input.append(h_Ch) p_rep_highlayer_input.append(u_C) p_rep_highlayer_input = tf.concat( axis=2, values=p_rep_highlayer_input) # (B, P, D ) D=(250*5) famf_higher_layer_passage_lstm_dim = 125 V_c = layer_utils.my_lstm_layer( p_rep_highlayer_input, famf_higher_layer_passage_lstm_dim, scope_name='bilstm_higher_layer_p', reuse=False, is_training=is_training, dropout_rate=options.dropout_rate)[2] # [B, Q, 250 ] # FAMF: Self-boosted with tf.variable_scope('famf_selfboosted_layer'): famf_self_boosted_input = [] famf_self_boosted_w_dim1 = 250 * 6 # famf_self_boosted_input.append(in_passage_word_repres) famf_self_boosted_input.append(p_wordlevel_input) famf_self_boosted_w_dim1 += word_dim famf_self_boosted_input.append(p_rep_reading_1_output) famf_self_boosted_input.append(p_rep_reading_2_output) famf_self_boosted_input.append(h_Cl) famf_self_boosted_input.append(h_Ch) famf_self_boosted_input.append(u_C) famf_self_boosted_input.append(V_c) if cove_dim != 0: famf_self_boosted_input.append( tf.slice(passage, [0, 0, cove_dim_begin], [batch_size, passage_len, cove_dim])) famf_self_boosted_w_dim1 += cove_dim # 300 + (250 * 6) + 600(if cove) + 300 (if lm) # if lm_dim != 0: not used in old codebase famf_self_boosted_w_dim2 = 50 # 250 does not fit in memory famf_self_boosted_input = tf.concat( axis=2, values=famf_self_boosted_input ) # (B, P, D ) D=(600 ,300 , 250*6 ) = 2400 useProjectionLayer = True if useProjectionLayer: projection_dim = 50 famf_self_boosted_input_dropout = famf_self_boosted_input famf_self_boosted_projection = layer_utils.projection_layer( famf_self_boosted_input_dropout, famf_self_boosted_w_dim1, projection_dim, scope="self-match-projection") famf_self_boosted_w_dim1 = projection_dim vv_C_input = famf_self_boosted_projection else: vv_C_input = famf_self_boosted_input alphas = layer_utils.calcuate_attention( vv_C_input, vv_C_input, famf_self_boosted_w_dim1, famf_self_boosted_w_dim1, scope_name="famf_selfboosted_layer_attention", att_type=options.att_type, mask1=passage_mask, mask2=passage_mask, att_dim=famf_self_boosted_w_dim2, is_training=is_training, dropout_rate=dropout_rate) vv_C = tf.matmul( alphas, layer_utils.dropout_layer(V_c, dropout_rate, is_training=is_training)) p_rep_selfboosted_layer_input = tf.concat( axis=2, values=(famf_self_boosted_input, vv_C)) return (p_rep_selfboosted_layer_input, 0)
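# --- Illustrative sketch (not part of the model graph) -----------------------
# FusionNet_match_Amit follows the fully-aware attention pattern: attention
# scores are computed over the full "history of word" (word, reading and
# understanding levels, plus CoVe/LM slices when present), while only a single
# level is read out. calcuate_attention's internal scoring lives in layer_utils
# and is not reproduced here; the dot-product below is a simplified stand-in.
import numpy as np

def fully_aware_attend(history_p, history_q, level_q, q_mask):
    """history_p: [P, Dh], history_q: [Q, Dh] concatenated multi-level features;
    level_q: [Q, Dl] the single level actually read; q_mask: [Q] in {0, 1}."""
    logits = history_p @ history_q.T                     # stand-in for calcuate_attention, [P, Q]
    logits = logits - 1e30 * (1.0 - q_mask)[None, :]     # mask padded question positions
    alphas = np.exp(logits - logits.max(axis=-1, keepdims=True))
    alphas = alphas / alphas.sum(axis=-1, keepdims=True)
    return alphas @ level_q                              # fused passage-side vectors, [P, Dl]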
def BiMPM_match(feature_dim, passage, question, passage_length, question_length, passage_mask, question_mask, onehot_binary=None, options=None, scope_name='BiMPM_match_layer', is_training=True, dropout_rate=0.2, reuse=False): match_results = [] match_dim = 0 with tf.variable_scope(scope_name, reuse=reuse): # word-level matching (word_match_reps, word_match_dim, word_PoQ_reps, word_QoP_reps) = onelayer_BiMPM_match(feature_dim, passage, question, passage_mask, question_mask, options=options, scope_name='word_level_BiMPM', is_training=is_training, dropout_rate=dropout_rate, reuse=False) match_results.append(word_match_reps) match_dim += word_match_dim # contextual level matching passage_reps = [passage, word_PoQ_reps] passage_dim = 2 * feature_dim # if onehot_binary is not None: # passage_reps.append(onehot_binary) # passage_dim += 11 question_reps = [question] if options.with_QoP: question_reps.append(word_QoP_reps) passage_context = passage if onehot_binary is not None: passage_context = tf.concat( axis=2, values=[passage_context, onehot_binary]) question_context = question for i in xrange(options.context_layer_num): cur_passage_reps = tf.concat(axis=2, values=passage_reps) cur_question_reps = tf.concat(axis=2, values=question_reps) # lstm over passage and question individually passage_context = layer_utils.my_lstm_layer( passage_context, options.context_lstm_dim, scope_name="passage_context_lstm_{}".format(i), reuse=False, is_training=is_training, dropout_rate=dropout_rate)[2] passage_context = tf.multiply( passage_context, tf.expand_dims(passage_mask, axis=-1)) question_context = layer_utils.my_lstm_layer( question_context, options.context_lstm_dim, scope_name="question_context_lstm_{}".format(i), reuse=False, is_training=is_training, dropout_rate=dropout_rate)[2] question_context = tf.multiply( question_context, tf.expand_dims(question_mask, axis=-1)) # matching (cur_match_reps, cur_match_dim, cur_PoQ_reps, cur_QoP_reps) = onelayer_BiMPM_match( 2 * options.context_lstm_dim, passage_context, question_context, passage_mask, question_mask, accum_dim=passage_dim, passage_accum=cur_passage_reps, question_accum=cur_question_reps, options=options, scope_name='context_BiMPM_{}'.format(i), is_training=is_training, dropout_rate=dropout_rate, reuse=False) match_results.append(cur_match_reps) match_dim += cur_match_dim if options.accumulate_match_input: passage_reps.append(passage_context) passage_reps.append(cur_PoQ_reps) # passage_reps.append(cur_match_reps) passage_dim += 4 * options.context_lstm_dim question_reps.append(question_context) if options.with_QoP: question_reps.append(cur_QoP_reps) else: # passage_reps = [passage_context, cur_PoQ_reps, cur_match_reps] passage_reps = [passage_context, cur_PoQ_reps] passage_dim = 4 * options.context_lstm_dim question_reps = [question_context] if options.with_QoP: question_reps.append(cur_QoP_reps) match_results = tf.concat(axis=2, values=match_results) if options.with_self_match: cur_passage_reps = tf.concat(axis=2, values=passage_reps) cur_passage_reps_projection = layer_utils.projection_layer( cur_passage_reps, passage_dim, options.self_compress_dim, scope="self-match-projection") self_atten_scores = layer_utils.calcuate_attention( cur_passage_reps_projection, cur_passage_reps_projection, options.self_compress_dim, options.self_compress_dim, scope_name="self_boost_att", att_type=options.att_type, att_dim=options.att_dim, remove_diagnoal=True, mask1=passage_mask, mask2=passage_mask, is_training=is_training, dropout_rate=dropout_rate) self_match_reps = 
tf.matmul( self_atten_scores, layer_utils.dropout_layer(match_results, dropout_rate, is_training=is_training)) match_results = tf.concat(axis=2, values=[match_results, self_match_reps]) match_dim = 2 * match_dim return (match_results, match_dim)
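# --- Illustrative sketch (not part of the model graph) -----------------------
# The with_self_match branch of BiMPM_match projects the accumulated passage
# representations, lets every position attend over all *other* positions
# (remove_diagnoal=True), and re-reads the match features, which doubles
# match_dim. A minimal NumPy version of that self-boosted step:
import numpy as np

def self_match(proj_reps, match_feats, p_mask):
    """proj_reps: [P, Dp] projected passage vectors; match_feats: [P, Dm]; p_mask: [P]."""
    logits = proj_reps @ proj_reps.T
    np.fill_diagonal(logits, -1e30)                      # no self-links, as with remove_diagnoal=True
    logits = logits - 1e30 * (1.0 - p_mask)[None, :]     # ignore padded positions
    alphas = np.exp(logits - logits.max(axis=-1, keepdims=True))
    alphas = alphas / alphas.sum(axis=-1, keepdims=True)
    return np.concatenate([match_feats, alphas @ match_feats], axis=1)  # [P, 2 * Dm]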
def FusionNet_match(feature_dim, passage, question, passage_length, question_length, passage_mask, question_mask, onehot_binary=None, options=None, scope_name='FusionNet_match_layer', is_training=True, dropout_rate=0.2, reuse=False): # passage_mask = None # question_mask = None with tf.variable_scope(scope_name, reuse=reuse): #======= Fully Aware MultiLevel Fusion (FAMF) Word Layer # word_atten_scores = layer_utils.calcuate_attention \ word_atten_scores = layer_utils.calcuate_attention( passage, question, feature_dim, feature_dim, scope_name="FAMF_word", att_type=options.att_type, att_dim=options.att_dim, remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate) weighted_by_question_words = tf.matmul( word_atten_scores, layer_utils.dropout_layer(question, dropout_rate, is_training=is_training)) #====== Reading layer passage_tmp = [passage, weighted_by_question_words] passage_tmp_dim = 2 * feature_dim if onehot_binary is not None: passage_tmp.append(onehot_binary) passage_tmp_dim += 11 passage_tmp = tf.concat(axis=2, values=passage_tmp) passage_context1 = layer_utils.my_lstm_layer( passage_tmp, options.context_lstm_dim, scope_name="passage_context1_lstm", reuse=False, is_training=is_training, dropout_rate=dropout_rate)[2] passage_context2 = layer_utils.my_lstm_layer( passage_context1, options.context_lstm_dim, scope_name="passage_context2_lstm", reuse=False, is_training=is_training, dropout_rate=dropout_rate)[2] question_context1 = layer_utils.my_lstm_layer( question, options.context_lstm_dim, scope_name="question_context1_lstm", reuse=False, is_training=is_training, dropout_rate=dropout_rate)[2] question_context2 = layer_utils.my_lstm_layer( question_context1, options.context_lstm_dim, scope_name="question_context2_lstm", reuse=False, is_training=is_training, dropout_rate=dropout_rate)[2] # ==== Understanding Layer quesiton_understand_input = tf.concat(axis=2, values=(question_context1, question_context2)) quesiton_understand_output = layer_utils.my_lstm_layer( quesiton_understand_input, options.context_lstm_dim, scope_name="question_under_lstm", reuse=False, is_training=is_training, dropout_rate=dropout_rate)[2] # ==== FAMF : higher level famf_passage_input = tf.concat(axis=2, values=(passage, passage_context1, passage_context2)) famf_question_input = tf.concat(axis=2, values=(question, question_context1, question_context2)) passage_in_dim = feature_dim + 4 * options.context_lstm_dim lower_level_atten_scores = layer_utils.calcuate_attention( famf_passage_input, famf_question_input, passage_in_dim, passage_in_dim, scope_name="lower_level_att", att_type=options.att_type, att_dim=options.att_dim, remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate) high_level_atten_scores = layer_utils.calcuate_attention( famf_passage_input, famf_question_input, passage_in_dim, passage_in_dim, scope_name="high_level_att", att_type=options.att_type, att_dim=options.att_dim, remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate) understand_atten_scores = layer_utils.calcuate_attention( famf_passage_input, famf_question_input, passage_in_dim, passage_in_dim, scope_name="understand_att", att_type=options.att_type, att_dim=options.att_dim, remove_diagnoal=False, mask1=passage_mask, mask2=question_mask, is_training=is_training, dropout_rate=dropout_rate) h_Cl = tf.matmul( lower_level_atten_scores, 
layer_utils.dropout_layer(question_context1, dropout_rate, is_training=is_training)) h_Ch = tf.matmul( high_level_atten_scores, layer_utils.dropout_layer(question_context2, dropout_rate, is_training=is_training)) u_C = tf.matmul( understand_atten_scores, layer_utils.dropout_layer(quesiton_understand_output, dropout_rate, is_training=is_training)) # ====famf_higher_layer_passage_lstm V_c_input = tf.concat( axis=2, values=[passage_context1, passage_context2, h_Cl, h_Ch, u_C]) V_c = layer_utils.my_lstm_layer( V_c_input, options.context_lstm_dim, scope_name="famf_higher_layer_passage_lstm", reuse=False, is_training=is_training, dropout_rate=dropout_rate)[2] # VV_c_input = tf.concat(axis=2, values=[passage_tmp, V_c_input, V_c]) # input_dim = 12*options.context_lstm_dim + passage_tmp_dim VV_c_input = tf.concat(axis=2, values=[passage, V_c_input, V_c]) input_dim = 12 * options.context_lstm_dim + feature_dim # ==== FAMF: Self-boosted if options.with_self_match: VV_c_input_projection = layer_utils.projection_layer( VV_c_input, input_dim, options.self_compress_dim, scope="self-boost-projection") self_atten_scores = layer_utils.calcuate_attention( VV_c_input_projection, VV_c_input_projection, options.self_compress_dim, options.self_compress_dim, scope_name="self_boost_att", att_type=options.att_type, att_dim=options.att_dim, remove_diagnoal=options.remove_diagonal, mask1=passage_mask, mask2=passage_mask, is_training=is_training, dropout_rate=dropout_rate) VV_c = tf.matmul( self_atten_scores, layer_utils.dropout_layer(V_c, dropout_rate, is_training=is_training)) VV_c_input = tf.concat(axis=2, values=[VV_c_input, VV_c]) input_dim += 2 * options.context_lstm_dim # match_results = layer_utils.my_lstm_layer(VV_c_input, options.context_lstm_dim, scope_name="match_result", reuse=False, # is_training=is_training, dropout_rate=dropout_rate)[2] # match_dim = 2 * options.context_lstm_dim # return (match_results, match_dim) return (VV_c_input, input_dim)
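# --- Illustrative dimension bookkeeping (not part of the model graph) ---------
# FusionNet_match returns VV_c_input, which concatenates the passage word
# features (feature_dim), the five 2*context_lstm_dim streams that feed V_c
# (context1, context2, h_Cl, h_Ch, u_C), and V_c itself, giving
# feature_dim + 12 * context_lstm_dim; self-matching appends another
# 2 * context_lstm_dim. The hypothetical helper below just mirrors that
# arithmetic so the returned input_dim can be sanity-checked.
def _fusionnet_output_dim(feature_dim, context_lstm_dim, with_self_match=False):
    v_c_input_dim = 5 * 2 * context_lstm_dim                    # context1, context2, h_Cl, h_Ch, u_C
    dim = feature_dim + v_c_input_dim + 2 * context_lstm_dim    # passage + V_c_input + V_c
    if with_self_match:
        dim += 2 * context_lstm_dim                             # VV_c re-read of V_c
    return dim
# e.g. _fusionnet_output_dim(300, 100) == 300 + 12 * 100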
def multi_perspective_match(feature_dim, repres1, repres2, is_training=True, dropout_rate=0.2, options=None, scope_name='mp-match', reuse=False): ''' :param repres1: [batch_size, len, feature_dim] :param repres2: [batch_size, len, feature_dim] :return: ''' repres1 = layer_utils.dropout_layer(repres1, dropout_rate, is_training=is_training) repres2 = layer_utils.dropout_layer(repres2, dropout_rate, is_training=is_training) input_shape = tf.shape(repres1) batch_size = input_shape[0] seq_length = input_shape[1] matching_result = [] cosine_norm = True with tf.variable_scope(scope_name, reuse=reuse): match_dim = 0 if options.with_cosine: cosine_value = layer_utils.cosine_distance(repres1, repres2, cosine_norm=cosine_norm) cosine_value = tf.reshape(cosine_value, [batch_size, seq_length, 1]) matching_result.append(cosine_value) match_dim += 1 concat_rep = tf.concat(axis=2, values=[repres1, repres2]) if options.with_nn_match: nn_match_W = tf.get_variable( "nn_match_W", [2 * feature_dim, options.nn_match_dim], dtype=tf.float32) nn_match_b = tf.get_variable("nn_match_b", [options.nn_match_dim], dtype=tf.float32) cur_rep = tf.reshape(concat_rep, [batch_size * seq_length, 2 * feature_dim]) cur_match_result = tf.tanh( tf.matmul(cur_rep, nn_match_W) + nn_match_b) cur_match_result = tf.reshape( cur_match_result, [batch_size, seq_length, options.nn_match_dim]) matching_result.append(cur_match_result) match_dim += options.nn_match_dim if options.with_mp_cosine: if options.mp_cosine_proj_dim > 0: mp_cosine_projection = tf.get_variable( "mp_cosine_projection", [feature_dim, options.mp_cosine_proj_dim], dtype=tf.float32) mp_cosine_params = tf.get_variable( "mp_cosine", shape=[ 1, options.cosine_MP_dim, options.mp_cosine_proj_dim ], dtype=tf.float32) repres1_flat = tf.reshape( repres1, [batch_size * seq_length, feature_dim]) repres2_flat = tf.reshape( repres2, [batch_size * seq_length, feature_dim]) repres1_flat = tf.tanh( tf.matmul(repres1_flat, mp_cosine_projection)) repres2_flat = tf.tanh( tf.matmul(repres2_flat, mp_cosine_projection)) repres1_flat = tf.expand_dims(repres1_flat, axis=1) repres2_flat = tf.expand_dims(repres2_flat, axis=1) mp_cosine_matching = layer_utils.cosine_distance( tf.multiply(repres1_flat, mp_cosine_params), repres2_flat, cosine_norm=cosine_norm) mp_cosine_matching = tf.reshape( mp_cosine_matching, [batch_size, seq_length, options.cosine_MP_dim]) else: mp_cosine_params = tf.get_variable( "mp_cosine", shape=[1, 1, options.cosine_MP_dim, feature_dim], dtype=tf.float32) repres1_flat = tf.expand_dims(repres1, axis=2) repres2_flat = tf.expand_dims(repres2, axis=2) mp_cosine_matching = layer_utils.cosine_distance( tf.multiply(repres1_flat, mp_cosine_params), repres2_flat, cosine_norm=cosine_norm) matching_result.append(mp_cosine_matching) match_dim += options.cosine_MP_dim if options.with_match_lstm: (_, _, match_lstm_result) = layer_utils.my_lstm_layer( concat_rep, options.match_lstm_dim, scope_name="match_lstm", reuse=False, is_training=is_training, dropout_rate=dropout_rate) matching_result.append(match_lstm_result) match_dim += 2 * options.match_lstm_dim matching_result = tf.concat(axis=2, values=matching_result) return (matching_result, match_dim)
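# --- Illustrative sketch (not part of the model graph) -----------------------
# In the with_mp_cosine branch above, each of options.cosine_MP_dim perspectives
# rescales the first input with a learned weight vector before taking a cosine
# against the second input (only repres1 is reweighted in this code). A NumPy
# version for one sequence, without the optional projection:
import numpy as np

def mp_cosine(r1, r2, W, eps=1e-8):
    """r1, r2: [T, D] aligned sequences; W: [K, D] one learned weight vector per perspective."""
    a = r1[:, None, :] * W[None, :, :]            # perspective-scaled copies of r1, [T, K, D]
    b = r2[:, None, :]                            # r2 broadcast across perspectives, [T, 1, D]
    num = (a * b).sum(axis=-1)
    den = np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1) + eps
    return num / den                              # [T, K] multi-perspective cosine values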
def MPCM_match(feature_dim, passage, question, passage_length, question_length, passage_mask, question_mask, options=None, scope_name='MPCM_match_layer', is_training=True, dropout_rate=0.2, reuse=False): match_results = [] match_dim = 0 with tf.variable_scope(scope_name, reuse=reuse): if options.with_word_match: (word_match_reps, word_match_dim) = multi_granularity_match( feature_dim, passage, question, passage_length, question_length, passage_mask=passage_mask, question_mask=question_mask, is_training=is_training, dropout_rate=dropout_rate, options=options, with_full_matching=False, with_attentive_matching=True, with_max_attentive_matching=True, scope_name='word_match', reuse=False) match_results.append(word_match_reps) match_dim += word_match_dim if options.with_sequential_match: cur_passage_context = None cur_question_context = None for i in xrange(options.context_layer_num): if cur_passage_context is None: cur_passage_context = passage cur_question_context = question else: cur_passage_context = tf.concat( axis=2, values=[passage, cur_passage_context]) cur_question_context = tf.concat( axis=2, values=[question, cur_question_context]) (cur_passage_context_fw, cur_passage_context_bw, cur_passage_context) = layer_utils.my_lstm_layer( cur_passage_context, options.context_lstm_dim, scope_name="passage_context_lstm_{}".format(i), reuse=False, is_training=is_training, dropout_rate=dropout_rate) cur_passage_context_fw = tf.multiply( cur_passage_context_fw, tf.expand_dims(passage_mask, axis=-1)) cur_passage_context_bw = tf.multiply( cur_passage_context_bw, tf.expand_dims(passage_mask, axis=-1)) cur_passage_context = tf.multiply( cur_passage_context, tf.expand_dims(passage_mask, axis=-1)) (cur_question_context_fw, cur_question_context_bw, cur_question_context) = layer_utils.my_lstm_layer( cur_question_context, options.context_lstm_dim, scope_name="question_context_lstm_{}".format(i), reuse=False, is_training=is_training, dropout_rate=dropout_rate) cur_question_context_fw = tf.multiply( cur_question_context_fw, tf.expand_dims(question_mask, axis=-1)) cur_question_context_bw = tf.multiply( cur_question_context_bw, tf.expand_dims(question_mask, axis=-1)) cur_question_context = tf.multiply( cur_question_context, tf.expand_dims(question_mask, axis=-1)) if options.with_attentive_match: # forward matching (cur_match_rep, cur_match_dim) = multi_granularity_match( options.context_lstm_dim, cur_passage_context_fw, cur_question_context_fw, passage_length, question_length, passage_mask=passage_mask, question_mask=question_mask, is_training=is_training, dropout_rate=dropout_rate, options=options, with_full_matching=False, with_attentive_matching=True, with_max_attentive_matching=True, scope_name='seq_forward_match_{}'.format(i)) match_dim += cur_match_dim match_results.append(cur_match_rep) # backward matching (cur_match_rep, cur_match_dim) = multi_granularity_match( options.context_lstm_dim, cur_passage_context_bw, cur_question_context_bw, passage_length, question_length, passage_mask=passage_mask, question_mask=question_mask, is_training=is_training, dropout_rate=dropout_rate, options=options, with_full_matching=False, with_attentive_matching=True, with_max_attentive_matching=True, scope_name='seq_backward_match_{}'.format(i)) match_dim += cur_match_dim match_results.append(cur_match_rep) if options.with_full_match: # full matching (cur_match_rep, cur_match_dim) = multi_granularity_match( 2 * options.context_lstm_dim, cur_passage_context, cur_question_context, passage_length, question_length, 
passage_mask=passage_mask, question_mask=question_mask, is_training=is_training, dropout_rate=dropout_rate, options=options, with_full_matching=True, with_attentive_matching=False, with_max_attentive_matching=False, scope_name='seq_full_match_{}'.format(i)) match_dim += cur_match_dim match_results.append(cur_match_rep) if options.with_word_phrase_match: question_context_proj = layer_utils.projection_layer( cur_question_context, 2 * options.context_lstm_dim, feature_dim, activation_func=tf.tanh, scope="question_context_proj_{}".format(i)) (cur_match_rep, cur_match_dim) = multi_granularity_match( feature_dim, passage, question_context_proj, passage_length, question_length, passage_mask=passage_mask, question_mask=question_mask, is_training=is_training, dropout_rate=dropout_rate, options=options, with_full_matching=False, with_attentive_matching=True, with_max_attentive_matching=True, scope_name='word_phrase_match_{}'.format(i)) match_dim += cur_match_dim match_results.append(cur_match_rep) if options.with_phrase_word_match: passage_context_proj = layer_utils.projection_layer( cur_passage_context, 2 * options.context_lstm_dim, feature_dim, activation_func=tf.tanh, scope="passage_context_proj_{}".format(i)) (cur_match_rep, cur_match_dim) = multi_granularity_match( feature_dim, passage_context_proj, question, passage_length, question_length, passage_mask=passage_mask, question_mask=question_mask, is_training=is_training, dropout_rate=dropout_rate, options=options, with_full_matching=False, with_attentive_matching=True, with_max_attentive_matching=True, scope_name='phrase_word_match_{}'.format(i)) match_dim += cur_match_dim match_results.append(cur_match_rep) match_results = tf.concat(axis=2, values=match_results) return (match_results, match_dim)
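# --- Illustrative sketch (not part of the model graph) -----------------------
# MPCM_match delegates the actual matching to multi_granularity_match (defined
# elsewhere), toggling attentive and max-attentive modes. Assuming those modes
# work from a non-negative relevancy matrix (e.g. cosine scores), a minimal
# NumPy sketch of the two alignments they are usually built on:
import numpy as np

def attentive_alignments(question, relevancy):
    """question: [Q, D]; relevancy: [P, Q] non-negative relevancy scores.
    Returns, per passage position, the softly aligned question vector
    ('attentive') and the single best-matching question word ('max-attentive')."""
    weights = relevancy / (relevancy.sum(axis=-1, keepdims=True) + 1e-8)
    attentive = weights @ question                        # weighted sum of question words, [P, D]
    max_attentive = question[relevancy.argmax(axis=-1)]   # hardest alignment, [P, D]
    return attentive, max_attentive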