def _call(self, inputs):
    self_vecs, neigh_vecs = inputs

    if self.mode == "train":
        neigh_vecs = tf.nn.dropout(neigh_vecs, 1 - self.dropout)
        self_vecs = tf.nn.dropout(self_vecs, 1 - self.dropout)

    # reduce_mean performs better than mean_pool
    neigh_means = tf.reduce_mean(neigh_vecs, axis=1)
    # neigh_means = mean_pool(neigh_vecs, neigh_len)

    # [nodes] x [out_dim]
    from_neighs = tf.matmul(neigh_means, self.vars['neigh_weights'])
    if self.if_use_high_way:
        with tf.variable_scope("fw_hidden_highway"):
            fw_hidden = multi_highway_layer(from_neighs, self.neigh_input_dim, 1)

    from_self = tf.matmul(self_vecs, self.vars["self_weights"])

    if not self.concat:
        output = tf.add_n([from_self, from_neighs])
    else:
        output = tf.concat([from_self, from_neighs], axis=1)

    # bias
    if self.bias:
        output += self.vars['bias']

    return self.act(output), self.output_dim
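# --- Illustrative sketch (not part of the model code) ------------------------
# `_call` above is a mean aggregator: neighbor vectors are averaged, the self
# and neighbor vectors are each linearly projected, and the two results are
# summed or concatenated before the activation. A minimal NumPy sketch of that
# math, assuming hypothetical shapes and a ReLU activation (dropout and the
# optional highway branch omitted):
import numpy as np

def mean_aggregate(self_vecs, neigh_vecs, self_w, neigh_w, bias=None, concat=False):
    """self_vecs: [num_nodes, in_dim]; neigh_vecs: [num_nodes, num_neighbors, in_dim]."""
    neigh_means = neigh_vecs.mean(axis=1)          # average the neighbor features
    from_neighs = neigh_means @ neigh_w            # [num_nodes, out_dim]
    from_self = self_vecs @ self_w                 # [num_nodes, out_dim]
    out = np.concatenate([from_self, from_neighs], axis=1) if concat \
        else from_self + from_neighs
    if bias is not None:
        out = out + bias
    return np.maximum(out, 0.0)                    # assumed ReLU activation

# toy usage: 4 nodes, 5 neighbors each, in_dim=8, out_dim=16
rng = np.random.default_rng(0)
out = mean_aggregate(rng.normal(size=(4, 8)), rng.normal(size=(4, 5, 8)),
                     rng.normal(size=(8, 16)), rng.normal(size=(8, 16)))
print(out.shape)  # (4, 16)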
def build_network(self):
    self.options = self.config["options"]
    self.options["batch_size"] = self.batch_size
    self.highway_layer_num = self.options["highway_layer_num"]
    self.with_highway = self.options["with_highway"]
    self.wd = self.config.get("weight_decay", None)
    self.l2_reg = float(self.config["l2_reg"])

    in_question_repres = tf.nn.dropout(self.s1_emb, self.dropout_keep_prob)
    in_passage_repres = tf.nn.dropout(self.s2_emb, self.dropout_keep_prob)
    input_dim = self.emb_size

    # ======Highway layer======
    if self.with_highway:
        with tf.variable_scope(self.scope + "-input_highway"):
            in_question_repres = match_utils.multi_highway_layer(
                in_question_repres, input_dim, self.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(
                in_passage_repres, input_dim, self.highway_layer_num)

    # ========Bilateral Matching=====
    with tf.variable_scope(self.scope + "-bilateral_matching"):
        (match_representation, match_dim) = match_utils.bilateral_match_func(
            in_question_repres, in_passage_repres,
            self.sent1_token_len, self.sent2_token_len,
            self.sent1_token_mask, self.sent2_token_mask,
            input_dim, self.config["mode"],
            options=self.options, dropout_rate=self.dropout_keep_prob)
        self.output_features = match_representation

    #========Prediction Layer=========
    with tf.variable_scope(self.scope + "-prediction"):
        # match_dim = 4 * self.options.aggregation_lstm_dim
        w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [match_dim / 2, self.num_classes], dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [self.num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        logits = tf.nn.dropout(logits, self.dropout_keep_prob)
        self.estimation = tf.matmul(logits, w_1) + b_1

        self.pred_probs = tf.contrib.layers.softmax(self.estimation)
        self.logits = tf.cast(tf.argmax(self.pred_probs, -1), tf.int32)

    match_utils.add_reg_without_bias(self.scope)
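# --- Illustrative sketch (not part of the model code) ------------------------
# The prediction layer in `build_network` is a two-layer feed-forward head over
# the match representation: a tanh projection to match_dim/2, a linear map to
# the class logits, then a softmax and argmax. A NumPy sketch of that
# computation, with random stand-ins for the w_0/b_0/w_1/b_1 variables and
# dropout omitted:
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def prediction_head(match_repr, w_0, b_0, w_1, b_1):
    """match_repr: [batch_size, match_dim] -> (class probabilities, predicted labels)."""
    hidden = np.tanh(match_repr @ w_0 + b_0)       # [batch_size, match_dim // 2]
    estimation = hidden @ w_1 + b_1                # [batch_size, num_classes]
    pred_probs = softmax(estimation)
    return pred_probs, pred_probs.argmax(axis=-1).astype(np.int32)

# toy usage with match_dim=8, num_classes=3
rng = np.random.default_rng(0)
probs, preds = prediction_head(rng.normal(size=(2, 8)),
                               rng.normal(size=(8, 4)), np.zeros(4),
                               rng.normal(size=(4, 3)), np.zeros(3))
print(probs.shape, preds)  # (2, 3) and two predicted class ids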
def add_highway_layer(self, highway_layer_num, tied_aggre=False,
                      reuse_question=None, reuse_choice=None):
    # Add highway layer on top of matching layer
    if tied_aggre:
        name = 'matching_highway'
    else:
        name = 'matching_highway_{}'.format(self.matching_id)
    if self.question_repre_dim > 0:
        with tf.variable_scope("{}_ques".format(name), reuse=reuse_question):
            self.question_repre = multi_highway_layer(
                self.question_repre, self.question_repre_dim, highway_layer_num)
    if self.choice_repre_dim > 0:
        with tf.variable_scope("{}_choice".format(name), reuse=reuse_choice):
            self.choice_repre = multi_highway_layer(
                self.choice_repre, self.choice_repre_dim, highway_layer_num)
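# --- Illustrative sketch (not part of the model code) ------------------------
# `multi_highway_layer` is defined in match_utils and is not shown here. As a
# rough reference, a single highway layer typically computes a gated mix of a
# non-linear transform and the identity: y = t * tanh(W_h x + b_h) + (1 - t) * x
# with gate t = sigmoid(W_t x + b_t). A minimal NumPy sketch of that transform,
# with hypothetical weights (the exact form used by match_utils may differ):
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def highway_layer(x, w_t, b_t, w_h, b_h):
    """One highway layer over the last dimension of x (shape [..., dim]).
    The gate decides, per feature, how much of the transform to keep versus
    how much of the input to pass through unchanged."""
    t = sigmoid(x @ w_t + b_t)       # transform gate in (0, 1)
    h = np.tanh(x @ w_h + b_h)       # candidate transform
    return t * h + (1.0 - t) * x     # weights must be square: [dim, dim]

# stacking a few of these with independent weights is what a "multi" highway
# layer amounts to
rng = np.random.default_rng(0)
dim = 6
x = rng.normal(size=(2, 5, dim))     # e.g. [batch, seq_len, dim]
for _ in range(2):
    x = highway_layer(x, rng.normal(size=(dim, dim)), np.zeros(dim),
                      rng.normal(size=(dim, dim)), np.zeros(dim))
print(x.shape)  # (2, 5, 6)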
def add_aggregation_highway(self, highway_layer_num, tied_aggre=False, reuse=None):
    # Add aggregation highway layer (after aggregation LSTM)
    # if tied_aggre:
    name = 'aggre_highway'
    # else:
    #     name = 'aggre_highway_{}'.format(self.matching_id)
    with tf.variable_scope(name, reuse=reuse):
        agg_shape = tf.shape(self.aggregation_representation)
        batch_size = agg_shape[0]
        self.aggregation_representation = tf.reshape(
            self.aggregation_representation,
            [1, batch_size, self.aggregation_dim])
        self.aggregation_representation = multi_highway_layer(
            self.aggregation_representation, self.aggregation_dim,
            highway_layer_num)
        self.aggregation_representation = tf.reshape(
            self.aggregation_representation,
            [batch_size, self.aggregation_dim])
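# --- Illustrative sketch (not part of the model code) ------------------------
# `add_aggregation_highway` reuses the same highway utility on a rank-2 tensor
# ([batch_size, aggregation_dim]) by temporarily viewing it as a rank-3 tensor
# of shape [1, batch_size, aggregation_dim] and reshaping back afterwards. A
# small NumPy sketch of that round-trip, using a stand-in elementwise transform
# in place of multi_highway_layer:
import numpy as np

def apply_as_rank3(agg_repr, transform):
    """Reshape [batch, dim] -> [1, batch, dim], apply a function that expects a
    rank-3 [batch, time, dim] input, then reshape back to [batch, dim]."""
    batch_size, dim = agg_repr.shape
    x = agg_repr.reshape(1, batch_size, dim)
    x = transform(x)                    # stand-in for multi_highway_layer
    return x.reshape(batch_size, dim)

agg = np.arange(6.0).reshape(3, 2)      # [batch_size=3, aggregation_dim=2]
out = apply_as_rank3(agg, np.tanh)
print(out.shape)  # (3, 2)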
def encode(self, is_training=True):
    options = self.options
    # ======word representation layer======
    in_passage_repres = []
    input_dim = 0
    if options.with_word and self.word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.variable_scope("embedding"), tf.device(cur_device):
            self.word_embedding = tf.get_variable(
                "word_embedding",
                trainable=word_vec_trainable,
                initializer=tf.constant(self.word_vocab.word_vecs),
                dtype=tf.float32)
        in_passage_word_repres = tf.nn.embedding_lookup(
            self.word_embedding,
            self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(self.in_passage_words)
        batch_size = input_shape[0]
        passage_len = input_shape[1]
        input_dim += self.word_vocab.word_dim

    if options.with_char and self.char_vocab is not None:
        input_shape = tf.shape(self.in_passage_chars)
        batch_size = input_shape[0]
        passage_len = input_shape[1]
        p_char_len = input_shape[2]
        char_dim = self.char_vocab.word_dim
        self.char_embedding = tf.get_variable(
            "char_embedding",
            initializer=tf.constant(self.char_vocab.word_vecs),
            dtype=tf.float32)

        in_passage_char_repres = tf.nn.embedding_lookup(
            self.char_embedding, self.in_passage_chars
        )  # [batch_size, passage_len, p_char_len, char_dim]
        in_passage_char_repres = tf.reshape(
            in_passage_char_repres, shape=[-1, p_char_len, char_dim])
        passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])

        with tf.variable_scope('char_lstm'):
            # lstm cell
            char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(
                options.char_lstm_dim)
            # dropout
            if is_training:
                char_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                    char_lstm_cell,
                    output_keep_prob=(1 - options.dropout_rate))
            char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell])

            # passage representation
            passage_char_outputs = tf.nn.dynamic_rnn(
                char_lstm_cell,
                in_passage_char_repres,
                sequence_length=passage_char_lengths,
                dtype=tf.float32
            )[0]  # [batch_size*passage_len, p_char_len, char_lstm_dim]
            passage_char_outputs = collect_final_step_lstm(
                passage_char_outputs, passage_char_lengths - 1)
            passage_char_outputs = tf.reshape(
                passage_char_outputs,
                [batch_size, passage_len, options.char_lstm_dim])

        in_passage_repres.append(passage_char_outputs)
        input_dim += options.char_lstm_dim

    if options.with_POS and self.POS_vocab is not None:
        self.POS_embedding = tf.get_variable(
            "POS_embedding",
            initializer=tf.constant(self.POS_vocab.word_vecs),
            dtype=tf.float32)

        in_passage_POS_repres = tf.nn.embedding_lookup(
            self.POS_embedding,
            self.in_passage_POSs)  # [batch_size, passage_len, POS_dim]
        in_passage_repres.append(in_passage_POS_repres)

        input_shape = tf.shape(self.in_passage_POSs)
        batch_size = input_shape[0]
        passage_len = input_shape[1]
        input_dim += self.POS_vocab.word_dim

    if options.with_NER and self.NER_vocab is not None:
        self.NER_embedding = tf.get_variable(
            "NER_embedding",
            initializer=tf.constant(self.NER_vocab.word_vecs),
            dtype=tf.float32)

        in_passage_NER_repres = tf.nn.embedding_lookup(
            self.NER_embedding,
            self.in_passage_NERs)  # [batch_size, passage_len, NER_dim]
        in_passage_repres.append(in_passage_NER_repres)

        input_shape = tf.shape(self.in_passage_NERs)
        batch_size = input_shape[0]
        passage_len = input_shape[1]
        input_dim += self.NER_vocab.word_dim

    in_passage_repres = tf.concat(in_passage_repres,
                                  2)  # [batch_size, passage_len, dim]

    if options.compress_input:  # compress input word vector into smaller vectors
        w_compress = tf.get_variable(
            "w_compress_input", [input_dim, options.compress_input_dim],
            dtype=tf.float32)
        b_compress = tf.get_variable("b_compress_input",
                                     [options.compress_input_dim],
                                     dtype=tf.float32)

        in_passage_repres = tf.reshape(in_passage_repres, [-1, input_dim])
        in_passage_repres = tf.matmul(in_passage_repres,
                                      w_compress) + b_compress
        in_passage_repres = tf.tanh(in_passage_repres)
        in_passage_repres = tf.reshape(
            in_passage_repres,
            [batch_size, passage_len, options.compress_input_dim])
        input_dim = options.compress_input_dim

    if is_training:
        in_passage_repres = tf.nn.dropout(in_passage_repres,
                                          (1 - options.dropout_rate))

    passage_mask = tf.sequence_mask(
        self.passage_lengths, passage_len,
        dtype=tf.float32)  # [batch_size, passage_len]

    # sequential context matching
    passage_forward = None
    passage_backward = None
    all_passage_representation = []
    passage_dim = 0
    with_lstm = True
    if with_lstm:
        with tf.variable_scope('biLSTM'):
            cur_in_passage_repres = in_passage_repres
            for i in xrange(options.context_layer_num):
                with tf.variable_scope('layer-{}'.format(i)):
                    with tf.variable_scope('context_represent'):
                        # parameters
                        context_lstm_cell_fw = tf.contrib.rnn.LSTMCell(
                            options.context_lstm_dim)
                        context_lstm_cell_bw = tf.contrib.rnn.LSTMCell(
                            options.context_lstm_dim)
                        if is_training:
                            context_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(
                                context_lstm_cell_fw,
                                output_keep_prob=(1 - options.dropout_rate))
                            context_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(
                                context_lstm_cell_bw,
                                output_keep_prob=(1 - options.dropout_rate))

                        # passage representation
                        ((passage_context_representation_fw,
                          passage_context_representation_bw),
                         (passage_forward,
                          passage_backward)) = tf.nn.bidirectional_dynamic_rnn(
                              context_lstm_cell_fw,
                              context_lstm_cell_bw,
                              cur_in_passage_repres,
                              dtype=tf.float32,
                              sequence_length=self.passage_lengths
                          )  # [batch_size, passage_len, context_lstm_dim]

                        # [batch_size, passage_len, 2*context_lstm_dim]
                        cur_in_passage_repres = tf.concat([
                            passage_context_representation_fw,
                            passage_context_representation_bw
                        ], 2)
                        passage_dim += 2 * options.context_lstm_dim
                        all_passage_representation.append(
                            cur_in_passage_repres)

    all_passage_representation = tf.concat(
        all_passage_representation,
        2)  # [batch_size, passage_len, L*passage_dim]

    # ======Highway layer======
    if options.with_match_highway:
        with tf.variable_scope("context_highway"):
            all_passage_representation = match_utils.multi_highway_layer(
                all_passage_representation, passage_dim,
                options.highway_layer_num)

    all_passage_representation = all_passage_representation * tf.expand_dims(
        passage_mask, axis=-1)

    # initial state for the LSTM decoder
    #'''
    with tf.variable_scope('initial_state_for_decoder'):
        # Define weights and biases to reduce the cell and reduce the state
        w_reduce_c = tf.get_variable(
            'w_reduce_c',
            [2 * options.context_lstm_dim, options.gen_hidden_size],
            dtype=tf.float32)
        w_reduce_h = tf.get_variable(
            'w_reduce_h',
            [2 * options.context_lstm_dim, options.gen_hidden_size],
            dtype=tf.float32)
        bias_reduce_c = tf.get_variable('bias_reduce_c',
                                        [options.gen_hidden_size],
                                        dtype=tf.float32)
        bias_reduce_h = tf.get_variable('bias_reduce_h',
                                        [options.gen_hidden_size],
                                        dtype=tf.float32)

        old_c = tf.concat(values=[passage_forward.c, passage_backward.c],
                          axis=1)
        old_h = tf.concat(values=[passage_forward.h, passage_backward.h],
                          axis=1)
        new_c = tf.nn.tanh(tf.matmul(old_c, w_reduce_c) + bias_reduce_c)
        new_h = tf.nn.tanh(tf.matmul(old_h, w_reduce_h) + bias_reduce_h)
        init_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
    '''
    new_c = tf.zeros([batch_size, options.gen_hidden_size])
    new_h = tf.zeros([batch_size, options.gen_hidden_size])
    init_state = LSTMStateTuple(new_c, new_h)
    '''
    return (passage_dim, all_passage_representation, init_state)
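# --- Illustrative sketch (not part of the model code) ------------------------
# The 'initial_state_for_decoder' block above reduces the concatenated
# forward/backward final LSTM states into a single decoder state of size
# gen_hidden_size: new_c = tanh([c_fw; c_bw] W_c + b_c) and
# new_h = tanh([h_fw; h_bw] W_h + b_h). A NumPy sketch of that reduction,
# with hypothetical dimensions:
import numpy as np

def reduce_bi_state(c_fw, c_bw, h_fw, h_bw, w_c, b_c, w_h, b_h):
    """Map concatenated fw/bw LSTM final states ([batch, 2*context_lstm_dim])
    down to a decoder init state ([batch, gen_hidden_size])."""
    old_c = np.concatenate([c_fw, c_bw], axis=1)
    old_h = np.concatenate([h_fw, h_bw], axis=1)
    new_c = np.tanh(old_c @ w_c + b_c)
    new_h = np.tanh(old_h @ w_h + b_h)
    return new_c, new_h   # analogous to LSTMStateTuple(new_c, new_h)

# toy usage: context_lstm_dim=4, gen_hidden_size=3, batch=2
rng = np.random.default_rng(0)
c_fw, c_bw, h_fw, h_bw = (rng.normal(size=(2, 4)) for _ in range(4))
w_c, w_h = rng.normal(size=(8, 3)), rng.normal(size=(8, 3))
new_c, new_h = reduce_bi_state(c_fw, c_bw, h_fw, h_bw,
                               w_c, np.zeros(3), w_h, np.zeros(3))
print(new_c.shape, new_h.shape)  # (2, 3) (2, 3)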
def __init__(self, num_classes, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None, dropout_rate=0.5, learning_rate=0.001, optimize_type='adam', lambda_l2=1e-5, with_word=True, with_char=True, with_POS=True, with_NER=True, char_lstm_dim=20, context_lstm_dim=100, aggregation_lstm_dim=200, is_training=True, filter_layer_threshold=0.2, MP_dim=50, context_layer_num=1, aggregation_layer_num=1, fix_word_vec=False, with_filter_layer=True, with_highway=False, word_level_MP_dim=-1, sep_endpoint=False, end_model_combine=False, with_match_highway=False, with_aggregation_highway=False, highway_layer_num=1, match_to_passage=True, match_to_question=False, match_to_choice=False, with_no_match=False, with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True, use_options=False, num_options=-1, verbose=False, matching_option=0, concat_context=False, tied_aggre=False, rl_training_method='contrastive', rl_matches=None, cond_training=False, reasonet_training=False, reasonet_steps=5, reasonet_hidden_dim=128, reasonet_lambda=10, reasonet_terminate_mode='original', reasonet_keep_first=False, efficient=False, reasonet_logit_combine='sum', tied_match=False): ''' Matching Options: 0:a1=q->p, a2=c->p, [concat(a1->a2,a2->a1)] 1:a1=q->p, a2=c->p, [a1->a2,a2->a1] 2:[q->p,c->p] 3:a1=p->q, a2=p->c, [a1->a2,a2->a1] 4:[q->p,p->q,p->c] 5:a1=q->p, a2=p->q, a3=p->c,[a3->a1,a3->a2] 6:[p->q,p->c] 7: Gated matching concat_context: Concat question & choice and feed into context LSTM tied_aggre: aggregation layer weights are tied. training_method: contrastive reward or policy gradient or soft voting Efficiency options: cond_training: use a tensorflow boolean to control whether to dropout efficient: the feed_dict will contain each passage only once, with the choice in the format of [num_gates*batch_size, dim] RL training method: soft_voting: Simple voting training without RL contrastive: Basic contrastive reward contrastive_imp: Use (r/b-1) instead of (r-b) as in ReasoNet. Reasonet module options: r_steps: reasonet reading steps r_hidden_dim: When calculating distance, the two repre are linear mapped to this dimension. lambda: multiplier for the terminate gate terminate_mode: original for using 0-1 terminate gate, softmax for using a softmax over all possible steps keep_first: feed reasonet step 0 (the initial state) into the prediction module logit_combine: When deciding whether to stop reading on a question, use voting from all questions (sum) or max activation of all questions(max_pooling) tied_match: Matching layer weights are tied. 
''' reasonet_calculated_steps = reasonet_steps + 1 if reasonet_keep_first else reasonet_steps # ======word representation layer====== in_question_repres = [] in_passage_repres = [] in_choice_repres = [] self.question_lengths = tf.placeholder(tf.int32, [None]) self.passage_lengths = tf.placeholder(tf.int32, [None]) self.choice_lengths = tf.placeholder(tf.int32, [None]) self.truth = tf.placeholder(tf.int32, [None]) # [batch_size] if cond_training: self.is_training = tf.placeholder(tf.bool, []) else: self.is_training = is_training self.concat_idx_mat = None self.split_idx_mat_q = None self.split_idx_mat_c = None if matching_option == 7: self.concat_idx_mat = tf.placeholder(tf.int32, [None, None, 2], name='concat_idx_mat') if concat_context: self.split_idx_mat_q = tf.placeholder(tf.int32, [None, None, 2]) self.split_idx_mat_c = tf.placeholder(tf.int32, [None, None, 2]) input_dim = 0 if with_word and word_vocab is not None: self.in_question_words = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_words = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] self.in_choice_words = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] # self.word_embedding = tf.get_variable("word_embedding", shape=[word_vocab.size()+1, word_vocab.word_dim], initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) word_vec_trainable = True cur_device = '/gpu:0' if fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' print('!!!shape=', word_vocab.word_vecs.shape) with tf.device(cur_device): self.word_embedding = tf.get_variable( "word_embedding", trainable=word_vec_trainable, initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) in_question_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] in_passage_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] in_choice_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_choice_words) # [batch_size, passage_len, word_dim] in_question_repres.append(in_question_word_repres) in_passage_repres.append(in_passage_word_repres) in_choice_repres.append(in_choice_word_repres) input_shape = tf.shape(self.in_question_words) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_words) passage_len = input_shape[1] input_shape = tf.shape(self.in_choice_words) choice_len = input_shape[1] input_dim += word_vocab.word_dim if with_POS and POS_vocab is not None: self.in_question_POSs = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_POSs = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] # self.POS_embedding = tf.get_variable("POS_embedding", shape=[POS_vocab.size()+1, POS_vocab.word_dim], initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32) self.POS_embedding = tf.get_variable("POS_embedding", initializer=tf.constant( POS_vocab.word_vecs), dtype=tf.float32) in_question_POS_repres = tf.nn.embedding_lookup( self.POS_embedding, self.in_question_POSs) # [batch_size, question_len, POS_dim] in_passage_POS_repres = tf.nn.embedding_lookup( self.POS_embedding, self.in_passage_POSs) # [batch_size, passage_len, POS_dim] in_question_repres.append(in_question_POS_repres) in_passage_repres.append(in_passage_POS_repres) input_shape = tf.shape(self.in_question_POSs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = 
tf.shape(self.in_passage_POSs) passage_len = input_shape[1] input_dim += POS_vocab.word_dim if with_NER and NER_vocab is not None: self.in_question_NERs = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_NERs = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] # self.NER_embedding = tf.get_variable("NER_embedding", shape=[NER_vocab.size()+1, NER_vocab.word_dim], initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32) self.NER_embedding = tf.get_variable("NER_embedding", initializer=tf.constant( NER_vocab.word_vecs), dtype=tf.float32) in_question_NER_repres = tf.nn.embedding_lookup( self.NER_embedding, self.in_question_NERs) # [batch_size, question_len, NER_dim] in_passage_NER_repres = tf.nn.embedding_lookup( self.NER_embedding, self.in_passage_NERs) # [batch_size, passage_len, NER_dim] in_question_repres.append(in_question_NER_repres) in_passage_repres.append(in_passage_NER_repres) input_shape = tf.shape(self.in_question_NERs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_NERs) passage_len = input_shape[1] input_dim += NER_vocab.word_dim if with_char and char_vocab is not None: self.question_char_lengths = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.passage_char_lengths = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] self.choice_char_lengths = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] self.in_question_chars = tf.placeholder( tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len] self.in_passage_chars = tf.placeholder( tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len] self.in_choice_chars = tf.placeholder( tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len] input_shape = tf.shape(self.in_question_chars) question_len = input_shape[1] q_char_len = input_shape[2] input_shape = tf.shape(self.in_passage_chars) passage_len = input_shape[1] p_char_len = input_shape[2] input_shape = tf.shape(self.in_choice_chars) batch_size = input_shape[0] choice_len = input_shape[1] c_char_len = input_shape[2] char_dim = char_vocab.word_dim # self.char_embedding = tf.get_variable("char_embedding", shape=[char_vocab.size()+1, char_vocab.word_dim], initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32) self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant( char_vocab.word_vecs), dtype=tf.float32) in_question_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_question_chars ) # [batch_size, question_len, q_char_len, char_dim] in_question_char_repres = tf.reshape( in_question_char_repres, shape=[-1, q_char_len, char_dim]) question_char_lengths = tf.reshape(self.question_char_lengths, [-1]) in_passage_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_passage_chars ) # [batch_size, passage_len, p_char_len, char_dim] in_passage_char_repres = tf.reshape( in_passage_char_repres, shape=[-1, p_char_len, char_dim]) passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1]) in_choice_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_choice_chars ) # [batch_size, passage_len, p_char_len, char_dim] in_choice_char_repres = tf.reshape( in_choice_char_repres, shape=[-1, c_char_len, char_dim]) choice_char_lengths = tf.reshape(self.choice_char_lengths, [-1]) with tf.variable_scope('char_lstm'): # lstm cell char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(char_lstm_dim) # dropout if cond_training: 
char_lstm_cell = SwitchableDropoutWrapper( char_lstm_cell, self.is_training, input_keep_prob=(1 - dropout_rate)) elif is_training: char_lstm_cell = tf.contrib.rnn.DropoutWrapper( char_lstm_cell, output_keep_prob=(1 - dropout_rate)) # if is_training: char_lstm_cell = tf.contrib.rnn.DropoutWrapper(char_lstm_cell, output_keep_prob=(1 - dropout_rate)) char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell]) # question_representation question_char_outputs = my_rnn.dynamic_rnn( char_lstm_cell, in_question_char_repres, sequence_length=question_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] question_char_outputs = question_char_outputs[:, -1, :] question_char_outputs = tf.reshape( question_char_outputs, [-1, question_len, char_lstm_dim]) tf.get_variable_scope().reuse_variables() # passage representation passage_char_outputs = my_rnn.dynamic_rnn( char_lstm_cell, in_passage_char_repres, sequence_length=passage_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] passage_char_outputs = passage_char_outputs[:, -1, :] passage_char_outputs = tf.reshape( passage_char_outputs, [-1, passage_len, char_lstm_dim]) tf.get_variable_scope().reuse_variables() # choice representation choice_char_outputs = my_rnn.dynamic_rnn( char_lstm_cell, in_choice_char_repres, sequence_length=choice_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] choice_char_outputs = choice_char_outputs[:, -1, :] choice_char_outputs = tf.reshape( choice_char_outputs, [-1, choice_len, char_lstm_dim]) in_question_repres.append(question_char_outputs) in_passage_repres.append(passage_char_outputs) in_choice_repres.append(choice_char_outputs) input_dim += char_lstm_dim in_question_repres = tf.concat(in_question_repres, 2) # [batch_size, question_len, dim] in_passage_repres = tf.concat(in_passage_repres, 2) # [batch_size, passage_len, dim] in_choice_repres = tf.concat(in_choice_repres, 2) # [batch_size, passage_len, dim] if cond_training: in_question_repres = match_utils.apply_dropout( in_question_repres, self.is_training, dropout_rate) in_passage_repres = match_utils.apply_dropout( in_passage_repres, self.is_training, dropout_rate) in_choice_repres = match_utils.apply_dropout( in_choice_repres, self.is_training, dropout_rate) elif is_training: in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate)) in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate)) in_choice_repres = tf.nn.dropout(in_choice_repres, (1 - dropout_rate)) else: in_question_repres = tf.multiply(in_question_repres, (1 - dropout_rate)) in_passage_repres = tf.multiply(in_passage_repres, (1 - dropout_rate)) in_choice_repres = tf.multiply(in_choice_repres, (1 - dropout_rate)) # if is_training: # in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate)) # in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate)) # in_choice_repres = tf.nn.dropout(in_choice_repres, (1 - dropout_rate)) # else: # in_question_repres = tf.multiply(in_question_repres, (1 - dropout_rate)) # in_passage_repres = tf.multiply(in_passage_repres, (1 - dropout_rate)) # in_choice_repres = tf.multiply(in_choice_repres, (1 - dropout_rate)) mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] question_mask = tf.sequence_mask( self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] choice_mask = tf.sequence_mask( self.choice_lengths, choice_len, 
dtype=tf.float32) # [batch_size, question_len] # ======Highway layer====== if with_highway: with tf.variable_scope("input_highway"): in_question_repres = match_utils.multi_highway_layer( in_question_repres, input_dim, highway_layer_num) tf.get_variable_scope().reuse_variables() in_passage_repres = match_utils.multi_highway_layer( in_passage_repres, input_dim, highway_layer_num) tf.get_variable_scope().reuse_variables() in_choice_repres = match_utils.multi_highway_layer( in_choice_repres, input_dim, highway_layer_num) # ========Bilateral Matching===== # if verbose: if matching_option == 7: ret_list = gated_trilateral_match( in_question_repres, in_passage_repres, in_choice_repres, self.question_lengths, self.passage_lengths, self.choice_lengths, question_mask, mask, choice_mask, self.concat_idx_mat, self.split_idx_mat_q, self.split_idx_mat_c, MP_dim, input_dim, context_layer_num, context_lstm_dim, self.is_training, dropout_rate, with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num, with_aggregation_highway, with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match, concat_context=concat_context, tied_aggre=tied_aggre, rl_matches=rl_matches, cond_training=cond_training, efficient=efficient, tied_match=tied_match, construct_memory=reasonet_training, debug=verbose) all_match_templates, match_dim, gate_input = ret_list[0:3] if verbose: self.matching_vectors = ret_list[-1] self.matching_vectors.append(gate_input) if reasonet_training: memory = ret_list[3] # tiled_memory_mask=ret_list[4] else: ret_list = match_utils.trilateral_match( in_question_repres, in_passage_repres, in_choice_repres, self.question_lengths, self.passage_lengths, self.choice_lengths, question_mask, mask, choice_mask, MP_dim, input_dim, context_layer_num, context_lstm_dim, self.is_training, dropout_rate, with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num, with_aggregation_highway, with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match, match_to_passage, match_to_question, match_to_choice, with_no_match, debug=verbose, matching_option=matching_option) match_representation, match_dim = ret_list[0:2] if verbose: self.matching_vectors = ret_list[-1] print('check: match_dim=', match_dim) # ========Prediction Layer========= with tf.variable_scope('prediction_layer'): w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32) b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32) if use_options: w_1 = tf.get_variable("w_1", [match_dim / 2, 1], dtype=tf.float32) b_1 = tf.get_variable("b_1", [1], dtype=tf.float32) else: w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32) b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32) if matching_option == 7: with tf.variable_scope('rl_decision_gate'): if use_options and (not efficient): gate_input = gate_input[::num_options, :] w_gate = tf.get_variable('w_gate', [ 2 * context_layer_num * context_lstm_dim, len(rl_matches) ], dtype=tf.float32) b_gate = tf.get_variable('b_gate', [len(rl_matches)], dtype=tf.float32) gate_logits = tf.matmul(gate_input, w_gate) + b_gate gate_prob = tf.nn.softmax( gate_logits) # [batch_size/4, num_match] gate_log_prob = tf.nn.log_softmax( gate_logits) # [batch_size/4, num_match] if not reasonet_training: sliced_gate_probs = tf.split(gate_prob, len(rl_matches), axis=1) sliced_gate_log_probs = tf.split(gate_log_prob, len(rl_matches), axis=1) # if use_options: # tile_times=tf.constant([1,num_options]) 
# else: # tile_times=tf.constant([1,num_classes]) self.gate_prob = gate_prob self.gate_log_prob = gate_log_prob weighted_probs = [] weighted_log_probs = [] all_probs = [] layout = 'question_first' if efficient else 'choice_first' for mid, matcher in enumerate(all_match_templates): matcher.add_softmax_pred(w_0, b_0, w_1, b_1, self.is_training, dropout_rate, use_options, num_options, layout=layout) all_probs.append(matcher.prob) weighted_probs.append( tf.multiply(matcher.prob, sliced_gate_probs[mid])) weighted_log_probs.append( tf.add(matcher.log_prob, sliced_gate_log_probs[mid])) if verbose: self.all_probs = tf.stack(all_probs, axis=0) weighted_log_probs = tf.stack(weighted_log_probs, axis=0) self.weighted_log_probs = weighted_log_probs self.prob = tf.add_n(weighted_probs) weighted_probs = tf.stack(weighted_probs, axis=0) else: self.gate_prob = gate_prob self.gate_log_prob = gate_log_prob # assert efficient with tf.variable_scope('reasonet'): reasonet_module = ReasoNetModule( reasonet_steps, num_options, match_dim, memory.aggregation_dim, reasonet_hidden_dim, reasonet_lambda, memory_max_len=passage_len, terminate_mode=reasonet_terminate_mode, keep_first=reasonet_keep_first, logit_combine=reasonet_logit_combine) all_log_probs, all_states = reasonet_module.multiread_matching( all_match_templates, memory) # [num_steps , num_matchers, batch_size/4], [num_steps * num_matchers * batch_size, state_dim] if verbose: self.matching_vectors.append(all_states) for matcher in all_match_templates: self.matching_vectors.append( matcher.aggregation_representation) # if verbose: # self.matching_vectors+=reasonet_module.test_vectors self.rn_log_probs = all_log_probs num_matcher = len(rl_matches) total_num_gates = num_matcher * reasonet_calculated_steps # all_log_probs=tf.reshape(all_log_probs,[reasonet_calculated_steps, num_matcher,-1]) # [num_steps, num_matcher, batch_size/4] print('gate_log_prob:', gate_log_prob.get_shape()) print('all_log_probs:', all_log_probs.get_shape()) final_log_probs = tf.reshape( tf.transpose(gate_log_prob) + all_log_probs, [total_num_gates, -1]) #[num_gates, batch_size/4] self.final_log_probs = final_log_probs layout = 'question_first' if efficient else 'choice_first' gate_log_predictions = match_utils.softmax_pred( all_states, w_0, b_0, w_1, b_1, self.is_training, dropout_rate, use_options, num_options, cond_training, layout=layout, num_gates=total_num_gates ) # [num_gates * batch_size/4, num_options] # gate_log_predictions=tf.reshape(gate_log_predictions, [total_num_gates, -1, num_options]) # [num_gates, batch_size/4, num_options] if verbose: for matcher in all_match_templates: matcher.add_softmax_pred(w_0, b_0, w_1, b_1, self.is_training, dropout_rate, use_options, num_options, layout=layout) self.matching_vectors.append(matcher.log_prob) if verbose: self.all_probs = gate_log_predictions weighted_log_probs = tf.expand_dims( final_log_probs, axis=2 ) + gate_log_predictions # [num_gates, batch_size/4, num_options] self.weighted_log_probs = weighted_log_probs weighted_probs = tf.exp( weighted_log_probs ) # [num_gates, batch_size/4, num_options] self.prob = tf.reduce_sum( weighted_probs, axis=0) # [batch_size, num_options] print('finished probs') if use_options: if efficient: gold_matrix = tf.transpose( tf.reshape( self.truth, [num_options, -1])) # [batch_size, num_options] else: gold_matrix = tf.reshape( self.truth, [-1, num_options]) # [batch_size, num_options] gold_matrix = tf.cast(gold_matrix, tf.float32) self.gold_matrix = gold_matrix correct = tf.equal(tf.argmax(self.prob, 1), 
tf.argmax(gold_matrix, 1)) else: gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) # gold_matrix = tf.one_hot(self.truth, num_classes) correct = tf.nn.in_top_k(self.prob, self.truth, 1) self.correct = correct self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) self.predictions = tf.arg_max(self.prob, 1) if rl_training_method == 'soft_voting': self.log_prob = tf.reduce_logsumexp( weighted_log_probs, axis=0) # [batch_size, num_options] self.loss = tf.negative( tf.reduce_mean( tf.reduce_sum(tf.multiply(gold_matrix, self.log_prob), axis=1))) elif rl_training_method == 'contrastive' or rl_training_method == 'contrastive_imp': reward_matrix = gold_matrix # [batch_size, num_options] baseline = tf.reduce_sum(tf.multiply(weighted_probs, reward_matrix), axis=[0, 2], keep_dims=True) # [batch_size] if rl_training_method == 'contrastive': normalized_reward = reward_matrix - baseline # [batch_size, num_options] else: normalized_reward = tf.divide( reward_matrix, baseline) - 1 # [batch_size, num_options] log_coeffs = tf.multiply(weighted_probs, normalized_reward) log_coeffs = tf.stop_gradient(log_coeffs) self.log_coeffs = log_coeffs self.weighted_log_probs = weighted_log_probs self.loss = tf.negative( tf.reduce_mean( tf.reduce_sum(tf.multiply(weighted_log_probs, log_coeffs), axis=[0, 2]))) else: logits = tf.matmul(match_representation, w_0) + b_0 logits = tf.tanh(logits) if cond_training: logits = match_utils.apply_dropout(logits, self.is_training, dropout_rate) elif is_training: logits = tf.nn.dropout(logits, (1 - dropout_rate)) else: logits = tf.multiply(logits, (1 - dropout_rate)) logits = tf.matmul(logits, w_1) + b_1 self.final_logits = logits if use_options: if efficient: logits = tf.transpose(tf.reshape(logits, [num_options, -1])) gold_matrix = tf.transpose( tf.reshape(self.truth, [num_options, -1])) else: logits = tf.reshape(logits, [-1, num_options]) gold_matrix = tf.reshape(self.truth, [-1, num_options]) self.prob = tf.nn.softmax(logits) # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example') # self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy') # gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) # gold_matrix = tf.one_hot(self.truth, num_classes) self.loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits=logits, labels=gold_matrix)) # correct = tf.nn.in_top_k(logits, self.truth, 1) # self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) correct = tf.equal(tf.argmax(logits, 1), tf.argmax(gold_matrix, 1)) self.gold_matrix = gold_matrix self.correct = correct else: self.prob = tf.nn.softmax(logits) # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example') # self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy') gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) # gold_matrix = tf.one_hot(self.truth, num_classes) self.loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits=logits, labels=gold_matrix)) correct = tf.nn.in_top_k(logits, self.truth, 1) self.correct = correct self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) self.predictions = tf.arg_max(self.prob, 1) if optimize_type == 'adadelta': clipper = 50 optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n( [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = 
self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(list(zip(grads, tvars))) elif optimize_type == 'sgd': self.global_step = tf.Variable( 0, name='global_step', trainable=False) # Create a variable to track the global step. min_lr = 0.000001 self._lr_rate = tf.maximum( min_lr, tf.train.exponential_decay(learning_rate, self.global_step, 30000, 0.98)) self.train_op = tf.train.GradientDescentOptimizer( learning_rate=self._lr_rate).minimize(self.loss) elif optimize_type == 'ema': tvars = tf.trainable_variables() train_op = tf.train.AdamOptimizer( learning_rate=learning_rate).minimize(self.loss) # Create an ExponentialMovingAverage object ema = tf.train.ExponentialMovingAverage(decay=0.9999) # Create the shadow variables, and add ops to maintain moving averages # of var0 and var1. maintain_averages_op = ema.apply(tvars) # Create an op that will update the moving averages after each training # step. This is what we will use in place of the usual training op. with tf.control_dependencies([train_op]): self.train_op = tf.group(maintain_averages_op) elif optimize_type == 'adam': clipper = 50 optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n( [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(list(zip(grads, tvars))) extra_train_ops = [] train_ops = [self.train_op] + extra_train_ops self.train_op = tf.group(*train_ops) with tf.name_scope('summary'): self.loss_summary = tf.summary.scalar('loss', self.loss) self.acc_summary = tf.summary.scalar('accuracy', self.eval_correct)
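# --- Illustrative sketch (not part of the model code) ------------------------
# For the 'contrastive' / 'contrastive_imp' training branch above, the reward
# normalization can be summarized outside the graph: the baseline is the total
# probability mass currently assigned to the gold option (summed over gates),
# each gate/option term is weighted by (reward - baseline), or by
# (reward / baseline - 1) in the "improved" variant, and that coefficient is
# treated as a constant (the tf.stop_gradient call). A NumPy sketch of
# computing the resulting loss value for given probabilities:
import numpy as np

def contrastive_loss(weighted_log_probs, gold_matrix, improved=False):
    """weighted_log_probs: [num_gates, batch, num_options] (log of gate prob * answer prob).
    gold_matrix:          [batch, num_options] one-hot gold answers."""
    weighted_probs = np.exp(weighted_log_probs)
    reward = gold_matrix[None, :, :]                       # broadcast over gates
    # probability mass currently on the gold option, per example
    baseline = (weighted_probs * reward).sum(axis=(0, 2), keepdims=True)
    normalized = reward / baseline - 1.0 if improved else reward - baseline
    coeffs = weighted_probs * normalized                   # constants (stop_gradient)
    return -np.mean((weighted_log_probs * coeffs).sum(axis=(0, 2)))

# toy usage: 2 gates, batch of 3, 4 options; probs normalized over gates*options
rng = np.random.default_rng(0)
logits = rng.normal(size=(2, 3, 4))
log_p = logits - np.log(np.exp(logits).sum(axis=(0, 2), keepdims=True))
gold = np.eye(4)[[0, 2, 1]]
print(contrastive_loss(log_p, gold))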
def __init__(self, num_classes, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None, dropout_rate=0.5, learning_rate=0.001, optimize_type='adam', lambda_l2=1e-5, with_word=True, with_char=True, with_POS=True, with_NER=True, char_lstm_dim=20, context_lstm_dim=100, aggregation_lstm_dim=200, is_training=True, filter_layer_threshold=0.2, MP_dim=50, context_layer_num=1, aggregation_layer_num=1, fix_word_vec=False, with_filter_layer=True, with_highway=False, with_lex_features=False, lex_dim=100, word_level_MP_dim=-1, sep_endpoint=False, end_model_combine=False, with_match_highway=False, with_aggregation_highway=False, highway_layer_num=1, with_lex_decomposition=False, lex_decompsition_dim=-1, with_left_match=True, with_right_match=True, with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True, with_dep=True): # ======word representation layer====== in_question_repres = [] # premise in_question_dep_cons = [] # premise dependency connections in_passage_repres = [] # hypothesis in_passage_dep_cons = [] # hypothesis dependency connections self.question_lengths = tf.placeholder(tf.int32, [None]) self.passage_lengths = tf.placeholder(tf.int32, [None]) self.truth = tf.placeholder(tf.int32, [None]) # [batch_size] input_dim = 0 # word embedding if with_word and word_vocab is not None: self.in_question_words = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_words = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] # self.word_embedding = tf.get_variable("word_embedding", shape=[word_vocab.size()+1, word_vocab.word_dim], initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) word_vec_trainable = True cur_device = '/gpu:0' if fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.device(cur_device): self.word_embedding = tf.get_variable( "word_embedding", trainable=word_vec_trainable, initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) # in_question_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] in_passage_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] #print (in_question_word_repres) in_question_repres.append(in_question_word_repres) in_passage_repres.append(in_passage_word_repres) input_shape = tf.shape(self.in_question_words) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_words) passage_len = input_shape[1] input_dim += word_vocab.word_dim if with_dep: self.in_question_dependency = tf.placeholder( tf.float32, [None, None, word_vocab.parser.typesize ]) # [batch_size, question_len, dep_dim] self.in_passage_dependency = tf.placeholder( tf.float32, [None, None, word_vocab.parser.typesize ]) # [batch_size, passage_len, dep_dim] self.in_question_dep_con = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_dep_con = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] #dependency representation is the same as data input in_question_dep_repres = self.in_question_dependency in_passage_dep_repres = self.in_passage_dependency in_question_repres.append(in_question_dep_repres) in_passage_repres.append(in_passage_dep_repres) input_dim += word_vocab.parser.typesize # dependency_dim # embedding dependency later here #get dependency connections, do smth here? 
otherwise just pass self.in_question_dep_con to matching function in_question_dep_cons = self.in_question_dep_con in_passage_dep_cons = self.in_passage_dep_con #if with_image: # self. if with_POS and POS_vocab is not None: self.in_question_POSs = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_POSs = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] #self.POS_embedding = tf.get_variable("POS_embedding", shape=[POS_vocab.size()+1, POS_vocab.word_dim], initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32) self.POS_embedding = tf.get_variable("POS_embedding", initializer=tf.constant( POS_vocab.word_vecs), dtype=tf.float32) in_question_POS_repres = tf.nn.embedding_lookup( self.POS_embedding, self.in_question_POSs) # [batch_size, question_len, POS_dim] in_passage_POS_repres = tf.nn.embedding_lookup( self.POS_embedding, self.in_passage_POSs) # [batch_size, passage_len, POS_dim] in_question_repres.append(in_question_POS_repres) in_passage_repres.append(in_passage_POS_repres) input_shape = tf.shape(self.in_question_POSs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_POSs) passage_len = input_shape[1] input_dim += POS_vocab.word_dim if with_NER and NER_vocab is not None: self.in_question_NERs = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_NERs = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] #self.NER_embedding = tf.get_variable("NER_embedding", shape=[NER_vocab.size()+1, NER_vocab.word_dim], initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32) self.NER_embedding = tf.get_variable("NER_embedding", initializer=tf.constant( NER_vocab.word_vecs), dtype=tf.float32) in_question_NER_repres = tf.nn.embedding_lookup( self.NER_embedding, self.in_question_NERs) # [batch_size, question_len, NER_dim] in_passage_NER_repres = tf.nn.embedding_lookup( self.NER_embedding, self.in_passage_NERs) # [batch_size, passage_len, NER_dim] in_question_repres.append(in_question_NER_repres) in_passage_repres.append(in_passage_NER_repres) input_shape = tf.shape(self.in_question_NERs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_NERs) passage_len = input_shape[1] input_dim += NER_vocab.word_dim if with_char and char_vocab is not None: self.question_char_lengths = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.passage_char_lengths = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] self.in_question_chars = tf.placeholder( tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len] self.in_passage_chars = tf.placeholder( tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len] input_shape = tf.shape(self.in_question_chars) batch_size = input_shape[0] question_len = input_shape[1] q_char_len = input_shape[2] input_shape = tf.shape(self.in_passage_chars) passage_len = input_shape[1] p_char_len = input_shape[2] char_dim = char_vocab.word_dim # self.char_embedding = tf.get_variable("char_embedding", shape=[char_vocab.size()+1, char_vocab.word_dim], initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32) self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant( char_vocab.word_vecs), dtype=tf.float32) in_question_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_question_chars ) # [batch_size, question_len, q_char_len, char_dim] in_question_char_repres = tf.reshape( 
in_question_char_repres, shape=[-1, q_char_len, char_dim]) question_char_lengths = tf.reshape(self.question_char_lengths, [-1]) in_passage_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_passage_chars ) # [batch_size, passage_len, p_char_len, char_dim] in_passage_char_repres = tf.reshape( in_passage_char_repres, shape=[-1, p_char_len, char_dim]) passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1]) with tf.variable_scope('char_lstm'): # lstm cell char_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(char_lstm_dim) # dropout if is_training: char_lstm_cell = tf.nn.rnn_cell.DropoutWrapper( char_lstm_cell, output_keep_prob=(1 - dropout_rate)) char_lstm_cell = tf.nn.rnn_cell.MultiRNNCell([char_lstm_cell]) # question_representation question_char_outputs = my_rnn.dynamic_rnn( char_lstm_cell, in_question_char_repres, sequence_length=question_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] question_char_outputs = question_char_outputs[:, -1, :] question_char_outputs = tf.reshape( question_char_outputs, [batch_size, question_len, char_lstm_dim]) tf.get_variable_scope().reuse_variables() # passage representation passage_char_outputs = my_rnn.dynamic_rnn( char_lstm_cell, in_passage_char_repres, sequence_length=passage_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] passage_char_outputs = passage_char_outputs[:, -1, :] passage_char_outputs = tf.reshape( passage_char_outputs, [batch_size, passage_len, char_lstm_dim]) in_question_repres.append(question_char_outputs) in_passage_repres.append(passage_char_outputs) input_dim += char_lstm_dim #print('\n\n\n') #print (in_question_repres) #print('\n\n\n') in_question_repres = tf.concat( 2, in_question_repres) # [batch_size, question_len, dim] in_passage_repres = tf.concat( 2, in_passage_repres) # [batch_size, passage_len, dim] if is_training: in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate)) in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate)) else: in_question_repres = tf.mul(in_question_repres, (1 - dropout_rate)) in_passage_repres = tf.mul(in_passage_repres, (1 - dropout_rate)) mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] question_mask = tf.sequence_mask( self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] # ======Highway layer====== if with_highway: with tf.variable_scope("input_highway"): in_question_repres = match_utils.multi_highway_layer( in_question_repres, input_dim, highway_layer_num) tf.get_variable_scope().reuse_variables() in_passage_repres = match_utils.multi_highway_layer( in_passage_repres, input_dim, highway_layer_num) # ========Bilateral Matching===== (match_representation, match_dim) = match_utils.bilateral_match_func2( in_question_repres, in_passage_repres, in_question_dep_cons, in_passage_dep_cons, self.question_lengths, self.passage_lengths, question_mask, mask, MP_dim, input_dim, with_filter_layer, context_layer_num, context_lstm_dim, is_training, dropout_rate, with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num, with_aggregation_highway, with_lex_decomposition, lex_decompsition_dim, with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match, with_left_match, with_right_match, with_dep=with_dep) #========Prediction Layer========= w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32) b_0 = tf.get_variable("b_0", 
[match_dim / 2], dtype=tf.float32) w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32) b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32) logits = tf.matmul(match_representation, w_0) + b_0 logits = tf.tanh(logits) if is_training: logits = tf.nn.dropout(logits, (1 - dropout_rate)) else: logits = tf.mul(logits, (1 - dropout_rate)) logits = tf.matmul(logits, w_1) + b_1 self.prob = tf.nn.softmax(logits) # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example') # self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy') gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) # gold_matrix = tf.one_hot(self.truth, num_classes) self.loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits, gold_matrix)) correct = tf.nn.in_top_k(logits, self.truth, 1) self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) self.predictions = tf.arg_max(self.prob, 1) if optimize_type == 'adadelta': clipper = 50 optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n( [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(zip(grads, tvars)) elif optimize_type == 'sgd': self.global_step = tf.Variable( 0, name='global_step', trainable=False) # Create a variable to track the global step. min_lr = 0.000001 self._lr_rate = tf.maximum( min_lr, tf.train.exponential_decay(learning_rate, self.global_step, 30000, 0.98)) self.train_op = tf.train.GradientDescentOptimizer( learning_rate=self._lr_rate).minimize(self.loss) elif optimize_type == 'ema': tvars = tf.trainable_variables() train_op = tf.train.AdamOptimizer( learning_rate=learning_rate).minimize(self.loss) # Create an ExponentialMovingAverage object ema = tf.train.ExponentialMovingAverage(decay=0.9999) # Create the shadow variables, and add ops to maintain moving averages # of var0 and var1. maintain_averages_op = ema.apply(tvars) # Create an op that will update the moving averages after each training # step. This is what we will use in place of the usual training op. with tf.control_dependencies([train_op]): self.train_op = tf.group(maintain_averages_op) elif optimize_type == 'adam': clipper = 50 optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n( [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(zip(grads, tvars)) extra_train_ops = [] train_ops = [self.train_op] + extra_train_ops self.train_op = tf.group(*train_ops)
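# --- Illustrative sketch (not part of the model code) ------------------------
# Both the 'adadelta' and 'adam' branches above add an L2 penalty over weight
# matrices only (variables with more than one dimension) and clip gradients by
# global norm with clipper = 50 before applying them. Global-norm clipping
# rescales every gradient by clip_norm / max(global_norm, clip_norm), where the
# global norm is the square root of the summed squared norms. A NumPy sketch of
# those two pieces:
import numpy as np

def l2_penalty(variables, lambda_l2):
    """Sum of 0.5 * ||v||^2 over weight matrices only (ndim > 1), scaled by lambda_l2."""
    return lambda_l2 * sum(0.5 * np.sum(v ** 2) for v in variables if v.ndim > 1)

def clip_by_global_norm(grads, clip_norm):
    """Rescale all gradients together so their joint norm is at most clip_norm."""
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads], global_norm

# toy usage
rng = np.random.default_rng(0)
params = [rng.normal(size=(4, 3)), rng.normal(size=3)]   # a weight matrix and a bias
grads = [rng.normal(size=(4, 3)) * 100, rng.normal(size=3) * 100]
clipped, norm = clip_by_global_norm(grads, clip_norm=50)
print(l2_penalty(params, 1e-5), norm,
      np.sqrt(sum(np.sum(g ** 2) for g in clipped)))  # clipped norm <= 50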
def encode(self, is_training=True): options = self.options # ======word representation layer====== in_question_repres = [] in_passage_repres = [] input_dim = 0 if options.with_word and self.word_vocab is not None: word_vec_trainable = True cur_device = '/gpu:0' if options.fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.variable_scope("embedding"), tf.device(cur_device): self.word_embedding = tf.get_variable( "word_embedding", trainable=word_vec_trainable, initializer=tf.constant(self.word_vocab.word_vecs), dtype=tf.float32) in_question_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] in_passage_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] in_question_repres.append(in_question_word_repres) in_passage_repres.append(in_passage_word_repres) input_shape = tf.shape(self.in_question_words) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_words) passage_len = input_shape[1] input_dim += self.word_vocab.word_dim if options.with_char and self.char_vocab is not None: input_shape = tf.shape(self.in_question_chars) batch_size = input_shape[0] question_len = input_shape[1] q_char_len = input_shape[2] input_shape = tf.shape(self.in_passage_chars) passage_len = input_shape[1] p_char_len = input_shape[2] char_dim = self.char_vocab.word_dim self.char_embedding = tf.get_variable( "char_embedding", initializer=tf.constant(self.char_vocab.word_vecs), dtype=tf.float32) in_question_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_question_chars ) # [batch_size, question_len, q_char_len, char_dim] in_question_char_repres = tf.reshape( in_question_char_repres, shape=[-1, q_char_len, char_dim]) question_char_lengths = tf.reshape(self.question_char_lengths, [-1]) in_passage_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_passage_chars ) # [batch_size, passage_len, p_char_len, char_dim] in_passage_char_repres = tf.reshape( in_passage_char_repres, shape=[-1, p_char_len, char_dim]) passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1]) with tf.variable_scope('char_lstm'): # lstm cell char_lstm_cell = tf.contrib.rnn.BasicLSTMCell( options.char_lstm_dim) # dropout if is_training: char_lstm_cell = tf.contrib.rnn.DropoutWrapper( char_lstm_cell, output_keep_prob=(1 - options.dropout_rate)) char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell]) # question_representation question_char_outputs = tf.nn.dynamic_rnn( char_lstm_cell, in_question_char_repres, sequence_length=question_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] question_char_outputs = question_char_outputs[:, -1, :] question_char_outputs = tf.reshape( question_char_outputs, [batch_size, question_len, options.char_lstm_dim]) tf.get_variable_scope().reuse_variables() # passage representation passage_char_outputs = tf.nn.dynamic_rnn( char_lstm_cell, in_passage_char_repres, sequence_length=passage_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] passage_char_outputs = passage_char_outputs[:, -1, :] passage_char_outputs = tf.reshape( passage_char_outputs, [batch_size, passage_len, options.char_lstm_dim]) in_question_repres.append(question_char_outputs) in_passage_repres.append(passage_char_outputs) input_dim += options.char_lstm_dim if options.with_POS and self.POS_vocab is not None: self.POS_embedding = 
tf.get_variable("POS_embedding", initializer=tf.constant( self.POS_vocab.word_vecs), dtype=tf.float32) in_question_POS_repres = tf.nn.embedding_lookup( self.POS_embedding, self.in_question_POSs) # [batch_size, question_len, POS_dim] in_passage_POS_repres = tf.nn.embedding_lookup( self.POS_embedding, self.in_passage_POSs) # [batch_size, passage_len, POS_dim] in_question_repres.append(in_question_POS_repres) in_passage_repres.append(in_passage_POS_repres) input_shape = tf.shape(self.in_question_POSs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_POSs) passage_len = input_shape[1] input_dim += self.POS_vocab.word_dim if options.with_NER and self.NER_vocab is not None: self.NER_embedding = tf.get_variable("NER_embedding", initializer=tf.constant( self.NER_vocab.word_vecs), dtype=tf.float32) in_question_NER_repres = tf.nn.embedding_lookup( self.NER_embedding, self.in_question_NERs) # [batch_size, question_len, NER_dim] in_passage_NER_repres = tf.nn.embedding_lookup( self.NER_embedding, self.in_passage_NERs) # [batch_size, passage_len, NER_dim] in_question_repres.append(in_question_NER_repres) in_passage_repres.append(in_passage_NER_repres) input_shape = tf.shape(self.in_question_NERs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_NERs) passage_len = input_shape[1] input_dim += self.NER_vocab.word_dim in_question_repres = tf.concat(in_question_repres, 2) # [batch_size, question_len, dim] in_passage_repres = tf.concat(in_passage_repres, 2) # [batch_size, passage_len, dim] if options.compress_input: # compress input word vector into smaller vectors w_compress = tf.get_variable( "w_compress_input", [input_dim, options.compress_input_dim], dtype=tf.float32) b_compress = tf.get_variable("b_compress_input", [options.compress_input_dim], dtype=tf.float32) in_question_repres = tf.reshape(in_question_repres, [-1, input_dim]) in_question_repres = tf.matmul(in_question_repres, w_compress) + b_compress in_question_repres = tf.tanh(in_question_repres) in_question_repres = tf.reshape( in_question_repres, [batch_size, question_len, options.compress_input_dim]) in_passage_repres = tf.reshape(in_passage_repres, [-1, input_dim]) in_passage_repres = tf.matmul(in_passage_repres, w_compress) + b_compress in_passage_repres = tf.tanh(in_passage_repres) in_passage_repres = tf.reshape( in_passage_repres, [batch_size, passage_len, options.compress_input_dim]) input_dim = options.compress_input_dim if is_training: in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate)) in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate)) else: in_question_repres = tf.multiply(in_question_repres, (1 - options.dropout_rate)) in_passage_repres = tf.multiply(in_passage_repres, (1 - options.dropout_rate)) passage_mask = tf.sequence_mask( self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] question_mask = tf.sequence_mask( self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] # ======Highway layer====== if options.with_highway: with tf.variable_scope("input_highway"): in_question_repres = match_utils.multi_highway_layer( in_question_repres, input_dim, options.highway_layer_num) tf.get_variable_scope().reuse_variables() in_passage_repres = match_utils.multi_highway_layer( in_passage_repres, input_dim, options.highway_layer_num) # ======Filter layer====== cosine_matrix = match_utils.cal_relevancy_matrix( in_question_repres, 
in_passage_repres) cosine_matrix = match_utils.mask_relevancy_matrix( cosine_matrix, question_mask, passage_mask) # relevancy_matrix = tf.select(tf.greater(cosine_matrix, # tf.scalar_mul(filter_layer_threshold, tf.ones_like(cosine_matrix, dtype=tf.float32))), # cosine_matrix, tf.zeros_like(cosine_matrix, dtype=tf.float32)) # [batch_size, passage_len, question_len] raw_in_passage_repres = in_passage_repres if options.with_filter_layer: relevancy_matrix = cosine_matrix # [batch_size, passage_len, question_len] relevancy_degrees = tf.reduce_max( relevancy_matrix, axis=2) # [batch_size, passage_len] relevancy_degrees = tf.expand_dims( relevancy_degrees, axis=-1) # [batch_size, passage_len, 'x'] in_passage_repres = tf.multiply(in_passage_repres, relevancy_degrees) # =======Context Representation Layer & Multi-Perspective matching layer===== all_question_aware_representatins = [] question_aware_dim = 0 if options.with_word_match: with tf.variable_scope('word_level_matching'): (word_match_vectors, word_match_dim) = match_utils.match_passage_with_question( raw_in_passage_repres, None, passage_mask, in_question_repres, None, question_mask, input_dim, with_full_matching=False, with_attentive_matching=options.with_attentive_matching, with_max_attentive_matching=options. with_max_attentive_matching, with_maxpooling_matching=options.with_maxpooling_matching, with_local_attentive_matching=options. with_local_attentive_matching, win_size=options.win_size, with_forward_match=True, with_backward_match=False, match_options=options) all_question_aware_representatins.extend(word_match_vectors) question_aware_dim += word_match_dim # lex decomposition if options.with_lex_decomposition: lex_decomposition = match_utils.cal_linear_decomposition_representation( raw_in_passage_repres, self.passage_lengths, cosine_matrix, is_training, options.lex_decompsition_dim, options.dropout_rate) all_question_aware_representatins.append(lex_decomposition) if options.lex_decompsition_dim == -1: question_aware_dim += 2 * input_dim else: question_aware_dim += 2 * options.lex_decompsition_dim if options.with_question_passage_word_feature: all_question_aware_representatins.append(raw_in_passage_repres) att_question_representation = match_utils.calculate_cosine_weighted_question_representation( in_question_repres, cosine_matrix) all_question_aware_representatins.append( att_question_representation) question_aware_dim += 2 * input_dim # sequential context matching question_forward = None question_backward = None passage_forward = None passage_backward = None if options.with_sequential_match: with tf.variable_scope('context_MP_matching'): cur_in_question_repres = in_question_repres cur_in_passage_repres = in_passage_repres for i in xrange(options.context_layer_num): with tf.variable_scope('layer-{}'.format(i)): with tf.variable_scope('context_represent'): # parameters context_lstm_cell_fw = tf.contrib.rnn.LSTMCell( options.context_lstm_dim) context_lstm_cell_bw = tf.contrib.rnn.LSTMCell( options.context_lstm_dim) if is_training: context_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper( context_lstm_cell_fw, output_keep_prob=(1 - options.dropout_rate)) context_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper( context_lstm_cell_bw, output_keep_prob=(1 - options.dropout_rate)) # question representation ((question_context_representation_fw, question_context_representation_bw), (question_forward, question_backward )) = tf.nn.bidirectional_dynamic_rnn( context_lstm_cell_fw, context_lstm_cell_bw, cur_in_question_repres, dtype=tf.float32, 
sequence_length=self.question_lengths ) # [batch_size, question_len, context_lstm_dim] cur_in_question_repres = tf.concat([ question_context_representation_fw, question_context_representation_bw ], 2) # passage representation tf.get_variable_scope().reuse_variables() ((passage_context_representation_fw, passage_context_representation_bw), (passage_forward, passage_backward )) = tf.nn.bidirectional_dynamic_rnn( context_lstm_cell_fw, context_lstm_cell_bw, cur_in_passage_repres, dtype=tf.float32, sequence_length=self.passage_lengths ) # [batch_size, passage_len, context_lstm_dim] cur_in_passage_repres = tf.concat([ passage_context_representation_fw, passage_context_representation_bw ], 2) # Multi-perspective matching with tf.variable_scope('MP_matching'): (matching_vectors, matching_dim ) = match_utils.match_passage_with_question( passage_context_representation_fw, passage_context_representation_bw, passage_mask, question_context_representation_fw, question_context_representation_bw, question_mask, options.context_lstm_dim, with_full_matching=options.with_full_matching, with_attentive_matching=options. with_attentive_matching, with_max_attentive_matching=options. with_max_attentive_matching, with_maxpooling_matching=options. with_maxpooling_matching, with_local_attentive_matching=options. with_local_attentive_matching, win_size=options.win_size, with_forward_match=options.with_forward_match, with_backward_match=options. with_backward_match, match_options=options) all_question_aware_representatins.extend( matching_vectors) question_aware_dim += matching_dim all_question_aware_representatins = tf.concat( all_question_aware_representatins, 2) # [batch_size, passage_len, dim] if is_training: all_question_aware_representatins = tf.nn.dropout( all_question_aware_representatins, (1 - options.dropout_rate)) else: all_question_aware_representatins = tf.multiply( all_question_aware_representatins, (1 - options.dropout_rate)) # ======Highway layer====== if options.with_match_highway: with tf.variable_scope("matching_highway"): all_question_aware_representatins = match_utils.multi_highway_layer( all_question_aware_representatins, question_aware_dim, options.highway_layer_num) #========Aggregation Layer====== if not options.with_aggregation: aggregation_representation = all_question_aware_representatins aggregation_dim = question_aware_dim else: aggregation_representation = [] aggregation_dim = 0 aggregation_input = all_question_aware_representatins with tf.variable_scope('aggregation_layer'): for i in xrange(options.aggregation_layer_num): with tf.variable_scope('layer-{}'.format(i)): aggregation_lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell( options.aggregation_lstm_dim) aggregation_lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell( options.aggregation_lstm_dim) if is_training: aggregation_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper( aggregation_lstm_cell_fw, output_keep_prob=(1 - options.dropout_rate)) aggregation_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper( aggregation_lstm_cell_bw, output_keep_prob=(1 - options.dropout_rate)) aggregation_lstm_cell_fw = tf.contrib.rnn.MultiRNNCell( [aggregation_lstm_cell_fw]) aggregation_lstm_cell_bw = tf.contrib.rnn.MultiRNNCell( [aggregation_lstm_cell_bw]) cur_aggregation_representation, _ = rnn.bidirectional_dynamic_rnn( aggregation_lstm_cell_fw, aggregation_lstm_cell_bw, aggregation_input, dtype=tf.float32, sequence_length=self.passage_lengths) cur_aggregation_representation = tf.concat( cur_aggregation_representation, 2 ) # [batch_size, passage_len, 
2*aggregation_lstm_dim] aggregation_representation.append( cur_aggregation_representation) aggregation_dim += 2 * options.aggregation_lstm_dim aggregation_input = cur_aggregation_representation aggregation_representation = tf.concat(aggregation_representation, 2) aggregation_representation = tf.concat([ aggregation_representation, all_question_aware_representatins ], 2) aggregation_dim += question_aware_dim # ======Highway layer====== if options.with_aggregation_highway: with tf.variable_scope("aggregation_highway"): aggregation_representation = match_utils.multi_highway_layer( aggregation_representation, aggregation_dim, options.highway_layer_num) #========output Layer========= encode_size = aggregation_dim + input_dim encode_hiddens = tf.concat( [aggregation_representation, in_passage_repres], 2) # [batch_size, passage_len, enc_size] encode_hiddens = encode_hiddens * tf.expand_dims(passage_mask, axis=-1) # initial state for the LSTM decoder #''' with tf.variable_scope('initial_state_for_decoder'): # Define weights and biases to reduce the cell and reduce the state w_reduce_c = tf.get_variable( 'w_reduce_c', [4 * options.context_lstm_dim, options.gen_hidden_size], dtype=tf.float32) w_reduce_h = tf.get_variable( 'w_reduce_h', [4 * options.context_lstm_dim, options.gen_hidden_size], dtype=tf.float32) bias_reduce_c = tf.get_variable('bias_reduce_c', [options.gen_hidden_size], dtype=tf.float32) bias_reduce_h = tf.get_variable('bias_reduce_h', [options.gen_hidden_size], dtype=tf.float32) old_c = tf.concat(values=[ question_forward.c, question_backward.c, passage_forward.c, passage_backward.c ], axis=1) old_h = tf.concat(values=[ question_forward.h, question_backward.h, passage_forward.h, passage_backward.h ], axis=1) new_c = tf.nn.tanh(tf.matmul(old_c, w_reduce_c) + bias_reduce_c) new_h = tf.nn.tanh(tf.matmul(old_h, w_reduce_h) + bias_reduce_h) init_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h) ''' new_c = tf.zeros([batch_size, options.gen_hidden_size]) new_h = tf.zeros([batch_size, options.gen_hidden_size]) init_state = LSTMStateTuple(new_c, new_h) ''' return (encode_size, encode_hiddens, init_state)
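# --- Illustration (not part of the model code) -------------------------------------
# A minimal NumPy sketch of the "initial_state_for_decoder" reduction above: the final
# forward/backward LSTM states of the question and the passage are concatenated and
# projected down to gen_hidden_size with a tanh layer. The tiny shapes and random
# parameters here are illustrative assumptions, not the trained variables.
import numpy as np

def reduce_states(states_c, states_h, w_reduce_c, w_reduce_h, b_c, b_h):
    """states_c / states_h: lists of [batch, context_lstm_dim] arrays (4 of them here)."""
    old_c = np.concatenate(states_c, axis=1)        # [batch, 4*context_lstm_dim]
    old_h = np.concatenate(states_h, axis=1)
    new_c = np.tanh(old_c @ w_reduce_c + b_c)       # [batch, gen_hidden_size]
    new_h = np.tanh(old_h @ w_reduce_h + b_h)
    return new_c, new_h

batch, ctx_dim, gen_dim = 2, 3, 5
rng = np.random.default_rng(0)
cs = [rng.normal(size=(batch, ctx_dim)) for _ in range(4)]   # q_fw, q_bw, p_fw, p_bw cells
hs = [rng.normal(size=(batch, ctx_dim)) for _ in range(4)]
w_c = rng.normal(size=(4 * ctx_dim, gen_dim))
w_h = rng.normal(size=(4 * ctx_dim, gen_dim))
new_c, new_h = reduce_states(cs, hs, w_c, w_h, np.zeros(gen_dim), np.zeros(gen_dim))
assert new_c.shape == (batch, gen_dim) and new_h.shape == (batch, gen_dim)
# ------------------------------------------------------------------------------------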
def __init__(self, word_vocab=None, edge_label_vocab=None, char_vocab=None, is_training=True, options=None): assert options != None self.passage_nodes_size = tf.placeholder(tf.int32, [None]) # [batch_size] self.passage_nodes = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_nodes_size_max] if options.with_char: self.passage_nodes_chars_size = tf.placeholder( tf.int32, [None, None]) self.passage_nodes_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, passage_nodes_size_max, passage_neighbors_size_max] self.passage_in_neighbor_indices = tf.placeholder( tf.int32, [None, None, None]) self.passage_in_neighbor_hidden_indices = tf.placeholder( tf.int32, [None, None, None]) self.passage_in_neighbor_edges = tf.placeholder( tf.int32, [None, None, None]) self.passage_in_neighbor_mask = tf.placeholder(tf.float32, [None, None, None]) # shapes input_shape = tf.shape(self.passage_in_neighbor_indices) batch_size = input_shape[0] passage_nodes_size_max = input_shape[1] passage_in_neighbors_size_max = input_shape[2] if options.with_char: passage_nodes_chars_size_max = tf.shape( self.passage_nodes_chars)[2] # masks # [batch_size, passage_nodes_size_max] self.passage_nodes_mask = tf.sequence_mask(self.passage_nodes_size, passage_nodes_size_max, dtype=tf.float32) # embeddings word_vec_trainable = True cur_device = '/gpu:0' if options.fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.device(cur_device): self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable, initializer=tf.constant( word_vocab.word_vecs), dtype=tf.float32) self.edge_embedding = tf.get_variable("edge_embedding", initializer=tf.constant( edge_label_vocab.word_vecs), dtype=tf.float32) word_dim = word_vocab.word_dim edge_dim = edge_label_vocab.word_dim if options.with_char: self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant( char_vocab.word_vecs), dtype=tf.float32) char_dim = char_vocab.word_dim # word representation for nodes, where each node only includes one word # [batch_size, passage_nodes_size_max, word_dim] passage_node_representation = tf.nn.embedding_lookup( self.word_embedding, self.passage_nodes) if options.with_char: # [batch_size, passage_nodes_size_max, passage_nodes_chars_size_max, char_dim] passage_nodes_chars_representation = tf.nn.embedding_lookup( self.char_embedding, self.passage_nodes_chars) passage_nodes_chars_representation = tf.reshape( passage_nodes_chars_representation, shape=[ batch_size * passage_nodes_size_max, passage_nodes_chars_size_max, char_dim ]) passage_nodes_chars_size = tf.reshape( self.passage_nodes_chars_size, [batch_size * passage_nodes_size_max]) with tf.variable_scope('node_char_lstm'): node_char_lstm_cell = tf.contrib.rnn.LSTMCell( options.char_lstm_dim) node_char_lstm_cell = tf.contrib.rnn.MultiRNNCell( [node_char_lstm_cell]) # [batch_size*node_num, char_num, char_lstm_dim] node_char_outputs = tf.nn.dynamic_rnn( node_char_lstm_cell, passage_nodes_chars_representation, sequence_length=passage_nodes_chars_size, dtype=tf.float32)[0] node_char_outputs = collect_final_step_lstm( node_char_outputs, passage_nodes_chars_size - 1) # [batch_size, node_num, char_lstm_dim] node_char_outputs = tf.reshape(node_char_outputs, [ batch_size, passage_nodes_size_max, options.char_lstm_dim ]) if options.with_char: input_dim = word_dim + options.char_lstm_dim passage_node_representation = tf.concat( [passage_node_representation, node_char_outputs], 2) else: input_dim = word_dim passage_node_representation = 
passage_node_representation # apply the mask passage_node_representation = passage_node_representation * tf.expand_dims( self.passage_nodes_mask, axis=-1) if options.compress_input: # compress input word vector into smaller vectors w_compress = tf.get_variable( "w_compress_input", [input_dim, options.compress_input_dim], dtype=tf.float32) b_compress = tf.get_variable("b_compress_input", [options.compress_input_dim], dtype=tf.float32) passage_node_representation = tf.reshape( passage_node_representation, [-1, input_dim]) passage_node_representation = tf.matmul( passage_node_representation, w_compress) + b_compress passage_node_representation = tf.tanh(passage_node_representation) passage_node_representation = tf.reshape(passage_node_representation, \ [batch_size, passage_nodes_size_max, options.compress_input_dim]) input_dim = options.compress_input_dim if is_training: passage_node_representation = tf.nn.dropout( passage_node_representation, (1 - options.dropout_rate)) # ======Highway layer====== if options.with_highway: with tf.variable_scope("input_highway"): passage_node_representation = match_utils.multi_highway_layer( passage_node_representation, input_dim, options.highway_layer_num) # =========== in neighbor # [batch_size, passage_len, passage_neighbors_size_max, edge_dim] passage_in_neighbor_edge_representations = tf.nn.embedding_lookup( self.edge_embedding, self.passage_in_neighbor_edges) # [batch_size, passage_len, passage_neighbors_size_max, node_dim] passage_in_neighbor_node_representations = collect_neighbor_node_representations( passage_node_representation, self.passage_in_neighbor_indices) passage_in_neighbor_representations = tf.concat( \ [passage_in_neighbor_node_representations, passage_in_neighbor_edge_representations], 3) passage_in_neighbor_representations = tf.multiply( passage_in_neighbor_representations, tf.expand_dims(self.passage_in_neighbor_mask, axis=-1)) # [batch_size, passage_len, node_dim + edge_dim] passage_in_neighbor_representations = tf.reduce_sum( passage_in_neighbor_representations, axis=2) # =====transform neighbor_representations dag_hidden_dim = options.neighbor_vector_dim w_trans = tf.get_variable("w_trans", [input_dim + edge_dim, dag_hidden_dim], dtype=tf.float32) b_trans = tf.get_variable("b_trans", [dag_hidden_dim], dtype=tf.float32) passage_in_neighbor_representations = tf.reshape( passage_in_neighbor_representations, [-1, input_dim + edge_dim]) passage_in_neighbor_representations = tf.matmul( passage_in_neighbor_representations, w_trans) + b_trans passage_in_neighbor_representations = tf.tanh( passage_in_neighbor_representations) passage_in_neighbor_representations = tf.reshape( passage_in_neighbor_representations, [batch_size, passage_nodes_size_max, dag_hidden_dim]) passage_in_neighbor_representations = tf.multiply( passage_in_neighbor_representations, tf.expand_dims(self.passage_nodes_mask, axis=-1)) with tf.variable_scope('gated_operations'): w_in_ingate = tf.get_variable("w_in_ingate", [dag_hidden_dim, dag_hidden_dim], dtype=tf.float32) u_in_ingate = tf.get_variable("u_in_ingate", [dag_hidden_dim, dag_hidden_dim], dtype=tf.float32) b_ingate = tf.get_variable("b_in_ingate", [dag_hidden_dim], dtype=tf.float32) w_in_forgetgate = tf.get_variable("w_in_forgetgate", [dag_hidden_dim, dag_hidden_dim], dtype=tf.float32) u_in_forgetgate = tf.get_variable("u_in_forgetgate", [dag_hidden_dim, dag_hidden_dim], dtype=tf.float32) b_forgetgate = tf.get_variable("b_in_forgetgate", [dag_hidden_dim], dtype=tf.float32) w_in_outgate = 
tf.get_variable("w_in_outgate", [dag_hidden_dim, dag_hidden_dim], dtype=tf.float32) u_in_outgate = tf.get_variable("u_in_outgate", [dag_hidden_dim, dag_hidden_dim], dtype=tf.float32) b_outgate = tf.get_variable("b_in_outgate", [dag_hidden_dim], dtype=tf.float32) w_in_cell = tf.get_variable("w_in_cell", [dag_hidden_dim, dag_hidden_dim], dtype=tf.float32) u_in_cell = tf.get_variable("u_in_cell", [dag_hidden_dim, dag_hidden_dim], dtype=tf.float32) b_cell = tf.get_variable("b_in_cell", [dag_hidden_dim], dtype=tf.float32) # assume each node has a neighbor vector, and it is None at the beginning passage_node_hidden = tf.zeros([batch_size, 1, dag_hidden_dim]) passage_node_cell = tf.zeros([batch_size, 1, dag_hidden_dim]) idx_var = tf.constant(0) #tf.Variable(0,trainable=False) # body function def _recurrence(passage_node_hidden, passage_node_cell, idx_var): # [batch_size, neighbor_size] prev_mask = tf.gather(self.passage_in_neighbor_mask, idx_var, axis=1) # [batch_size] node_mask = tf.gather(self.passage_nodes_mask, idx_var, axis=1) # [batch_size, neighbor_size] prev_idx = tf.gather(self.passage_in_neighbor_hidden_indices, idx_var, axis=1) # [batch_size, input_dim] prev_input = tf.gather(passage_in_neighbor_representations, idx_var, axis=1) # [batch_size, neighbor_size, dag_hidden_dim] prev_hidden = collect_neighbor_node_representations_2D( passage_node_hidden, prev_idx) prev_hidden = tf.multiply(prev_hidden, tf.expand_dims(prev_mask, axis=-1)) # [batch_size, dag_hidden_dim] prev_hidden = tf.reduce_sum(prev_hidden, axis=1) prev_hidden = tf.multiply(prev_hidden, tf.expand_dims(node_mask, axis=-1)) # [batch_size, neighbor_size, dag_hidden_dim] prev_cell = collect_neighbor_node_representations_2D( passage_node_cell, prev_idx) prev_cell = tf.multiply(prev_cell, tf.expand_dims(prev_mask, axis=-1)) # [batch_size, dag_hidden_dim] prev_cell = tf.reduce_sum(prev_cell, axis=1) prev_cell = tf.multiply(prev_cell, tf.expand_dims(node_mask, axis=-1)) ## ig passage_edge_ingate = tf.sigmoid( tf.matmul(prev_input, w_in_ingate) + tf.matmul(prev_hidden, u_in_ingate) + b_ingate) ## fg passage_edge_forgetgate = tf.sigmoid( tf.matmul(prev_input, w_in_forgetgate) + tf.matmul(prev_hidden, u_in_forgetgate) + b_forgetgate) ## og passage_edge_outgate = tf.sigmoid( tf.matmul(prev_input, w_in_outgate) + tf.matmul(prev_hidden, u_in_outgate) + b_outgate) ## input passage_edge_input = tf.tanh( tf.matmul(prev_input, w_in_cell) + tf.matmul(prev_hidden, u_in_cell) + b_cell) # calculating new cell and hidden passage_edge_cell = passage_edge_forgetgate * prev_cell + passage_edge_ingate * passage_edge_input passage_edge_hidden = passage_edge_outgate * tf.tanh( passage_edge_cell) # node mask passage_edge_cell = tf.multiply( passage_edge_cell, tf.expand_dims(node_mask, axis=-1)) passage_edge_hidden = tf.multiply( passage_edge_hidden, tf.expand_dims(node_mask, axis=-1)) # [batch_size, 1, dag_hidden_dim] passage_edge_cell = tf.expand_dims(passage_edge_cell, axis=1) passage_edge_hidden = tf.expand_dims(passage_edge_hidden, axis=1) # concatenating new staff passage_node_hidden = tf.concat( [passage_node_hidden, passage_edge_hidden], axis=1) passage_node_cell = tf.concat( [passage_node_cell, passage_edge_cell], axis=1) idx_var = tf.add(idx_var, 1) return passage_node_hidden, passage_node_cell, idx_var loop_condition = lambda a1, b1, idx_var: tf.less( idx_var, passage_nodes_size_max) loop_vars = [passage_node_hidden, passage_node_cell, idx_var] passage_node_hidden, passage_node_cell, idx_var = tf.while_loop( loop_condition, _recurrence, 
loop_vars, parallel_iterations=1, shape_invariants=[ tf.TensorShape([None, None, dag_hidden_dim]), tf.TensorShape([None, None, dag_hidden_dim]), idx_var.get_shape(), ]) # expose node-level inputs and the DAG hidden/cell states; downstream layers decide how to use them self.node_representations = passage_node_representation self.graph_hiddens = passage_node_hidden self.graph_cells = passage_node_cell self.batch_size = batch_size
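# --- Illustration (not part of the model code) -------------------------------------
# A minimal NumPy sketch of one step of the gated recurrence in _recurrence above: the
# masked sum of a node's already-processed in-neighbor hidden/cell states plays the role
# of the recurrent state, and LSTM-style gates produce the node's new hidden/cell.
# Parameter values and the tiny shapes are illustrative assumptions.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def dag_lstm_step(prev_input, prev_hidden, prev_cell, params):
    """prev_input / prev_hidden / prev_cell: [batch, dag_hidden_dim]."""
    (w_i, u_i, b_i), (w_f, u_f, b_f), (w_o, u_o, b_o), (w_c, u_c, b_c) = params
    ingate     = sigmoid(prev_input @ w_i + prev_hidden @ u_i + b_i)
    forgetgate = sigmoid(prev_input @ w_f + prev_hidden @ u_f + b_f)
    outgate    = sigmoid(prev_input @ w_o + prev_hidden @ u_o + b_o)
    cell_input = np.tanh(prev_input @ w_c + prev_hidden @ u_c + b_c)
    cell   = forgetgate * prev_cell + ingate * cell_input
    hidden = outgate * np.tanh(cell)
    return hidden, cell

batch, dim = 2, 4
rng = np.random.default_rng(1)
params = tuple((rng.normal(size=(dim, dim)), rng.normal(size=(dim, dim)), np.zeros(dim))
               for _ in range(4))
h, c = dag_lstm_step(rng.normal(size=(batch, dim)),
                     np.zeros((batch, dim)),   # no predecessors processed yet
                     np.zeros((batch, dim)), params)
assert h.shape == (batch, dim) and c.shape == (batch, dim)
# ------------------------------------------------------------------------------------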
def create_siameseLSTM_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None): """ """ options = self.options # ======word representation layer====== in_question_repres = [] in_passage_repres = [] input_dim = 0 if word_vocab is not None: word_vec_trainable = True cur_device = '/gpu:0' if options.fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.device(cur_device): self.embedding = tf.placeholder( tf.float32, shape=word_vocab.word_vecs.shape) self.word_embedding = tf.get_variable( "word_embedding", trainable=word_vec_trainable, initializer=self.embedding, dtype=tf.float32) # tf.constant(word_vocab.word_vecs) in_question_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] in_passage_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] in_question_repres.append(in_question_word_repres) in_passage_repres.append(in_passage_word_repres) input_shape = tf.shape(self.in_question_words) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_words) passage_len = input_shape[1] input_dim += word_vocab.word_dim in_question_repres = tf.concat( axis=2, values=in_question_repres) # [batch_size, question_len, dim] in_passage_repres = tf.concat( axis=2, values=in_passage_repres) # [batch_size, passage_len, dim] if is_training: in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate)) in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate)) passage_mask = tf.sequence_mask( self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] question_mask = tf.sequence_mask( self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] # ======Highway layer====== if options.with_highway: with tf.variable_scope("input_highway"): in_question_repres = match_utils.multi_highway_layer( in_question_repres, input_dim, options.highway_layer_num) tf.get_variable_scope().reuse_variables() in_passage_repres = match_utils.multi_highway_layer( in_passage_repres, input_dim, options.highway_layer_num) # ======BiLSTM context layer====== for i in range( options.context_layer_num): # support multiple context layer with tf.variable_scope('bilstm-layer-{}'.format(i)): # contextual lstm for both passage and question in_question_repres = tf.multiply( in_question_repres, tf.expand_dims(question_mask, axis=-1)) (question_context_representation_fw, question_context_representation_bw, in_question_repres) = layer_utils.my_lstm_layer( in_question_repres, options.context_lstm_dim, input_lengths=self.question_lengths, scope_name="context_represent", reuse=False, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) # Encode the second sentence, using the same LSTM weights. 
tf.get_variable_scope().reuse_variables() in_passage_repres = tf.multiply( in_passage_repres, tf.expand_dims(passage_mask, axis=-1)) (passage_context_representation_fw, passage_context_representation_bw, in_passage_repres) = layer_utils.my_lstm_layer( in_passage_repres, options.context_lstm_dim, input_lengths=self.passage_lengths, scope_name="context_represent", reuse=True, is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn) if options.lstm_out_type == 'mean': question_context_representation_fw = layer_utils.collect_mean_step_of_lstm( question_context_representation_fw) question_context_representation_bw = layer_utils.collect_mean_step_of_lstm( question_context_representation_bw) passage_context_representation_fw = layer_utils.collect_mean_step_of_lstm( passage_context_representation_fw) passage_context_representation_bw = layer_utils.collect_mean_step_of_lstm( passage_context_representation_bw) elif options.lstm_out_type == 'end': question_context_representation_fw = layer_utils.collect_final_step_of_lstm( question_context_representation_fw, self.question_lengths - 1) question_context_representation_bw = question_context_representation_bw[:, 0, :] passage_context_representation_fw = layer_utils.collect_final_step_of_lstm( passage_context_representation_fw, self.passage_lengths - 1) passage_context_representation_bw = passage_context_representation_bw[:, 0, :] question_context_outputs = tf.concat( axis=1, values=[ question_context_representation_fw, question_context_representation_bw ]) passage_context_outputs = tf.concat( axis=1, values=[ passage_context_representation_fw, passage_context_representation_bw ]) (match_representation, match_dim) = match_utils.siameseLSTM_match_func( question_context_outputs, passage_context_outputs, options.context_lstm_dim) #========Prediction Layer========= w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)], dtype=tf.float32) b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32) w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes], dtype=tf.float32) b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32) # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate)) logits = tf.matmul(match_representation, w_0) + b_0 logits = tf.nn.relu(logits) if is_training: logits = tf.nn.dropout(logits, (1 - options.dropout_rate)) logits = tf.matmul(logits, w_1) + b_1 self.prob = tf.nn.softmax(logits) self.predictions = tf.argmax(self.prob, 1) gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) self.loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix)) correct = tf.nn.in_top_k(logits, self.truth, 1) self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) if not is_training: return tvars = tf.trainable_variables() if self.options.lambda_l1 > 0.0: l1_loss = tf.add_n([ tf.contrib.layers.l1_regularizer(self.options.lambda_l1)(v) for v in tvars if v.get_shape().ndims > 1 ]) self.loss = self.loss + l1_loss if self.options.lambda_l2 > 0.0: # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) l2_loss = tf.add_n([ tf.contrib.layers.l2_regularizer(self.options.lambda_l2)(v) for v in tvars if v.get_shape().ndims > 1 ]) self.loss = self.loss + l2_loss if self.options.optimize_type == 'adadelta': optimizer = tf.train.AdadeltaOptimizer( learning_rate=self.options.learning_rate) elif self.options.optimize_type == 'adam': optimizer = tf.train.AdamOptimizer( 
learning_rate=self.options.learning_rate) grads = layer_utils.compute_gradients(self.loss, tvars) grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper) self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) # self.train_op = optimizer.apply_gradients(zip(grads, tvars)) if self.options.with_moving_average: # Track the moving averages of all trainable variables. MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average. variable_averages = tf.train.ExponentialMovingAverage( MOVING_AVERAGE_DECAY, global_step) variables_averages_op = variable_averages.apply( tf.trainable_variables()) train_ops = [self.train_op, variables_averages_op] self.train_op = tf.group(*train_ops)
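# --- Illustration (not part of the model code) -------------------------------------
# A minimal NumPy sketch of the global-norm gradient clipping used above
# (tf.clip_by_global_norm): if the combined L2 norm of all gradients exceeds
# grad_clipper, every gradient is rescaled by the same factor so the global norm
# equals the threshold. The toy gradients below are illustrative, not model values.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)   # <= 1.0; identity if already small
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 4.0]), np.array([[0.0, 12.0]])]   # global norm = 13
clipped, norm = clip_by_global_norm(grads, clip_norm=5.0)
assert np.isclose(norm, 13.0)
assert np.isclose(np.sqrt(sum(np.sum(g ** 2) for g in clipped)), 5.0)
# ------------------------------------------------------------------------------------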
def create_mpcnn_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None): """ """ options = self.options # ======word representation layer====== in_question_repres = [] in_passage_repres = [] input_dim = 0 if word_vocab is not None: word_vec_trainable = True cur_device = '/gpu:0' if options.fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.device(cur_device): self.embedding = tf.placeholder( tf.float32, shape=word_vocab.word_vecs.shape) self.word_embedding = tf.get_variable( "word_embedding", trainable=word_vec_trainable, initializer=self.embedding, dtype=tf.float32) # tf.constant(word_vocab.word_vecs) in_question_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] in_passage_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] in_question_repres.append(in_question_word_repres) in_passage_repres.append(in_passage_word_repres) input_shape = tf.shape(self.in_question_words) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_words) passage_len = input_shape[1] input_dim += word_vocab.word_dim in_question_repres = tf.concat( axis=2, values=in_question_repres) # [batch_size, question_len, dim] in_passage_repres = tf.concat( axis=2, values=in_passage_repres) # [batch_size, passage_len, dim] if is_training: in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate)) in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate)) mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] question_mask = tf.sequence_mask( self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] # ======Highway layer====== if options.with_highway: with tf.variable_scope("input_highway"): in_question_repres = match_utils.multi_highway_layer( in_question_repres, input_dim, options.highway_layer_num) tf.get_variable_scope().reuse_variables() in_passage_repres = match_utils.multi_highway_layer( in_passage_repres, input_dim, options.highway_layer_num) in_question_repres = tf.expand_dims( in_question_repres, -1) # [batch_size, question_len, word_dim, 1] in_passage_repres = tf.expand_dims( in_passage_repres, -1) # [batch_size, passage_len, word_dim, 1] # ======Multi-perspective CNN Matching====== filter_sizes = options.filter_sizes num_filters = options.num_filters poolings = list([tf.reduce_max, tf.reduce_min, tf.reduce_mean])[:options.num_poolings] W1 = [ tf.get_variable( "W1_%s" % i, initializer=tf.truncated_normal( [filter_sizes[i], input_dim, 1, num_filters[0]], stddev=0.1), dtype=tf.float32) for i in range(len(filter_sizes)) ] b1 = [ tf.get_variable("b1_%s" % i, initializer=tf.constant(0.01, shape=[num_filters[0]]), dtype=tf.float32) for i in range(len(filter_sizes)) ] W2 = [ tf.get_variable( "W2_%s" % i, initializer=tf.truncated_normal( [filter_sizes[i], input_dim, 1, num_filters[1]], stddev=0.1), dtype=tf.float32) for i in range(len(filter_sizes) - 1) ] b2 = [ tf.get_variable( "b2_%s" % i, initializer=tf.constant(0.01, shape=[num_filters[1], input_dim]), dtype=tf.float32) for i in range(len(filter_sizes) - 1) ] sent1_blockA = layer_utils.build_block_A( in_question_repres, filter_sizes, poolings, W1, b1, is_training ) # len(poolings) * len(filter_sizes) * [batch_size, 1, num_filters_A] sent2_blockA = layer_utils.build_block_A( in_passage_repres, filter_sizes, 
poolings, W1, b1, is_training ) # len(poolings) * len(filter_sizes) * [batch_size, 1, num_filters_A] sent1_blockB = layer_utils.build_block_B( in_question_repres, filter_sizes, poolings, W2, b2, is_training ) # (len(poolings))-1 * (len(filter_sizes)-1) * [batch_size, embed_size, num_filters_B] sent2_blockB = layer_utils.build_block_B( in_passage_repres, filter_sizes, poolings, W2, b2, is_training ) # (len(poolings))-1 * (len(filter_sizes)-1) * [batch_size, embed_size, num_filters_B] (match_representation, match_dim) = match_utils.mpcnn_match_func( sent1_blockA, sent2_blockA, sent1_blockB, sent2_blockB, poolings, filter_sizes, num_filters) #========Prediction Layer========= w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)], dtype=tf.float32) b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32) w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes], dtype=tf.float32) b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32) # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate)) logits = tf.matmul(match_representation, w_0) + b_0 logits = tf.nn.relu(logits) if is_training: logits = tf.nn.dropout(logits, (1 - options.dropout_rate)) logits = tf.matmul(logits, w_1) + b_1 self.prob = tf.nn.softmax(logits) self.predictions = tf.argmax(self.prob, 1) gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) self.loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix)) correct = tf.nn.in_top_k(logits, self.truth, 1) self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) if not is_training: return if options.with_f1_metric: # acc, acc_op = tf.metrics.accuracy(labels=self.truth, predictions=self.predictions) precision, pre_op = tf.metrics.precision( labels=self.truth, predictions=self.predictions) recall, rec_op = tf.metrics.recall(labels=self.truth, predictions=self.predictions) f1 = 2 * precision * recall / (precision + recall + 1e-6) self.loss = self.loss - 0.1 * tf.reduce_mean(f1) tvars = tf.trainable_variables() if self.options.lambda_l1 > 0.0: l1_loss = tf.add_n([ tf.contrib.layers.l1_regularizer(self.options.lambda_l1)(v) for v in tvars if v.get_shape().ndims > 1 ]) self.loss = self.loss + l1_loss if self.options.lambda_l2 > 0.0: # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) l2_loss = tf.add_n([ tf.contrib.layers.l2_regularizer(self.options.lambda_l2)(v) for v in tvars if v.get_shape().ndims > 1 ]) self.loss = self.loss + l2_loss if self.options.optimize_type == 'adadelta': optimizer = tf.train.AdadeltaOptimizer( learning_rate=self.options.learning_rate) elif self.options.optimize_type == 'adam': optimizer = tf.train.AdamOptimizer( learning_rate=self.options.learning_rate) grads = layer_utils.compute_gradients(self.loss, tvars) grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper) self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) # self.train_op = optimizer.apply_gradients(zip(grads, tvars)) if self.options.with_moving_average: # Track the moving averages of all trainable variables. MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average. variable_averages = tf.train.ExponentialMovingAverage( MOVING_AVERAGE_DECAY, global_step) variables_averages_op = variable_averages.apply( tf.trainable_variables()) train_ops = [self.train_op, variables_averages_op] self.train_op = tf.group(*train_ops)
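# --- Illustration (not part of the model code) -------------------------------------
# A minimal NumPy sketch of the "block A" idea behind layer_utils.build_block_A above:
# for each filter width, slide a convolution over the token dimension, then summarize
# the feature map with several pooling functions (max / min / mean). The helper's exact
# implementation is not shown in this file, so the ReLU and shapes here are assumptions.
import numpy as np

def conv1d_valid(x, w, b):
    """x: [sent_len, word_dim]; w: [filter_size, word_dim, num_filters] -> [out_len, num_filters]."""
    filter_size = w.shape[0]
    out_len = x.shape[0] - filter_size + 1
    w_flat = w.reshape(filter_size * w.shape[1], -1)
    windows = np.stack([x[i:i + filter_size].reshape(-1) for i in range(out_len)])
    return windows @ w_flat + b

def block_a(x, filter_sizes, weights, biases, poolings=(np.max, np.min, np.mean)):
    """Returns one [num_filters] vector per (pooling, filter_size) pair."""
    feats = []
    for pool in poolings:
        for w, b in zip(weights, biases):
            fmap = np.maximum(conv1d_valid(x, w, b), 0.0)   # ReLU feature map (assumed)
            feats.append(pool(fmap, axis=0))                # pool over positions
    return feats

rng = np.random.default_rng(2)
sent = rng.normal(size=(7, 5))                              # 7 tokens, word_dim = 5
sizes = [1, 2, 3]
ws = [rng.normal(size=(k, 5, 4)) for k in sizes]            # num_filters = 4
bs = [np.zeros(4) for _ in sizes]
out = block_a(sent, sizes, ws, bs)
assert len(out) == 3 * 3 and out[0].shape == (4,)
# ------------------------------------------------------------------------------------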
def gcn_encode(self, batch_nodes, embedded_node_rep, fw_adj_info, bw_adj_info, input_node_dim, output_node_dim, fw_aggregators, bw_aggregators, window_size, layer_size, scope, agg_type, sample_size_per_layer, keep_inter_state=False): with tf.variable_scope(scope): single_graph_nodes_size = tf.shape(batch_nodes)[1] # ============ encode graph structure ========== fw_sampler = UniformNeighborSampler(fw_adj_info) bw_sampler = UniformNeighborSampler(bw_adj_info) nodes = tf.reshape(batch_nodes, [ -1, ]) # the fw_hidden and bw_hidden is the initial node embedding # [node_size, dim_size] fw_hidden = tf.nn.embedding_lookup(embedded_node_rep, nodes) bw_hidden = tf.nn.embedding_lookup(embedded_node_rep, nodes) # [node_size, adj_size] fw_sampled_neighbors = fw_sampler((nodes, sample_size_per_layer)) bw_sampled_neighbors = bw_sampler((nodes, sample_size_per_layer)) inter_fw_hiddens = [] inter_bw_hiddens = [] inter_dims = [] if scope == "first_gcn": self.watch["node_1_rep_in_first_gcn"] = [] fw_hidden_dim = input_node_dim # layer is the index of convolution and hop is used to combine information for layer in range(layer_size): self.watch["node_1_rep_in_first_gcn"].append(fw_hidden) if len(fw_aggregators) <= layer: fw_aggregators.append([]) if len(bw_aggregators) <= layer: bw_aggregators.append([]) for hop in range(window_size): if hop > 6: fw_aggregator = fw_aggregators[layer][6] elif len(fw_aggregators[layer]) > hop: fw_aggregator = fw_aggregators[layer][hop] else: if agg_type == "GCN": fw_aggregator = GCNAggregator(fw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, mode=self.mode) elif agg_type == "mean_pooling": fw_aggregator = MeanAggregator( fw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, if_use_high_way=self.with_gcn_highway, mode=self.mode) elif agg_type == "max_pooling": fw_aggregator = MaxPoolingAggregator( fw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, mode=self.mode) elif agg_type == "lstm": fw_aggregator = SeqAggregator(fw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, mode=self.mode) elif agg_type == "att": fw_aggregator = AttentionAggregator( fw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, mode=self.mode) fw_aggregators[layer].append(fw_aggregator) # [node_size, adj_size, word_embedding_dim] if layer == 0 and hop == 0: neigh_vec_hidden = tf.nn.embedding_lookup( embedded_node_rep, fw_sampled_neighbors) else: neigh_vec_hidden = tf.nn.embedding_lookup( tf.concat( [fw_hidden, tf.zeros([1, fw_hidden_dim])], 0), fw_sampled_neighbors) # if self.with_gcn_highway: # # we try to forget something when introducing the neighbor information # with tf.variable_scope("fw_hidden_highway"): # fw_hidden = multi_highway_layer(fw_hidden, fw_hidden_dim, options['highway_layer_num']) bw_hidden_dim = fw_hidden_dim fw_hidden, fw_hidden_dim = fw_aggregator( (fw_hidden, neigh_vec_hidden)) if keep_inter_state: inter_fw_hiddens.append(fw_hidden) inter_dims.append(fw_hidden_dim) if self.graph_encode_direction == "bi": if hop > 6: bw_aggregator = bw_aggregators[layer][6] elif len(bw_aggregators[layer]) > hop: bw_aggregator = bw_aggregators[layer][hop] else: if agg_type == "GCN": bw_aggregator = GCNAggregator( bw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, mode=self.mode) elif agg_type == "mean_pooling": bw_aggregator = MeanAggregator( bw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, if_use_high_way=self.with_gcn_highway, mode=self.mode) elif 
agg_type == "max_pooling": bw_aggregator = MaxPoolingAggregator( bw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, mode=self.mode) elif agg_type == "lstm": bw_aggregator = SeqAggregator( bw_hidden_dim, output_node_dim, concat=self.concat, dropout=self.dropout, mode=self.mode) elif agg_type == "att": bw_aggregator = AttentionAggregator( bw_hidden_dim, output_node_dim, concat=self.concat, mode=self.mode, dropout=self.dropout) bw_aggregators[layer].append(bw_aggregator) if layer == 0 and hop == 0: neigh_vec_hidden = tf.nn.embedding_lookup( embedded_node_rep, bw_sampled_neighbors) else: neigh_vec_hidden = tf.nn.embedding_lookup( tf.concat( [bw_hidden, tf.zeros([1, fw_hidden_dim])], 0), bw_sampled_neighbors) if self.with_gcn_highway: with tf.variable_scope("bw_hidden_highway"): bw_hidden = multi_highway_layer( bw_hidden, fw_hidden_dim, options['highway_layer_num']) bw_hidden, bw_hidden_dim = bw_aggregator( (bw_hidden, neigh_vec_hidden)) if keep_inter_state: inter_bw_hiddens.append(bw_hidden) node_dim = fw_hidden_dim # hidden stores the representation for all nodes fw_hidden = tf.reshape(fw_hidden, [-1, single_graph_nodes_size, node_dim]) if self.graph_encode_direction == "bi": bw_hidden = tf.reshape(bw_hidden, [-1, single_graph_nodes_size, node_dim]) hidden = tf.concat([fw_hidden, bw_hidden], axis=2) graph_dim = 2 * node_dim else: hidden = fw_hidden graph_dim = node_dim hidden = tf.nn.relu(hidden) max_pooled = tf.reduce_max(hidden, 1) mean_pooled = tf.reduce_mean(hidden, 1) res = [hidden] max_graph_embedding = tf.reshape(max_pooled, [-1, graph_dim]) mean_graph_embedding = tf.reshape(mean_pooled, [-1, graph_dim]) res.append(max_graph_embedding) res.append(mean_graph_embedding) res.append(graph_dim) if keep_inter_state: inter_node_reps = [] inter_graph_reps = [] inter_graph_dims = [] # process the inter hidden states for _ in range(len(inter_fw_hiddens)): inter_fw_hidden = inter_fw_hiddens[_] inter_bw_hidden = inter_bw_hiddens[_] inter_dim = inter_dims[_] inter_fw_hidden = tf.reshape( inter_fw_hidden, [-1, single_graph_nodes_size, inter_dim]) if self.graph_encode_direction == "bi": inter_bw_hidden = tf.reshape( inter_bw_hidden, [-1, single_graph_nodes_size, inter_dim]) inter_hidden = tf.concat( [inter_fw_hidden, inter_bw_hidden], axis=2) inter_graph_dim = inter_dim * 2 else: inter_hidden = inter_fw_hidden inter_graph_dim = inter_dim inter_node_rep = tf.nn.relu(inter_hidden) inter_node_reps.append(inter_node_rep) inter_graph_dims.append(inter_graph_dim) max_pooled_tmp = tf.reduce_max(inter_node_rep, 1) mean_pooled_tmp = tf.reduce_max(inter_node_rep, 1) max_graph_embedding = tf.reshape(max_pooled_tmp, [-1, inter_graph_dim]) mean_graph_embedding = tf.reshape(mean_pooled_tmp, [-1, inter_graph_dim]) inter_graph_reps.append( (max_graph_embedding, mean_graph_embedding)) res.append(inter_node_reps) res.append(inter_graph_reps) res.append(inter_graph_dims) return res
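# --- Illustration (not part of the model code) -------------------------------------
# A minimal NumPy sketch of the graph read-out at the end of gcn_encode above: node
# hidden states are reshaped to [batch, nodes, dim], passed through ReLU, and both
# max- and mean-pooled over the node axis to give fixed-size graph embeddings. The toy
# tensors below are illustrative assumptions.
import numpy as np

def graph_readout(node_hidden_flat, batch_size, nodes_per_graph):
    """node_hidden_flat: [batch*nodes, dim] -> (hidden, max_pooled, mean_pooled)."""
    dim = node_hidden_flat.shape[1]
    hidden = np.maximum(node_hidden_flat.reshape(batch_size, nodes_per_graph, dim), 0.0)
    return hidden, hidden.max(axis=1), hidden.mean(axis=1)

rng = np.random.default_rng(3)
flat = rng.normal(size=(2 * 5, 6))                 # 2 graphs, 5 nodes each, dim 6
hidden, max_emb, mean_emb = graph_readout(flat, 2, 5)
assert hidden.shape == (2, 5, 6) and max_emb.shape == (2, 6) and mean_emb.shape == (2, 6)
# ------------------------------------------------------------------------------------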
def _build_graph(self): node_1_mask = self.batch_mask_first node_2_mask = self.batch_mask_second node_1_looking_table = self.looking_table_first node_2_looking_table = self.looking_table_second node_2_aware_representations = [] node_2_aware_dim = 0 node_1_aware_representations = [] node_1_aware_dim = 0 pad_word_embedding = tf.zeros([1, self.word_embedding_dim ]) # this is for the PAD symbol self.word_embeddings = tf.concat([ pad_word_embedding, tf.get_variable( 'pretrained_embedding', shape=[self.pretrained_word_size, self.word_embedding_dim], initializer=tf.constant_initializer( self.pretrained_word_embeddings), trainable=True), tf.get_variable( 'W_train', shape=[self.learned_word_size, self.word_embedding_dim], initializer=tf.contrib.layers.xavier_initializer(), trainable=True) ], 0) self.watch['word_embeddings'] = self.word_embeddings # ============ encode node feature by looking up word embedding ============= with tf.variable_scope('node_rep_gen'): # [node_size, hidden_layer_dim] feature_embedded_chars_first = tf.nn.embedding_lookup( self.word_embeddings, self.feature_info_first) graph_1_size = tf.shape(feature_embedded_chars_first)[0] feature_embedded_chars_second = tf.nn.embedding_lookup( self.word_embeddings, self.feature_info_second) graph_2_size = tf.shape(feature_embedded_chars_second)[0] if self.node_vec_method == "lstm": cell = self.build_encoder_cell(1, self.hidden_layer_dim) outputs, hidden_states = tf.nn.dynamic_rnn( cell=cell, inputs=feature_embedded_chars_first, sequence_length=self.feature_len_first, dtype=tf.float32) node_1_rep = layer_utils.collect_final_step_of_lstm( outputs, self.feature_len_first - 1) outputs, hidden_states = tf.nn.dynamic_rnn( cell=cell, inputs=feature_embedded_chars_second, sequence_length=self.feature_len_second, dtype=tf.float32) node_2_rep = layer_utils.collect_final_step_of_lstm( outputs, self.feature_len_second - 1) elif self.node_vec_method == "word_emb": node_1_rep = tf.reshape(feature_embedded_chars_first, [graph_1_size, -1]) node_2_rep = tf.reshape(feature_embedded_chars_second, [graph_2_size, -1]) self.watch["node_1_rep_initial"] = node_1_rep # ============ encode node feature by GCN ============= with tf.variable_scope('first_gcn') as first_gcn_scope: # shape of node embedding: [batch_size, single_graph_nodes_size, node_embedding_dim] # shape of node size: [batch_size] gcn_1_res = self.gcn_encode( self.batch_nodes_first, node_1_rep, self.fw_adj_info_first, self.bw_adj_info_first, input_node_dim=self.word_embedding_dim, output_node_dim=self.aggregator_dim_first, fw_aggregators=self.fw_aggregators_first, bw_aggregators=self.bw_aggregators_first, window_size=self.gcn_window_size_first, layer_size=self.gcn_layer_size_first, scope="first_gcn", agg_type=self.agg_type_first, sample_size_per_layer=self.sample_size_per_layer_first, keep_inter_state=self.if_use_multiple_gcn_1_state) node_1_rep = gcn_1_res[0] node_1_rep_dim = gcn_1_res[3] gcn_2_res = self.gcn_encode( self.batch_nodes_second, node_2_rep, self.fw_adj_info_second, self.bw_adj_info_second, input_node_dim=self.word_embedding_dim, output_node_dim=self.aggregator_dim_first, fw_aggregators=self.fw_aggregators_first, bw_aggregators=self.bw_aggregators_first, window_size=self.gcn_window_size_first, layer_size=self.gcn_layer_size_first, scope="first_gcn", agg_type=self.agg_type_first, sample_size_per_layer=self.sample_size_per_layer_second, keep_inter_state=self.if_use_multiple_gcn_1_state) node_2_rep = gcn_2_res[0] node_2_rep_dim = gcn_2_res[3] self.watch["node_1_rep_first_GCN"] = 
node_1_rep self.watch["node_1_mask"] = node_1_mask # mask node_1_rep = tf.multiply(node_1_rep, tf.expand_dims(node_1_mask, 2)) node_2_rep = tf.multiply(node_2_rep, tf.expand_dims(node_2_mask, 2)) self.watch["node_1_rep_first_GCN_masked"] = node_1_rep if self.pred_method == "node_level": entity_1_rep = tf.reshape( tf.nn.embedding_lookup(tf.transpose(node_1_rep, [1, 0, 2]), tf.constant(0)), [-1, node_1_rep_dim]) entity_2_rep = tf.reshape( tf.nn.embedding_lookup(tf.transpose(node_2_rep, [1, 0, 2]), tf.constant(0)), [-1, node_2_rep_dim]) entity_1_2_diff = entity_1_rep - entity_2_rep entity_1_2_sim = entity_1_rep * entity_2_rep aggregation = tf.concat( [entity_1_rep, entity_2_rep, entity_1_2_diff, entity_1_2_sim], axis=1) aggregation_dim = 4 * node_1_rep_dim w_0 = tf.get_variable("w_0", [aggregation_dim, aggregation_dim / 2], dtype=tf.float32) b_0 = tf.get_variable("b_0", [aggregation_dim / 2], dtype=tf.float32) w_1 = tf.get_variable("w_1", [aggregation_dim / 2, 2], dtype=tf.float32) b_1 = tf.get_variable("b_1", [2], dtype=tf.float32) # ====== Prediction Layer =============== logits = tf.matmul(aggregation, w_0) + b_0 logits = tf.tanh(logits) logits = tf.matmul(logits, w_1) + b_1 elif self.pred_method == "graph_level": # if the prediction method is graph_level, we perform the graph matching based prediction assert node_1_rep_dim == node_2_rep_dim input_dim = node_1_rep_dim with tf.variable_scope('node_level_matching') as matching_scope: # ========= node level matching =============== (match_reps, match_dim) = match_graph_1_with_graph_2(node_1_rep, node_2_rep, node_1_mask, node_2_mask, input_dim, options=options, watch=self.watch) matching_scope.reuse_variables() node_2_aware_representations.append(match_reps) node_2_aware_dim += match_dim (match_reps, match_dim) = match_graph_1_with_graph_2(node_2_rep, node_1_rep, node_2_mask, node_1_mask, input_dim, options=options, watch=self.watch) node_1_aware_representations.append(match_reps) node_1_aware_dim += match_dim # TODO: add one more MP matching over the graph representation # with tf.variable_scope('context_MP_matching'): # for i in range(options['context_layer_num']): # with tf.variable_scope('layer-{}',format(i)): # [batch_size, single_graph_nodes_size, node_2_aware_dim] node_2_aware_representations = tf.concat( axis=2, values=node_2_aware_representations) # [batch_size, single_graph_nodes_size, node_1_aware_dim] node_1_aware_representations = tf.concat( axis=2, values=node_1_aware_representations) # if self.mode == "train": # node_2_aware_representations = tf.nn.dropout(node_2_aware_representations, (1 - options['dropout_rate'])) # node_1_aware_representations = tf.nn.dropout(node_1_aware_representations, (1 - options['dropout_rate'])) # ========= Highway layer ============== if self.with_match_highway: with tf.variable_scope("left_matching_highway"): node_2_aware_representations = multi_highway_layer( node_2_aware_representations, node_2_aware_dim, options['highway_layer_num']) with tf.variable_scope("right_matching_highway"): node_1_aware_representations = multi_highway_layer( node_1_aware_representations, node_1_aware_dim, options['highway_layer_num']) self.watch["node_1_rep_match"] = node_2_aware_representations # ========= Aggregation Layer ============== aggregation_representation = [] aggregation_dim = 0 node_2_aware_aggregation_input = node_2_aware_representations node_1_aware_aggregation_input = node_1_aware_representations self.watch[ "node_1_rep_match_layer"] = node_2_aware_aggregation_input with 
tf.variable_scope('aggregation_layer'): # TODO: now we only have 1 aggregation layer; need to change this part if support more aggregation layers # [batch_size, single_graph_nodes_size, node_2_aware_dim] node_2_aware_aggregation_input = tf.multiply( node_2_aware_aggregation_input, tf.expand_dims(node_1_mask, axis=-1)) # [batch_size, single_graph_nodes_size, node_1_aware_dim] node_1_aware_aggregation_input = tf.multiply( node_1_aware_aggregation_input, tf.expand_dims(node_2_mask, axis=-1)) if self.agg_sim_method == "GCN": # [batch_size*single_graph_nodes_size, node_2_aware_dim] node_2_aware_aggregation_input = tf.reshape( node_2_aware_aggregation_input, shape=[-1, node_2_aware_dim]) # [batch_size*single_graph_nodes_size, node_1_aware_dim] node_1_aware_aggregation_input = tf.reshape( node_1_aware_aggregation_input, shape=[-1, node_1_aware_dim]) # [node_1_size, node_2_aware_dim] node_1_rep = tf.concat([ tf.nn.embedding_lookup(node_2_aware_aggregation_input, node_1_looking_table), tf.zeros([1, node_2_aware_dim]) ], 0) # [node_2_size, node_1_aware_dim] node_2_rep = tf.concat([ tf.nn.embedding_lookup(node_1_aware_aggregation_input, node_2_looking_table), tf.zeros([1, node_1_aware_dim]) ], 0) gcn_1_res = self.gcn_encode( self.batch_nodes_first, node_1_rep, self.fw_adj_info_first, self.bw_adj_info_first, input_node_dim=node_2_aware_dim, output_node_dim=self.aggregator_dim_second, fw_aggregators=self.fw_aggregators_second, bw_aggregators=self.bw_aggregators_second, window_size=self.gcn_window_size_second, layer_size=self.gcn_layer_size_second, scope="second_gcn", agg_type=self.agg_type_second, sample_size_per_layer=self.sample_size_per_layer_first, keep_inter_state=self.if_use_multiple_gcn_2_state) max_graph_1_rep = gcn_1_res[1] mean_graph_1_rep = gcn_1_res[2] graph_1_rep_dim = gcn_1_res[3] gcn_2_res = self.gcn_encode( self.batch_nodes_second, node_2_rep, self.fw_adj_info_second, self.bw_adj_info_second, input_node_dim=node_1_aware_dim, output_node_dim=self.aggregator_dim_second, fw_aggregators=self.fw_aggregators_second, bw_aggregators=self.bw_aggregators_second, window_size=self.gcn_window_size_second, layer_size=self.gcn_layer_size_second, scope="second_gcn", agg_type=self.agg_type_second, sample_size_per_layer=self. 
sample_size_per_layer_second, keep_inter_state=self.if_use_multiple_gcn_2_state) max_graph_2_rep = gcn_2_res[1] mean_graph_2_rep = gcn_2_res[2] graph_2_rep_dim = gcn_2_res[3] assert graph_1_rep_dim == graph_2_rep_dim if self.if_use_multiple_gcn_2_state: graph_1_reps = gcn_1_res[5] graph_2_reps = gcn_2_res[5] inter_dims = gcn_1_res[6] for idx in range(len(graph_1_reps)): (max_graph_1_rep_tmp, mean_graph_1_rep_tmp) = graph_1_reps[idx] (max_graph_2_rep_tmp, mean_graph_2_rep_tmp) = graph_2_reps[idx] inter_dim = inter_dims[idx] aggregation_representation.append( max_graph_1_rep_tmp) aggregation_representation.append( mean_graph_1_rep_tmp) aggregation_representation.append( max_graph_2_rep_tmp) aggregation_representation.append( mean_graph_2_rep_tmp) aggregation_dim += 4 * inter_dim else: aggregation_representation.append(max_graph_1_rep) aggregation_representation.append(mean_graph_1_rep) aggregation_representation.append(max_graph_2_rep) aggregation_representation.append(mean_graph_2_rep) aggregation_dim = 4 * graph_1_rep_dim # aggregation_representation = tf.concat(aggregation_representation, axis=1) gcn_2_window_size = int( len(aggregation_representation) / 4) aggregation_dim = aggregation_dim / gcn_2_window_size w_0 = tf.get_variable( "w_0", [aggregation_dim, aggregation_dim / 2], dtype=tf.float32) b_0 = tf.get_variable("b_0", [aggregation_dim / 2], dtype=tf.float32) w_1 = tf.get_variable("w_1", [aggregation_dim / 2, 2], dtype=tf.float32) b_1 = tf.get_variable("b_1", [2], dtype=tf.float32) weights = tf.get_variable("gcn_2_window_weights", [gcn_2_window_size], dtype=tf.float32) # shape: [gcn_2_window_size, batch_size, 2] logits = [] for layer_idx in range(gcn_2_window_size): max_graph_1_rep = aggregation_representation[ layer_idx * 4 + 0] mean_graph_1_rep = aggregation_representation[ layer_idx * 4 + 1] max_graph_2_rep = aggregation_representation[ layer_idx * 4 + 2] mean_graph_2_rep = aggregation_representation[ layer_idx * 4 + 3] aggregation_representation_single = tf.concat([ max_graph_1_rep, mean_graph_1_rep, max_graph_2_rep, mean_graph_2_rep ], axis=1) # ====== Prediction Layer =============== logit = tf.matmul(aggregation_representation_single, w_0) + b_0 logit = tf.tanh(logit) logit = tf.matmul(logit, w_1) + b_1 logits.append(logit) if len(logits) != 1: logits = tf.reshape(tf.concat(logits, axis=0), [gcn_2_window_size, -1, 2]) logits = tf.transpose(logits, [1, 0, 2]) logits = tf.multiply(logits, tf.expand_dims(weights, axis=-1)) logits = tf.reduce_sum(logits, axis=1) else: logits = tf.reshape(logits, [-1, 2]) # ====== Highway layer ============ # if options['with_aggregation_highway']: with tf.name_scope("loss"): self.y_pred = tf.nn.softmax(logits) self.loss = tf.reduce_sum( tf.nn.softmax_cross_entropy_with_logits( labels=self.y_true, logits=logits, name="xentropy_loss")) / tf.cast( self.batch_size, tf.float32) # ============ Training Objective =========================== if self.mode == "train" and not self.if_pred_on_dev: optimizer = tf.train.AdamOptimizer() params = tf.trainable_variables() gradients = tf.gradients(self.loss, params) clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1) self.training_op = optimizer.apply_gradients( zip(clipped_gradients, params))
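# --- Illustration (not part of the model code) -------------------------------------
# A minimal NumPy sketch of how the per-window logits above are combined when
# gcn_2_window_size > 1: stack the [batch, 2] logits from each window, scale each
# window by its learned scalar weight, and sum over the window axis. The weights and
# logits below are illustrative assumptions.
import numpy as np

def combine_window_logits(window_logits, window_weights):
    """window_logits: [window, batch, 2]; window_weights: [window] -> [batch, 2]."""
    logits = np.transpose(window_logits, (1, 0, 2))            # [batch, window, 2]
    logits = logits * window_weights[None, :, None]            # scale each window
    return logits.sum(axis=1)

rng = np.random.default_rng(4)
per_window = rng.normal(size=(3, 2, 2))                        # 3 windows, batch 2
weights = np.array([0.5, 0.3, 0.2])
combined = combine_window_logits(per_window, weights)
assert combined.shape == (2, 2)
# ------------------------------------------------------------------------------------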
def encode(self, is_training=True): options = self.options # ======word representation layer====== in_passage_repres = [] input_dim = 0 if options.with_word and self.word_vocab is not None: word_vec_trainable = True cur_device = '/gpu:0' if options.fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.variable_scope("embedding"), tf.device(cur_device): self.word_embedding = tf.get_variable( "word_embedding", trainable=word_vec_trainable, initializer=tf.constant(self.word_vocab.word_vecs), dtype=tf.float32) in_passage_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_passage_words) ## Position encoding # print('in_passage_word_repres: ', tf.shape(in_passage_word_repres)) in_passage_word_repres += positional_encoding( in_passage_word_repres, options.max_answer_len) # print('in_passage_word_repres: ', tf.shape(in_passage_word_repres)[2]) ## Position encoding # [batch_size, passage_len, word_dim] in_passage_repres.append(in_passage_word_repres) # print('in_passage_repres: ', tf.shape(in_passage_repres)) input_shape = tf.shape(self.in_passage_words) batch_size = input_shape[0] passage_len = input_shape[1] input_dim += self.word_vocab.word_dim if options.with_char and self.char_vocab is not None: input_shape = tf.shape(self.in_passage_chars) batch_size = input_shape[0] passage_len = input_shape[1] p_char_len = input_shape[2] char_dim = self.char_vocab.word_dim self.char_embedding = tf.get_variable( "char_embedding", initializer=tf.constant(self.char_vocab.word_vecs), dtype=tf.float32) in_passage_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_passage_chars ) # [batch_size, passage_len, p_char_len, char_dim] in_passage_char_repres = tf.reshape( in_passage_char_repres, shape=[-1, p_char_len, char_dim]) passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1]) with tf.variable_scope('char_lstm'): # lstm cell char_lstm_cell = tf.contrib.rnn.BasicLSTMCell( options.char_lstm_dim) # dropout if is_training: char_lstm_cell = tf.contrib.rnn.DropoutWrapper( char_lstm_cell, output_keep_prob=(1 - options.dropout_rate)) char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell]) # passage representation passage_char_outputs = tf.nn.dynamic_rnn( char_lstm_cell, in_passage_char_repres, sequence_length=passage_char_lengths, dtype=tf.float32)[0] # [batch_size*question_len, q_char_len, char_lstm_dim] passage_char_outputs = collect_final_step_lstm( passage_char_outputs, passage_char_lengths - 1) passage_char_outputs = tf.reshape( passage_char_outputs, [batch_size, passage_len, options.char_lstm_dim]) in_passage_repres.append(passage_char_outputs) input_dim += options.char_lstm_dim in_passage_repres = tf.concat(in_passage_repres, 2) # [batch_size, passage_len, dim] if options.compress_input: # compress input word vector into smaller vectors w_compress = tf.get_variable( "w_compress_input", [input_dim, options.compress_input_dim], dtype=tf.float32) b_compress = tf.get_variable("b_compress_input", [options.compress_input_dim], dtype=tf.float32) in_passage_repres = tf.reshape(in_passage_repres, [-1, input_dim]) in_passage_repres = tf.matmul(in_passage_repres, w_compress) + b_compress in_passage_repres = tf.tanh(in_passage_repres) in_passage_repres = tf.reshape( in_passage_repres, [batch_size, passage_len, options.compress_input_dim]) input_dim = options.compress_input_dim in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate)) # if is_training: # in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate)) # 
else: # in_passage_repres = tf.multiply(in_passage_repres, (1 - options.dropout_rate)) passage_mask = tf.sequence_mask( self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] ## Blocks for i in range(options.num_blocks): with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE): # self-attention enc = multihead_attention(queries=in_passage_word_repres, keys=in_passage_repres, values=in_passage_repres, num_heads=options.num_heads, dropout_rate=options.dropout_rate, training=is_training, causality=False) # feed forward enc = ff(enc, num_units=[options.d_ff, options.d_model]) ## Blocks memory = enc # sequential context matching passage_forward = None passage_backward = None all_passage_representation = [] passage_dim = 0 with_lstm = True if with_lstm: with tf.variable_scope('biLSTM'): # cur_in_passage_repres = in_passage_repres cur_in_passage_repres = enc for i in xrange(options.context_layer_num): with tf.variable_scope('layer-{}'.format(i)): with tf.variable_scope('context_represent'): # parameters context_lstm_cell_fw = tf.contrib.rnn.LSTMCell( options.context_lstm_dim) context_lstm_cell_bw = tf.contrib.rnn.LSTMCell( options.context_lstm_dim) if is_training: context_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper( context_lstm_cell_fw, output_keep_prob=(1 - options.dropout_rate)) context_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper( context_lstm_cell_bw, output_keep_prob=(1 - options.dropout_rate)) # passage representation ((passage_context_representation_fw, passage_context_representation_bw), (passage_forward, passage_backward )) = tf.nn.bidirectional_dynamic_rnn( context_lstm_cell_fw, context_lstm_cell_bw, cur_in_passage_repres, dtype=tf.float32, sequence_length=self.passage_lengths ) # [batch_size, passage_len, context_lstm_dim] if options.direction == 'forward': # [batch_size, passage_len, context_lstm_dim] cur_in_passage_repres = passage_context_representation_fw passage_dim += options.context_lstm_dim elif options.direction == 'backward': # [batch_size, passage_len, context_lstm_dim] cur_in_passage_repres = passage_context_representation_bw passage_dim += options.context_lstm_dim elif options.direction == 'bidir': # [batch_size, passage_len, 2*context_lstm_dim] cur_in_passage_repres = tf.concat([ passage_context_representation_fw, passage_context_representation_bw ], 2) passage_dim += 2 * options.context_lstm_dim else: assert False all_passage_representation.append( cur_in_passage_repres) all_passage_representation = tf.concat( all_passage_representation, 2) # [batch_size, passage_len, passage_dim] if is_training: all_passage_representation = tf.nn.dropout( all_passage_representation, (1 - options.dropout_rate)) else: all_passage_representation = tf.multiply( all_passage_representation, (1 - options.dropout_rate)) # ======Highway layer====== if options.with_match_highway: with tf.variable_scope("context_highway"): all_passage_representation = match_utils.multi_highway_layer( all_passage_representation, passage_dim, options.highway_layer_num) all_passage_representation = all_passage_representation * tf.expand_dims( passage_mask, axis=-1) # initial state for the LSTM decoder #''' with tf.variable_scope('initial_state_for_decoder'): # Define weights and biases to reduce the cell and reduce the state w_reduce_c = tf.get_variable( 'w_reduce_c', [2 * options.context_lstm_dim, options.gen_hidden_size], dtype=tf.float32) w_reduce_h = tf.get_variable( 'w_reduce_h', [2 * options.context_lstm_dim, options.gen_hidden_size], dtype=tf.float32) bias_reduce_c = 
tf.get_variable('bias_reduce_c', [options.gen_hidden_size], dtype=tf.float32)
            bias_reduce_h = tf.get_variable('bias_reduce_h', [options.gen_hidden_size], dtype=tf.float32)

            # concatenate the final forward/backward LSTM states and reduce them
            # to the decoder's hidden size
            old_c = tf.concat(values=[passage_forward.c, passage_backward.c], axis=1)
            old_h = tf.concat(values=[passage_forward.h, passage_backward.h], axis=1)
            new_c = tf.nn.tanh(tf.matmul(old_c, w_reduce_c) + bias_reduce_c)
            new_h = tf.nn.tanh(tf.matmul(old_h, w_reduce_h) + bias_reduce_h)
            init_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
            # disabled alternative: start the decoder from an all-zero state
            # new_c = tf.zeros([batch_size, options.gen_hidden_size])
            # new_h = tf.zeros([batch_size, options.gen_hidden_size])
            # init_state = LSTMStateTuple(new_c, new_h)

        return (passage_dim, all_passage_representation, init_state, memory)
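# ---- Illustration (not part of the original model) -------------------------
# encode() adds positional_encoding(...) to the word embeddings before the
# self-attention blocks; that helper is not defined in this file. The sketch below
# assumes it is the standard sinusoidal encoding (an assumption -- the real
# positional_encoding may differ) and shows the table it would produce:
import numpy as np

def sinusoidal_positional_encoding(max_len, d_model):
    """[max_len, d_model] table of sin/cos position features."""
    pos = np.arange(max_len)[:, None]            # [max_len, 1]
    i = np.arange(d_model)[None, :]              # [1, d_model]
    angle = pos / np.power(10000.0, (2 * (i // 2)) / float(d_model))
    table = np.zeros((max_len, d_model))
    table[:, 0::2] = np.sin(angle[:, 0::2])      # even dimensions: sin
    table[:, 1::2] = np.cos(angle[:, 1::2])      # odd dimensions: cos
    return table

# broadcast over the batch and add to the [batch_size, passage_len, word_dim] embeddings
pe = sinusoidal_positional_encoding(max_len=50, d_model=300)
print(pe.shape)  # (50, 300)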
def __init__(self, num_classes, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None, dropout_rate=0.5, learning_rate=0.001, optimize_type='adam',lambda_l2=1e-5, with_word=True, with_char=True, with_POS=True, with_NER=True, char_lstm_dim=20, context_lstm_dim=100, aggregation_lstm_dim=200, is_training=True,filter_layer_threshold=0.2, MP_dim=50, context_layer_num=1,aggregation_layer_num=1, fix_word_vec=False,with_filter_layer=True, with_highway=False, with_lex_features=False,lex_dim=100,word_level_MP_dim=-1,sep_endpoint=False,end_model_combine=False,with_match_highway=False, with_aggregation_highway=False,highway_layer_num=1,with_lex_decomposition=False, lex_decompsition_dim=-1, with_left_match=True, with_right_match=True, with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True): # ======word representation layer====== in_question_repres = [] in_passage_repres = [] self.question_lengths = tf.placeholder(tf.int32, [None]) self.passage_lengths = tf.placeholder(tf.int32, [None]) self.truth = tf.placeholder(tf.int32, [None]) # [batch_size] input_dim = 0 if with_word and word_vocab is not None: self.in_question_words = tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_words = tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len] # self.word_embedding = tf.get_variable("word_embedding", shape=[word_vocab.size()+1, word_vocab.word_dim], initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) word_vec_trainable = True cur_device = '/gpu:0' if fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.device(cur_device): self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable, initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] in_question_repres.append(in_question_word_repres) in_passage_repres.append(in_passage_word_repres) input_shape = tf.shape(self.in_question_words) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_words) passage_len = input_shape[1] input_dim += word_vocab.word_dim if with_POS and POS_vocab is not None: self.in_question_POSs = tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_POSs = tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len] # self.POS_embedding = tf.get_variable("POS_embedding", shape=[POS_vocab.size()+1, POS_vocab.word_dim], initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32) self.POS_embedding = tf.get_variable("POS_embedding", initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32) in_question_POS_repres = tf.nn.embedding_lookup(self.POS_embedding, self.in_question_POSs) # [batch_size, question_len, POS_dim] in_passage_POS_repres = tf.nn.embedding_lookup(self.POS_embedding, self.in_passage_POSs) # [batch_size, passage_len, POS_dim] in_question_repres.append(in_question_POS_repres) in_passage_repres.append(in_passage_POS_repres) input_shape = tf.shape(self.in_question_POSs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_POSs) passage_len = input_shape[1] input_dim += POS_vocab.word_dim if with_NER and NER_vocab is not None: self.in_question_NERs = tf.placeholder(tf.int32, 
[None, None]) # [batch_size, question_len] self.in_passage_NERs = tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len] # self.NER_embedding = tf.get_variable("NER_embedding", shape=[NER_vocab.size()+1, NER_vocab.word_dim], initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32) self.NER_embedding = tf.get_variable("NER_embedding", initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32) in_question_NER_repres = tf.nn.embedding_lookup(self.NER_embedding, self.in_question_NERs) # [batch_size, question_len, NER_dim] in_passage_NER_repres = tf.nn.embedding_lookup(self.NER_embedding, self.in_passage_NERs) # [batch_size, passage_len, NER_dim] in_question_repres.append(in_question_NER_repres) in_passage_repres.append(in_passage_NER_repres) input_shape = tf.shape(self.in_question_NERs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_NERs) passage_len = input_shape[1] input_dim += NER_vocab.word_dim if with_char and char_vocab is not None: self.question_char_lengths = tf.placeholder(tf.int32, [None,None]) # [batch_size, question_len] self.passage_char_lengths = tf.placeholder(tf.int32, [None,None]) # [batch_size, passage_len] self.in_question_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len] self.in_passage_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len] input_shape = tf.shape(self.in_question_chars) batch_size = input_shape[0] question_len = input_shape[1] q_char_len = input_shape[2] input_shape = tf.shape(self.in_passage_chars) passage_len = input_shape[1] p_char_len = input_shape[2] char_dim = char_vocab.word_dim # self.char_embedding = tf.get_variable("char_embedding", shape=[char_vocab.size()+1, char_vocab.word_dim], initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32) self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32) in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars) # [batch_size, question_len, q_char_len, char_dim] in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim]) question_char_lengths = tf.reshape(self.question_char_lengths, [-1]) in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars) # [batch_size, passage_len, p_char_len, char_dim] in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim]) passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1]) with tf.variable_scope('char_lstm'): # lstm cell char_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(char_lstm_dim) # dropout if is_training: char_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(char_lstm_cell, output_keep_prob=(1 - dropout_rate)) char_lstm_cell = tf.nn.rnn_cell.MultiRNNCell([char_lstm_cell]) # question_representation question_char_outputs = my_rnn.dynamic_rnn(char_lstm_cell, in_question_char_repres, sequence_length=question_char_lengths,dtype=tf.float32)[0] # [batch_size*question_len, q_char_len, char_lstm_dim] question_char_outputs = question_char_outputs[:,-1,:] question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, char_lstm_dim]) tf.get_variable_scope().reuse_variables() # passage representation passage_char_outputs = my_rnn.dynamic_rnn(char_lstm_cell, in_passage_char_repres, sequence_length=passage_char_lengths,dtype=tf.float32)[0] # [batch_size*question_len, q_char_len, char_lstm_dim] 
passage_char_outputs = passage_char_outputs[:, -1, :]
                passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, char_lstm_dim])

            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)
            input_dim += char_lstm_dim

        in_question_repres = tf.concat(in_question_repres, 2)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(in_passage_repres, 2)  # [batch_size, passage_len, dim]

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate))
        else:
            in_question_repres = tf.multiply(in_question_repres, (1 - dropout_rate))
            in_passage_repres = tf.multiply(in_passage_repres, (1 - dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

        # ======Highway layer======
        if with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, highway_layer_num)

        # ========Bilateral Matching=====
        (match_representation, match_dim) = match_utils.bilateral_match_func2(in_question_repres, in_passage_repres,
                        self.question_lengths, self.passage_lengths, question_mask, mask, MP_dim, input_dim,
                        with_filter_layer, context_layer_num, context_lstm_dim, is_training, dropout_rate,
                        with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num,
                        with_aggregation_highway, with_lex_decomposition, lex_decompsition_dim,
                        with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match,
                        with_left_match, with_right_match)

        # ========Prediction Layer=========
        w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - dropout_rate))
        else:
            logits = tf.multiply(logits, (1 - dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)
        # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example')
        # self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy')
        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        # gold_matrix = tf.one_hot(self.truth, num_classes)
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        self.predictions = tf.argmax(self.prob, 1)

        if optimize_type == 'adadelta':
            clipper = 50
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
            tvars = tf.trainable_variables()
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        elif optimize_type == 'sgd':
            self.global_step = tf.Variable(0, name='global_step', trainable=False)  # Create a variable to 
track the global step. min_lr = 0.000001 self._lr_rate = tf.maximum(min_lr, tf.train.exponential_decay(learning_rate, self.global_step, 30000, 0.98)) self.train_op = tf.train.GradientDescentOptimizer(learning_rate=self._lr_rate).minimize(self.loss) elif optimize_type == 'ema': tvars = tf.trainable_variables() train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss) # Create an ExponentialMovingAverage object ema = tf.train.ExponentialMovingAverage(decay=0.9999) # Create the shadow variables, and add ops to maintain moving averages # of var0 and var1. maintain_averages_op = ema.apply(tvars) # Create an op that will update the moving averages after each training # step. This is what we will use in place of the usual training op. with tf.control_dependencies([train_op]): self.train_op = tf.group(maintain_averages_op) elif optimize_type == 'adam': clipper = 50 optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(zip(grads, tvars)) extra_train_ops = [] train_ops = [self.train_op] + extra_train_ops self.train_op = tf.group(*train_ops)
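# ---- Illustration (not part of the original model) -------------------------
# The 'adadelta'/'adam' branches above share one pattern: add an L2 penalty over
# non-bias variables, clip gradients by global norm, then apply them. A condensed,
# self-contained TF 1.x sketch of that pattern with a toy two-class model
# (the constants mirror clipper=50 and lambda_l2 from the constructor):
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
y = tf.placeholder(tf.float32, [None, 2])
w = tf.get_variable("w_toy", [4, 2], dtype=tf.float32)
b = tf.get_variable("b_toy", [2], dtype=tf.float32)
logits = tf.matmul(x, w) + b
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))

lambda_l2, clipper = 1e-5, 50
tvars = tf.trainable_variables()
# L2 over matrices only (ndims > 1), exactly as in the optimizer branches above
l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
loss = loss + lambda_l2 * l2_loss

optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), clipper)
train_op = optimizer.apply_gradients(zip(grads, tvars))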
def __init__(self, num_classes, word_vocab=None, dropout_rate=0.5, learning_rate=0.001, optimize_type='adam', lambda_l2=1e-5, with_word=True, context_lstm_dim=100, aggregation_lstm_dim=200, is_training=True, MP_dim=50, context_layer_num=1, aggregation_layer_num=1, fix_word_vec=True, with_filter_layer=True, with_highway=True, with_match_highway=False, with_aggregation_highway=False, highway_layer_num=1, with_lex_decomposition=False, lex_decompsition_dim=-1, with_left_match=True, with_right_match=True, with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True): with tf.name_scope("Train" if is_training else "Test"): # ======word representation layer====== in_question_repres = [] in_passage_repres = [] self.question_lengths = tf.placeholder(tf.int32, [None], name="question_lengths") # [batch_size]:[2,2,3,...,10] self.passage_lengths = tf.placeholder(tf.int32, [None], name="passage_lengths") self.truth = tf.placeholder(tf.int32, [None], name="truth") # [batch_size] print ("self.truth.name: ", self.truth.name) input_dim = 0 if with_word and word_vocab is not None: self.in_question_words = tf.placeholder(tf.int32, [None, None], name="question_words") # [batch_size, question_len] self.in_passage_words = tf.placeholder(tf.int32, [None, None], name="passage_words") # [batch_size, passage_len] print ("self.in_passage_words.name: ", self.in_passage_words.name) word_vec_trainable = True cur_device = '/gpu:0' if fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' with tf.device(cur_device): wordInitial = tf.constant(word_vocab.word_vecs) self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable, initializer=wordInitial, dtype=tf.float32) in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] in_question_repres.append(in_question_word_repres) # [1, batch_size, question_len, word_dim] in_passage_repres.append(in_passage_word_repres) input_shape = tf.shape(self.in_question_words) # [batch_size, question_len] batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_words) # [batch_size, question_len] passage_len = input_shape[1] input_dim += len(word_vocab.word_vecs[0]) print("input_dim:", input_dim) self.in_ques = in_question_repres self.in_question_repres = in_question_repres = tf.concat(in_question_repres, 2) # [batch_size, question_len, dim] in_passage_repres = tf.concat(in_passage_repres, 2) # [batch_size, passage_len, dim] if is_training: in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate)) in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate)) else: in_question_repres = tf.multiply(in_question_repres, (1 - dropout_rate)) in_passage_repres = tf.multiply(in_passage_repres, (1 - dropout_rate)) '''补充0到passage_len长度 [[1. 1. 1. 1. 1. 1. 0. 0.] [1. 1. 1. 1. 1. 1. 0. 0.] [1. 1. 1. 1. 1. 1. 0. 
0.]] ''' mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len], question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] # ======Highway layer====== if with_highway: with tf.variable_scope("input_highway"): in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, highway_layer_num) tf.get_variable_scope().reuse_variables() in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, highway_layer_num) # ========Bilateral Matching===== (match_representation, match_dim) = match_utils.bilateral_match_func2(in_question_repres, in_passage_repres, self.question_lengths, self.passage_lengths, question_mask, mask, MP_dim, input_dim, with_filter_layer, context_layer_num, context_lstm_dim, is_training, dropout_rate, with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num, with_aggregation_highway, with_lex_decomposition, lex_decompsition_dim, with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match, with_left_match, with_right_match) # ========Prediction Layer========= w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32) b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32) w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32) b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32) logits = tf.matmul(match_representation, w_0) + b_0 logits = tf.tanh(logits) if is_training: logits = tf.nn.dropout(logits, (1 - dropout_rate)) else: logits = tf.multiply(logits, (1 - dropout_rate)) logits = tf.matmul(logits, w_1) + b_1 self.prob = tf.nn.softmax(logits, name='prob') print "prob: ", self.prob.name gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix)) correct = tf.nn.in_top_k(logits, self.truth, 1) self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) self.predictions = tf.argmax(self.prob, 1) if optimize_type == 'adadelta': clipper = 50 optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(zip(grads, tvars)) elif optimize_type == 'sgd': self.global_step = tf.Variable(0, name='global_step', trainable=False) # Create a variable to track the global step. min_lr = 0.000001 self._lr_rate = tf.maximum(min_lr, tf.train.exponential_decay(learning_rate, self.global_step, 30000, 0.98)) self.train_op = tf.train.GradientDescentOptimizer(learning_rate=self._lr_rate).minimize(self.loss) elif optimize_type == 'ema': tvars = tf.trainable_variables() train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss) # Create an ExponentialMovingAverage object ema = tf.train.ExponentialMovingAverage(decay=0.9999) # Create the shadow variables, and add ops to maintain moving averages # of var0 and var1. maintain_averages_op = ema.apply(tvars) # Create an op that will update the moving averages after each training # step. This is what we will use in place of the usual training op. 
with tf.control_dependencies([train_op]): self.train_op = tf.group(maintain_averages_op) elif optimize_type == 'adam': clipper = 50 optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(zip(grads, tvars)) extra_train_ops = [] train_ops = [self.train_op] + extra_train_ops self.train_op = tf.group(*train_ops)
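# ---- Illustration (not part of the original model) -------------------------
# Both constructors build `mask` / `question_mask` with tf.sequence_mask, which
# turns a vector of true lengths into a 0/1 matrix over the padded time axis
# (this is what the padding comment above the mask lines describes). A tiny
# runnable example, TF 1.x session style:
import tensorflow as tf

passage_lengths = tf.constant([6, 6, 6])  # true length of each example
passage_len = 8                           # padded length of the batch
mask = tf.sequence_mask(passage_lengths, passage_len, dtype=tf.float32)

with tf.Session() as sess:
    print(sess.run(mask))
    # [[1. 1. 1. 1. 1. 1. 0. 0.]
    #  [1. 1. 1. 1. 1. 1. 0. 0.]
    #  [1. 1. 1. 1. 1. 1. 0. 0.]]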
def __init__(self, word_vocab=None, edge_label_vocab=None, char_vocab=None, is_training=True, options=None): assert options != None self.passage_nodes_size = tf.placeholder(tf.int32, [None]) # [batch_size] self.passage_nodes = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_nodes_size_max] if options.with_char: self.passage_nodes_chars_size = tf.placeholder( tf.int32, [None, None]) self.passage_nodes_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, passage_nodes_size_max, passage_neighbors_size_max] self.passage_in_neighbor_indices = tf.placeholder( tf.int32, [None, None, None]) self.passage_in_neighbor_edges = tf.placeholder( tf.int32, [None, None, None]) self.passage_in_neighbor_mask = tf.placeholder(tf.float32, [None, None, None]) # [batch_size, passage_nodes_size_max, passage_neighbors_size_max] self.passage_out_neighbor_indices = tf.placeholder( tf.int32, [None, None, None]) self.passage_out_neighbor_edges = tf.placeholder( tf.int32, [None, None, None]) self.passage_out_neighbor_mask = tf.placeholder( tf.float32, [None, None, None]) # shapes input_shape = tf.shape(self.passage_in_neighbor_indices) batch_size = input_shape[0] passage_nodes_size_max = input_shape[1] passage_in_neighbors_size_max = input_shape[2] passage_out_neighbors_size_max = tf.shape( self.passage_out_neighbor_indices)[2] if options.with_char: passage_nodes_chars_size_max = tf.shape( self.passage_nodes_chars)[2] # masks # [batch_size, passage_nodes_size_max] self.passage_nodes_mask = tf.sequence_mask(self.passage_nodes_size, passage_nodes_size_max, dtype=tf.float32) # embeddings if options.fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' else: word_vec_trainable = True cur_device = '/gpu:0' with tf.device(cur_device): self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable, initializer=tf.constant( word_vocab.word_vecs), dtype=tf.float32) self.edge_embedding = tf.get_variable("edge_embedding", initializer=tf.constant( edge_label_vocab.word_vecs), dtype=tf.float32) word_dim = word_vocab.word_dim edge_dim = edge_label_vocab.word_dim if options.with_char: self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant( char_vocab.word_vecs), dtype=tf.float32) char_dim = char_vocab.word_dim # word representation for nodes, where each node only includes one word # [batch_size, passage_nodes_size_max, word_dim] passage_node_representation = tf.nn.embedding_lookup( self.word_embedding, self.passage_nodes) if options.with_char: # [batch_size, passage_nodes_size_max, passage_nodes_chars_size_max, char_dim] passage_nodes_chars_representation = tf.nn.embedding_lookup( self.char_embedding, self.passage_nodes_chars) passage_nodes_chars_representation = tf.reshape( passage_nodes_chars_representation, shape=[ batch_size * passage_nodes_size_max, passage_nodes_chars_size_max, char_dim ]) passage_nodes_chars_size = tf.reshape( self.passage_nodes_chars_size, [batch_size * passage_nodes_size_max]) with tf.variable_scope('node_char_lstm'): node_char_lstm_cell = tf.contrib.rnn.LSTMCell( options.char_lstm_dim) node_char_lstm_cell = tf.contrib.rnn.MultiRNNCell( [node_char_lstm_cell]) # [batch_size*node_num, char_num, char_lstm_dim] node_char_outputs = tf.nn.dynamic_rnn( node_char_lstm_cell, passage_nodes_chars_representation, sequence_length=passage_nodes_chars_size, dtype=tf.float32)[0] node_char_outputs = collect_final_step_lstm( node_char_outputs, passage_nodes_chars_size - 1) # [batch_size, node_num, char_lstm_dim] node_char_outputs = 
tf.reshape(node_char_outputs, [ batch_size, passage_nodes_size_max, options.char_lstm_dim ]) if options.with_char: input_dim = word_dim + options.char_lstm_dim passage_node_representation = tf.concat( [passage_node_representation, node_char_outputs], 2) else: input_dim = word_dim passage_node_representation = passage_node_representation # apply the mask passage_node_representation = passage_node_representation * tf.expand_dims( self.passage_nodes_mask, axis=-1) if options.compress_input: # compress input word vector into smaller vectors w_compress = tf.get_variable( "w_compress_input", [input_dim, options.compress_input_dim], dtype=tf.float32) b_compress = tf.get_variable("b_compress_input", [options.compress_input_dim], dtype=tf.float32) passage_node_representation = tf.reshape( passage_node_representation, [-1, input_dim]) passage_node_representation = tf.matmul( passage_node_representation, w_compress) + b_compress passage_node_representation = tf.tanh(passage_node_representation) passage_node_representation = tf.reshape(passage_node_representation, \ [batch_size, passage_nodes_size_max, options.compress_input_dim]) input_dim = options.compress_input_dim if is_training: passage_node_representation = tf.nn.dropout( passage_node_representation, (1 - options.dropout_rate)) # ======Highway layer====== if options.with_highway: with tf.variable_scope("input_highway"): passage_node_representation = match_utils.multi_highway_layer( passage_node_representation, input_dim, options.highway_layer_num) self.input_dim = input_dim with tf.variable_scope('graph_encoder'): # =========== in neighbor # [batch_size, passage_len, passage_neighbors_size_max, edge_dim] passage_in_neighbor_edge_representations = tf.nn.embedding_lookup( self.edge_embedding, self.passage_in_neighbor_edges) # [batch_size, passage_len, passage_neighbors_size_max, node_dim] passage_in_neighbor_node_representations = collect_neighbor_node_representations( passage_node_representation, self.passage_in_neighbor_indices) passage_in_neighbor_representations = tf.concat( \ [passage_in_neighbor_node_representations, passage_in_neighbor_edge_representations], 3) passage_in_neighbor_representations = tf.multiply( passage_in_neighbor_representations, tf.expand_dims(self.passage_in_neighbor_mask, axis=-1)) # [batch_size, passage_len, node_dim + edge_dim] passage_in_neighbor_representations = tf.reduce_sum( passage_in_neighbor_representations, axis=2) # ============ out neighbor # [batch_size, passage_len, passage_neighbors_size_max, edge_dim] passage_out_neighbor_edge_representations = tf.nn.embedding_lookup( self.edge_embedding, self.passage_out_neighbor_edges) # [batch_size, passage_len, passage_neighbors_size_max, node_dim] passage_out_neighbor_node_representations = collect_neighbor_node_representations( passage_node_representation, self.passage_out_neighbor_indices) passage_out_neighbor_representations = tf.concat( \ [passage_out_neighbor_node_representations, passage_out_neighbor_edge_representations], 3) passage_out_neighbor_representations = tf.multiply( passage_out_neighbor_representations, tf.expand_dims(self.passage_out_neighbor_mask, axis=-1)) # [batch_size, passage_len, node_dim + edge_dim] passage_out_neighbor_representations = tf.reduce_sum( passage_out_neighbor_representations, axis=2) # =====transpose neighbor_representations grn_hidden_dim = options.neighbor_vector_dim w_trans = tf.get_variable("w_trans", [input_dim + edge_dim, grn_hidden_dim], dtype=tf.float32) b_trans = tf.get_variable("b_trans", [grn_hidden_dim], 
dtype=tf.float32) passage_in_neighbor_representations = tf.reshape( passage_in_neighbor_representations, [-1, input_dim + edge_dim]) passage_in_neighbor_representations = tf.matmul( passage_in_neighbor_representations, w_trans) + b_trans passage_in_neighbor_representations = tf.tanh( passage_in_neighbor_representations) passage_out_neighbor_representations = tf.reshape( passage_out_neighbor_representations, [-1, input_dim + edge_dim]) passage_out_neighbor_representations = tf.matmul( passage_out_neighbor_representations, w_trans) + b_trans passage_out_neighbor_representations = tf.tanh( passage_out_neighbor_representations) # assume each node has a neighbor vector, and it is None at the beginning passage_node_hidden = tf.zeros( [batch_size, passage_nodes_size_max, grn_hidden_dim]) passage_node_cell = tf.zeros( [batch_size, passage_nodes_size_max, grn_hidden_dim]) w_in_ingate = tf.get_variable("w_in_ingate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) u_in_ingate = tf.get_variable("u_in_ingate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) b_ingate = tf.get_variable("b_in_ingate", [grn_hidden_dim], dtype=tf.float32) w_out_ingate = tf.get_variable("w_out_ingate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) u_out_ingate = tf.get_variable("u_out_ingate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) w_in_forgetgate = tf.get_variable("w_in_forgetgate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) u_in_forgetgate = tf.get_variable("u_in_forgetgate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) b_forgetgate = tf.get_variable("b_in_forgetgate", [grn_hidden_dim], dtype=tf.float32) w_out_forgetgate = tf.get_variable( "w_out_forgetgate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) u_out_forgetgate = tf.get_variable( "u_out_forgetgate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) w_in_outgate = tf.get_variable("w_in_outgate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) u_in_outgate = tf.get_variable("u_in_outgate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) b_outgate = tf.get_variable("b_in_outgate", [grn_hidden_dim], dtype=tf.float32) w_out_outgate = tf.get_variable("w_out_outgate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) u_out_outgate = tf.get_variable("u_out_outgate", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) w_in_cell = tf.get_variable("w_in_cell", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) u_in_cell = tf.get_variable("u_in_cell", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) b_cell = tf.get_variable("b_in_cell", [grn_hidden_dim], dtype=tf.float32) w_out_cell = tf.get_variable("w_out_cell", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) u_out_cell = tf.get_variable("u_out_cell", [grn_hidden_dim, grn_hidden_dim], dtype=tf.float32) # calculate question graph representation graph_representations = [] for i in xrange(options.num_syntax_match_layer): # =============== in edge hidden # h_{ij} [batch_size, node_len, neighbors_size, neighbor_vector_dim] passage_in_edge_prev_hidden = collect_neighbor_node_representations( passage_node_hidden, self.passage_in_neighbor_indices) passage_in_edge_prev_hidden = tf.multiply( passage_in_edge_prev_hidden, tf.expand_dims(self.passage_in_neighbor_mask, axis=-1)) # [batch_size, node_len, neighbor_vector_dim] passage_in_edge_prev_hidden = tf.reduce_sum( passage_in_edge_prev_hidden, axis=2) passage_in_edge_prev_hidden = tf.multiply( passage_in_edge_prev_hidden, tf.expand_dims(self.passage_nodes_mask, axis=-1)) passage_in_edge_prev_hidden = tf.reshape( 
passage_in_edge_prev_hidden, [-1, grn_hidden_dim]) # =============== out edge hidden # h_{jk} [batch_size, node_len, neighbors_size, neighbor_vector_dim] passage_out_edge_prev_hidden = collect_neighbor_node_representations( passage_node_hidden, self.passage_out_neighbor_indices) passage_out_edge_prev_hidden = tf.multiply( passage_out_edge_prev_hidden, tf.expand_dims(self.passage_out_neighbor_mask, axis=-1)) # [batch_size, node_len, neighbor_vector_dim] passage_out_edge_prev_hidden = tf.reduce_sum( passage_out_edge_prev_hidden, axis=2) passage_out_edge_prev_hidden = tf.multiply( passage_out_edge_prev_hidden, tf.expand_dims(self.passage_nodes_mask, axis=-1)) passage_out_edge_prev_hidden = tf.reshape( passage_out_edge_prev_hidden, [-1, grn_hidden_dim]) ## ig passage_edge_ingate = tf.sigmoid( tf.matmul(passage_in_neighbor_representations, w_in_ingate) + tf.matmul(passage_in_edge_prev_hidden, u_in_ingate) + tf.matmul(passage_out_neighbor_representations, w_out_ingate) + tf.matmul(passage_out_edge_prev_hidden, u_out_ingate) + b_ingate) passage_edge_ingate = tf.reshape( passage_edge_ingate, [batch_size, passage_nodes_size_max, grn_hidden_dim]) ## fg passage_edge_forgetgate = tf.sigmoid( tf.matmul(passage_in_neighbor_representations, w_in_forgetgate) + tf.matmul(passage_in_edge_prev_hidden, u_in_forgetgate) + tf.matmul(passage_out_neighbor_representations, w_out_forgetgate) + tf.matmul(passage_out_edge_prev_hidden, u_out_forgetgate) + b_forgetgate) passage_edge_forgetgate = tf.reshape( passage_edge_forgetgate, [batch_size, passage_nodes_size_max, grn_hidden_dim]) ## og passage_edge_outgate = tf.sigmoid( tf.matmul(passage_in_neighbor_representations, w_in_outgate) + tf.matmul(passage_in_edge_prev_hidden, u_in_outgate) + tf.matmul(passage_out_neighbor_representations, w_out_outgate) + tf.matmul(passage_out_edge_prev_hidden, u_out_outgate) + b_outgate) passage_edge_outgate = tf.reshape( passage_edge_outgate, [batch_size, passage_nodes_size_max, grn_hidden_dim]) ## input passage_edge_cell_input = tf.tanh( tf.matmul(passage_in_neighbor_representations, w_in_cell) + tf.matmul(passage_in_edge_prev_hidden, u_in_cell) + tf.matmul(passage_out_neighbor_representations, w_out_cell) + tf.matmul(passage_out_edge_prev_hidden, u_out_cell) + b_cell) passage_edge_cell_input = tf.reshape( passage_edge_cell_input, [batch_size, passage_nodes_size_max, grn_hidden_dim]) passage_edge_cell = passage_edge_forgetgate * passage_node_cell + passage_edge_ingate * passage_edge_cell_input passage_edge_hidden = passage_edge_outgate * tf.tanh( passage_edge_cell) # node mask # [batch_size, passage_len, neighbor_vector_dim] passage_node_cell = tf.multiply( passage_edge_cell, tf.expand_dims(self.passage_nodes_mask, axis=-1)) passage_node_hidden = tf.multiply( passage_edge_hidden, tf.expand_dims(self.passage_nodes_mask, axis=-1)) graph_representations.append(passage_node_hidden) # decide how to use graph_representations self.graph_representations = graph_representations self.node_representations = passage_node_representation self.graph_hiddens = passage_node_hidden self.graph_cells = passage_node_cell self.batch_size = batch_size
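# ---- Illustration (not part of the original model) -------------------------
# The xrange loop above is a graph-state LSTM: at every layer each node aggregates
# the hidden states of its in-/out-neighbors and then updates its cell/hidden state
# with LSTM-style gates. The NumPy sketch below shows one such update, but collapses
# the separate in-/out-neighbor weight matrices into a single pair per gate, so it
# is a simplification rather than a line-for-line copy of the code above:
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def graph_state_update(x_neigh, h_neigh, c_prev, W, U, b):
    """One gated update for a batch of nodes.
    x_neigh: [nodes, d] aggregated neighbor inputs (node + edge embeddings, transformed)
    h_neigh: [nodes, d] aggregated neighbor hidden states from the previous step
    c_prev:  [nodes, d] previous cell state
    W, U, b: per-gate parameters ('i', 'f', 'o', 'c'), each [d, d] / [d]
    """
    i = sigmoid(np.dot(x_neigh, W['i']) + np.dot(h_neigh, U['i']) + b['i'])  # input gate
    f = sigmoid(np.dot(x_neigh, W['f']) + np.dot(h_neigh, U['f']) + b['f'])  # forget gate
    o = sigmoid(np.dot(x_neigh, W['o']) + np.dot(h_neigh, U['o']) + b['o'])  # output gate
    g = np.tanh(np.dot(x_neigh, W['c']) + np.dot(h_neigh, U['c']) + b['c'])  # candidate cell
    c = f * c_prev + i * g
    h = o * np.tanh(c)
    return h, c

d, n = 4, 5
rng = np.random.RandomState(0)
W = {k: rng.randn(d, d) for k in 'ifoc'}
U = {k: rng.randn(d, d) for k in 'ifoc'}
b = {k: np.zeros(d) for k in 'ifoc'}
h, c = graph_state_update(rng.randn(n, d), np.zeros((n, d)), np.zeros((n, d)), W, U, b)
print(h.shape, c.shape)  # (5, 4) (5, 4)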
def __init__(self, num_classes, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None, dropout_rate=0.5, learning_rate=0.001, optimize_type='adam', lambda_l2=1e-5, with_word=True, with_char=True, with_POS=True, with_NER=True, char_lstm_dim=20, context_lstm_dim=100, aggregation_lstm_dim=200, is_training=True, filter_layer_threshold=0.2, MP_dim=50, context_layer_num=1, aggregation_layer_num=1, fix_word_vec=False, with_filter_layer=True, with_highway=False, word_level_MP_dim=-1, sep_endpoint=False, end_model_combine=False, with_match_highway=False, with_aggregation_highway=False, highway_layer_num=1, match_to_passage=True, match_to_question=False, match_to_choice=False, with_no_match=False, with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True, use_options=False, num_options=-1, verbose=False, matching_option=0, concat_context=False, tied_aggre=False, rl_training_method='contrastive', rl_matches=[0, 1, 2]): ''' Matching Options: 0:a1=q->p, a2=c->p, [concat(a1->a2,a2->a1)] 1:a1=q->p, a2=c->p, [a1->a2,a2->a1] 2:[q->p,c->p] 3:a1=p->q, a2=p->c, [a1->a2,a2->a1] 4:[q->p,p->q,p->c] 5:a1=q->p, a2=p->q, a3=p->c,[a3->a1,a3->a2] 6:[p->q,p->c] 7: Gated matching concat_context: Concat question & choice and feed into context LSTM tied_aggre: aggregation layer weights are tied. training_method: contrastive reward or policy gradient or soft voting ''' # ======word representation layer====== in_question_repres = [] in_passage_repres = [] in_choice_repres = [] self.question_lengths = tf.placeholder(tf.int32, [None]) self.passage_lengths = tf.placeholder(tf.int32, [None]) self.choice_lengths = tf.placeholder(tf.int32, [None]) self.truth = tf.placeholder(tf.int32, [None]) # [batch_size] self.concat_idx_mat = None self.split_idx_mat_q = None self.split_idx_mat_c = None if matching_option == 7: self.concat_idx_mat = tf.placeholder(tf.int32, [None, None, 2]) if concat_context: self.split_idx_mat_q = tf.placeholder(tf.int32, [None, None, 2]) self.split_idx_mat_c = tf.placeholder(tf.int32, [None, None, 2]) input_dim = 0 if with_word and word_vocab is not None: self.in_question_words = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_words = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] self.in_choice_words = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] # self.word_embedding = tf.get_variable("word_embedding", shape=[word_vocab.size()+1, word_vocab.word_dim], initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) word_vec_trainable = True cur_device = '/gpu:0' if fix_word_vec: word_vec_trainable = False cur_device = '/cpu:0' print('!!!shape=', word_vocab.word_vecs.shape) with tf.device(cur_device): self.word_embedding = tf.get_variable( "word_embedding", trainable=word_vec_trainable, initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32) in_question_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim] in_passage_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim] in_choice_word_repres = tf.nn.embedding_lookup( self.word_embedding, self.in_choice_words) # [batch_size, passage_len, word_dim] in_question_repres.append(in_question_word_repres) in_passage_repres.append(in_passage_word_repres) in_choice_repres.append(in_choice_word_repres) input_shape = tf.shape(self.in_question_words) batch_size = input_shape[0] question_len = 
input_shape[1] input_shape = tf.shape(self.in_passage_words) passage_len = input_shape[1] input_shape = tf.shape(self.in_choice_words) choice_len = input_shape[1] input_dim += word_vocab.word_dim if with_POS and POS_vocab is not None: self.in_question_POSs = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_POSs = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] # self.POS_embedding = tf.get_variable("POS_embedding", shape=[POS_vocab.size()+1, POS_vocab.word_dim], initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32) self.POS_embedding = tf.get_variable("POS_embedding", initializer=tf.constant( POS_vocab.word_vecs), dtype=tf.float32) in_question_POS_repres = tf.nn.embedding_lookup( self.POS_embedding, self.in_question_POSs) # [batch_size, question_len, POS_dim] in_passage_POS_repres = tf.nn.embedding_lookup( self.POS_embedding, self.in_passage_POSs) # [batch_size, passage_len, POS_dim] in_question_repres.append(in_question_POS_repres) in_passage_repres.append(in_passage_POS_repres) input_shape = tf.shape(self.in_question_POSs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_POSs) passage_len = input_shape[1] input_dim += POS_vocab.word_dim if with_NER and NER_vocab is not None: self.in_question_NERs = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.in_passage_NERs = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] # self.NER_embedding = tf.get_variable("NER_embedding", shape=[NER_vocab.size()+1, NER_vocab.word_dim], initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32) self.NER_embedding = tf.get_variable("NER_embedding", initializer=tf.constant( NER_vocab.word_vecs), dtype=tf.float32) in_question_NER_repres = tf.nn.embedding_lookup( self.NER_embedding, self.in_question_NERs) # [batch_size, question_len, NER_dim] in_passage_NER_repres = tf.nn.embedding_lookup( self.NER_embedding, self.in_passage_NERs) # [batch_size, passage_len, NER_dim] in_question_repres.append(in_question_NER_repres) in_passage_repres.append(in_passage_NER_repres) input_shape = tf.shape(self.in_question_NERs) batch_size = input_shape[0] question_len = input_shape[1] input_shape = tf.shape(self.in_passage_NERs) passage_len = input_shape[1] input_dim += NER_vocab.word_dim if with_char and char_vocab is not None: self.question_char_lengths = tf.placeholder( tf.int32, [None, None]) # [batch_size, question_len] self.passage_char_lengths = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] self.choice_char_lengths = tf.placeholder( tf.int32, [None, None]) # [batch_size, passage_len] self.in_question_chars = tf.placeholder( tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len] self.in_passage_chars = tf.placeholder( tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len] self.in_choice_chars = tf.placeholder( tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len] input_shape = tf.shape(self.in_question_chars) batch_size = input_shape[0] question_len = input_shape[1] q_char_len = input_shape[2] input_shape = tf.shape(self.in_passage_chars) passage_len = input_shape[1] p_char_len = input_shape[2] input_shape = tf.shape(self.in_choice_chars) choice_len = input_shape[1] c_char_len = input_shape[2] char_dim = char_vocab.word_dim # self.char_embedding = tf.get_variable("char_embedding", shape=[char_vocab.size()+1, char_vocab.word_dim], initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32) 
self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant( char_vocab.word_vecs), dtype=tf.float32) in_question_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_question_chars ) # [batch_size, question_len, q_char_len, char_dim] in_question_char_repres = tf.reshape( in_question_char_repres, shape=[-1, q_char_len, char_dim]) question_char_lengths = tf.reshape(self.question_char_lengths, [-1]) in_passage_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_passage_chars ) # [batch_size, passage_len, p_char_len, char_dim] in_passage_char_repres = tf.reshape( in_passage_char_repres, shape=[-1, p_char_len, char_dim]) passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1]) in_choice_char_repres = tf.nn.embedding_lookup( self.char_embedding, self.in_choice_chars ) # [batch_size, passage_len, p_char_len, char_dim] in_choice_char_repres = tf.reshape( in_choice_char_repres, shape=[-1, c_char_len, char_dim]) choice_char_lengths = tf.reshape(self.choice_char_lengths, [-1]) with tf.variable_scope('char_lstm'): # lstm cell char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(char_lstm_dim) # dropout if is_training: char_lstm_cell = tf.contrib.rnn.DropoutWrapper( char_lstm_cell, output_keep_prob=(1 - dropout_rate)) char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell]) # question_representation question_char_outputs = my_rnn.dynamic_rnn( char_lstm_cell, in_question_char_repres, sequence_length=question_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] question_char_outputs = question_char_outputs[:, -1, :] question_char_outputs = tf.reshape( question_char_outputs, [batch_size, question_len, char_lstm_dim]) tf.get_variable_scope().reuse_variables() # passage representation passage_char_outputs = my_rnn.dynamic_rnn( char_lstm_cell, in_passage_char_repres, sequence_length=passage_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] passage_char_outputs = passage_char_outputs[:, -1, :] passage_char_outputs = tf.reshape( passage_char_outputs, [batch_size, passage_len, char_lstm_dim]) tf.get_variable_scope().reuse_variables() # choice representation choice_char_outputs = my_rnn.dynamic_rnn( char_lstm_cell, in_choice_char_repres, sequence_length=choice_char_lengths, dtype=tf.float32 )[0] # [batch_size*question_len, q_char_len, char_lstm_dim] choice_char_outputs = choice_char_outputs[:, -1, :] choice_char_outputs = tf.reshape( choice_char_outputs, [batch_size, choice_len, char_lstm_dim]) in_question_repres.append(question_char_outputs) in_passage_repres.append(passage_char_outputs) in_choice_repres.append(choice_char_outputs) input_dim += char_lstm_dim in_question_repres = tf.concat(in_question_repres, 2) # [batch_size, question_len, dim] in_passage_repres = tf.concat(in_passage_repres, 2) # [batch_size, passage_len, dim] in_choice_repres = tf.concat(in_choice_repres, 2) # [batch_size, passage_len, dim] if is_training: in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate)) in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate)) in_choice_repres = tf.nn.dropout(in_choice_repres, (1 - dropout_rate)) else: in_question_repres = tf.multiply(in_question_repres, (1 - dropout_rate)) in_passage_repres = tf.multiply(in_passage_repres, (1 - dropout_rate)) in_choice_repres = tf.multiply(in_choice_repres, (1 - dropout_rate)) mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len] question_mask = 
tf.sequence_mask( self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len] choice_mask = tf.sequence_mask( self.choice_lengths, choice_len, dtype=tf.float32) # [batch_size, question_len] # ======Highway layer====== if with_highway: with tf.variable_scope("input_highway"): in_question_repres = match_utils.multi_highway_layer( in_question_repres, input_dim, highway_layer_num) tf.get_variable_scope().reuse_variables() in_passage_repres = match_utils.multi_highway_layer( in_passage_repres, input_dim, highway_layer_num) tf.get_variable_scope().reuse_variables() in_choice_repres = match_utils.multi_highway_layer( in_choice_repres, input_dim, highway_layer_num) # ========Bilateral Matching===== if verbose: if matching_option == 7: (all_match_templates, match_dim, gate_input, self.matching_vectors) = gated_trilateral_match( in_question_repres, in_passage_repres, in_choice_repres, self.question_lengths, self.passage_lengths, self.choice_lengths, question_mask, mask, choice_mask, self.concat_idx_mat, self.split_idx_mat_q, self.split_idx_mat_c, MP_dim, input_dim, context_layer_num, context_lstm_dim, is_training, dropout_rate, with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num, with_aggregation_highway, with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match, concat_context, tied_aggre, rl_matches, debug=True) else: (match_representation, match_dim, self.matching_vectors) = match_utils.trilateral_match( in_question_repres, in_passage_repres, in_choice_repres, self.question_lengths, self.passage_lengths, self.choice_lengths, question_mask, mask, choice_mask, MP_dim, input_dim, context_layer_num, context_lstm_dim, is_training, dropout_rate, with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num, with_aggregation_highway, with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match, match_to_passage, match_to_question, match_to_choice, with_no_match, debug=True, matching_option=matching_option) else: if matching_option == 7: (all_match_templates, match_dim, gate_input) = gated_trilateral_match( in_question_repres, in_passage_repres, in_choice_repres, self.question_lengths, self.passage_lengths, self.choice_lengths, question_mask, mask, choice_mask, self.concat_idx_mat, self.split_idx_mat_q, self.split_idx_mat_c, MP_dim, input_dim, context_layer_num, context_lstm_dim, is_training, dropout_rate, with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num, with_aggregation_highway, with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match, concat_context, tied_aggre, rl_matches) else: (match_representation, match_dim) = match_utils.trilateral_match( in_question_repres, in_passage_repres, in_choice_repres, self.question_lengths, self.passage_lengths, self.choice_lengths, question_mask, mask, choice_mask, MP_dim, input_dim, context_layer_num, context_lstm_dim, is_training, dropout_rate, with_match_highway, aggregation_layer_num, aggregation_lstm_dim, highway_layer_num, with_aggregation_highway, with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match, match_to_passage, match_to_question, match_to_choice, with_no_match, matching_option=matching_option) if matching_option == 7: with tf.variable_scope('rl_decision_gate'): if use_options: gate_input = gate_input[::num_options, :] w_gate = tf.get_variable( 'w_gate', [2 * context_lstm_dim, len(rl_matches)], dtype=tf.float32) b_gate = 
tf.get_variable('b_gate', [len(rl_matches)], dtype=tf.float32) gate_logits = tf.matmul(gate_input, w_gate) + b_gate gate_prob = tf.nn.softmax(gate_logits) gate_log_prob = tf.nn.log_softmax(gate_logits) print('check: match_dim=', match_dim) #========Prediction Layer========= w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32) b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32) if use_options: w_1 = tf.get_variable("w_1", [match_dim / 2, 1], dtype=tf.float32) b_1 = tf.get_variable("b_1", [1], dtype=tf.float32) else: w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32) b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32) if matching_option == 7: sliced_gate_probs = tf.split(gate_prob, len(rl_matches), axis=1) sliced_gate_log_probs = tf.split(gate_log_prob, len(rl_matches), axis=1) # if use_options: # tile_times=tf.constant([1,num_options]) # else: # tile_times=tf.constant([1,num_classes]) weighted_probs = [] weighted_log_probs = [] for mid, matcher in enumerate(all_match_templates): matcher.add_softmax_pred(w_0, b_0, w_1, b_1, is_training, dropout_rate, use_options, num_options) weighted_probs.append( tf.multiply(matcher.prob, sliced_gate_probs[mid])) weighted_log_probs.append( tf.add(matcher.log_prob, sliced_gate_log_probs[mid])) if verbose: self.all_probs = tf.stack(weighted_probs, axis=0) self.prob = tf.add_n(weighted_probs) if use_options: gold_matrix = tf.reshape(self.truth, [-1, num_options]) gold_matrix = tf.cast(gold_matrix, tf.float32) correct = tf.equal(tf.argmax(self.prob, 1), tf.argmax(gold_matrix, 1)) else: gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) # gold_matrix = tf.one_hot(self.truth, num_classes) correct = tf.nn.in_top_k(logits, self.truth, 1) self.correct = correct self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) self.predictions = tf.arg_max(self.prob, 1) if rl_training_method == 'soft_voting': stacked_log_prob = tf.stack(weighted_log_probs, axis=2) self.log_prob = tf.reduce_logsumexp(stacked_log_prob, axis=2) self.loss = tf.reduce_mean( tf.multiply(gold_matrix, self.log_prob)) elif rl_training_method == 'contrastive': weighted_log_probs = tf.stack(weighted_log_probs, axis=0) weighted_probs = tf.stack(weighted_probs, axis=0) reward_matrix = gold_matrix baseline = tf.reduce_sum(tf.multiply(weighted_probs, reward_matrix), axis=[0, 2], keep_dims=True) log_coeffs = tf.multiply(weighted_probs, reward_matrix - baseline) log_coeffs = tf.stop_gradient(log_coeffs) self.loss = tf.negative( tf.reduce_sum(tf.multiply(weighted_log_probs, log_coeffs))) else: logits = tf.matmul(match_representation, w_0) + b_0 logits = tf.tanh(logits) if is_training: logits = tf.nn.dropout(logits, (1 - dropout_rate)) else: logits = tf.multiply(logits, (1 - dropout_rate)) logits = tf.matmul(logits, w_1) + b_1 self.final_logits = logits if use_options: logits = tf.reshape(logits, [-1, num_options]) gold_matrix = tf.reshape(self.truth, [-1, num_options]) self.prob = tf.nn.softmax(logits) # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example') # self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy') # gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) # gold_matrix = tf.one_hot(self.truth, num_classes) self.loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits=logits, labels=gold_matrix)) # correct = tf.nn.in_top_k(logits, self.truth, 1) # self.eval_correct = 
tf.reduce_sum(tf.cast(correct, tf.int32)) correct = tf.equal(tf.argmax(logits, 1), tf.argmax(gold_matrix, 1)) self.correct = correct else: self.prob = tf.nn.softmax(logits) # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example') # self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy') gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32) # gold_matrix = tf.one_hot(self.truth, num_classes) self.loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits=logits, labels=gold_matrix)) correct = tf.nn.in_top_k(logits, self.truth, 1) self.correct = correct self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32)) self.predictions = tf.arg_max(self.prob, 1) if optimize_type == 'adadelta': clipper = 50 optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n( [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(list(zip(grads, tvars))) elif optimize_type == 'sgd': self.global_step = tf.Variable( 0, name='global_step', trainable=False) # Create a variable to track the global step. min_lr = 0.000001 self._lr_rate = tf.maximum( min_lr, tf.train.exponential_decay(learning_rate, self.global_step, 30000, 0.98)) self.train_op = tf.train.GradientDescentOptimizer( learning_rate=self._lr_rate).minimize(self.loss) elif optimize_type == 'ema': tvars = tf.trainable_variables() train_op = tf.train.AdamOptimizer( learning_rate=learning_rate).minimize(self.loss) # Create an ExponentialMovingAverage object ema = tf.train.ExponentialMovingAverage(decay=0.9999) # Create the shadow variables, and add ops to maintain moving averages # of var0 and var1. maintain_averages_op = ema.apply(tvars) # Create an op that will update the moving averages after each training # step. This is what we will use in place of the usual training op. with tf.control_dependencies([train_op]): self.train_op = tf.group(maintain_averages_op) elif optimize_type == 'adam': clipper = 50 optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) tvars = tf.trainable_variables() l2_loss = tf.add_n( [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1]) self.loss = self.loss + lambda_l2 * l2_loss grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper) self.train_op = optimizer.apply_gradients(list(zip(grads, tvars))) extra_train_ops = [] train_ops = [self.train_op] + extra_train_ops self.train_op = tf.group(*train_ops)
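# ---- Illustration (not part of the original model) -------------------------
# The 'contrastive' branch above weights each matcher's class probabilities by the
# gate, measures how much probability mass lands on the gold classes, subtracts a
# per-example baseline, and uses the result as a fixed coefficient on the
# log-probabilities (via stop_gradient). A NumPy sketch of that loss value only
# (forward computation with made-up toy numbers; no gradient semantics here):
import numpy as np

num_matchers, batch_size, num_classes = 3, 2, 4
rng = np.random.RandomState(1)

gate_prob = rng.dirichlet(np.ones(num_matchers), size=batch_size).T[:, :, None]      # [matchers, batch, 1]
matcher_prob = rng.dirichlet(np.ones(num_classes), size=(num_matchers, batch_size))  # [matchers, batch, classes]

weighted_probs = matcher_prob * gate_prob                       # p(matcher) * p(class | matcher)
weighted_log_probs = np.log(matcher_prob) + np.log(gate_prob)   # log of the same quantity

reward = np.zeros((batch_size, num_classes))
reward[np.arange(batch_size), [0, 2]] = 1.0                     # gold_matrix, one-hot

# per-example baseline: probability mass already placed on the gold class
baseline = np.sum(weighted_probs * reward, axis=(0, 2), keepdims=True)
log_coeffs = weighted_probs * (reward - baseline)               # treated as constants (stop_gradient)
loss = -np.sum(weighted_log_probs * log_coeffs)
print(loss)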
def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None,
                       is_training=True, global_step=None):
    options = self.options
    # ======word representation layer======
    in_question_repres = []  # word and char
    in_passage_repres = []   # word and char
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable(
                "word_embedding", trainable=word_vec_trainable,
                initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

        in_question_word_repres = tf.nn.embedding_lookup(
            self.word_embedding, self.in_question_words)  # [batch_size, question_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(
            self.word_embedding, self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(self.in_question_words)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    if options.with_char and char_vocab is not None:
        input_shape = tf.shape(self.in_question_chars)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        q_char_len = input_shape[2]
        input_shape = tf.shape(self.in_passage_chars)
        passage_len = input_shape[1]
        p_char_len = input_shape[2]
        char_dim = char_vocab.word_dim
        self.char_embedding = tf.get_variable(
            "char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)

        in_question_char_repres = tf.nn.embedding_lookup(
            self.char_embedding, self.in_question_chars)  # [batch_size, question_len, q_char_len, char_dim]
        in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
        question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
        question_char_mask = tf.sequence_mask(
            question_char_lengths, q_char_len, dtype=tf.float32)  # [batch_size*question_len, q_char_len]
        in_question_char_repres = tf.multiply(
            in_question_char_repres, tf.expand_dims(question_char_mask, axis=-1))

        in_passage_char_repres = tf.nn.embedding_lookup(
            self.char_embedding, self.in_passage_chars)  # [batch_size, passage_len, p_char_len, char_dim]
        in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
        passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
        passage_char_mask = tf.sequence_mask(
            passage_char_lengths, p_char_len, dtype=tf.float32)  # [batch_size*passage_len, p_char_len]
        in_passage_char_repres = tf.multiply(
            in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1))

        (question_char_outputs_fw, question_char_outputs_bw, _) = layer_utils.my_lstm_layer(
            in_question_char_repres, options.char_lstm_dim,
            input_lengths=question_char_lengths, scope_name="char_lstm", reuse=False,
            is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
        question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(
            question_char_outputs_fw, question_char_lengths - 1)
        question_char_outputs_bw = question_char_outputs_bw[:, 0, :]
        question_char_outputs = tf.concat(
            axis=1, values=[question_char_outputs_fw, question_char_outputs_bw])
        question_char_outputs = tf.reshape(
            question_char_outputs, [batch_size, question_len, 2 * options.char_lstm_dim])

        (passage_char_outputs_fw, passage_char_outputs_bw, _) = layer_utils.my_lstm_layer(
            in_passage_char_repres, options.char_lstm_dim,
            input_lengths=passage_char_lengths, scope_name="char_lstm", reuse=True,
            is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
        passage_char_outputs_fw = layer_utils.collect_final_step_of_lstm(
            passage_char_outputs_fw, passage_char_lengths - 1)
        passage_char_outputs_bw = passage_char_outputs_bw[:, 0, :]
        passage_char_outputs = tf.concat(
            axis=1, values=[passage_char_outputs_fw, passage_char_outputs_bw])
        passage_char_outputs = tf.reshape(
            passage_char_outputs, [batch_size, passage_len, 2 * options.char_lstm_dim])

        in_question_repres.append(question_char_outputs)
        in_passage_repres.append(passage_char_outputs)
        input_dim += 2 * options.char_lstm_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim] (concat word and char)
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)    # [batch_size, passage_len, dim] (concat word and char)

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(
                in_question_repres, input_dim, options.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(
                in_passage_repres, input_dim, options.highway_layer_num)

    # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
    # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

    # ========Bilateral Matching=====
    (match_representation, match_dim) = match_utils.bilateral_match_func(
        in_question_repres, in_passage_repres,
        self.question_lengths, self.passage_lengths, question_mask, mask,
        input_dim, is_training, options=options)

    # ========Prediction Layer=========
    # match_dim = 4 * self.options.aggregation_lstm_dim
    w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.tanh(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
    self.predictions = tf.argmax(self.prob, 1)

    if not is_training:
        return

    tvars = tf.trainable_variables()
    if self.options.lambda_l2 > 0.0:
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + self.options.lambda_l2 * l2_loss

    if self.options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

    grads = layer_utils.compute_gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
    # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    if self.options.with_moving_average:
        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_ops = [self.train_op, variables_averages_op]
        self.train_op = tf.group(*train_ops)
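# --- Illustration (not part of the model code) -------------------------------
# create_model_graph clips gradients by global norm before apply_gradients. A small
# numpy sketch of what tf.clip_by_global_norm computes: every gradient is scaled by
# clip_norm / max(global_norm, clip_norm), so the joint norm of all gradients never
# exceeds clip_norm while their directions are preserved. The toy gradients below
# are made up for illustration.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 4.0]), np.array([12.0])]       # global norm = 13
clipped, gnorm = clip_by_global_norm(grads, clip_norm=5.0)
print(gnorm)                                           # 13.0
print(np.sqrt(sum(np.sum(g ** 2) for g in clipped)))   # 5.0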
def _build(self, in_passage_words, passage_lengths, in_question_words_soft,
           question_lengths, truth):
    """truth: an int in [0, num_classes) indicating the entailment label."""
    num_classes = self.num_classes
    word_vocab = self.word_vocab
    is_training = self.is_training
    global_step = self.global_step
    options = self.options

    # ======word representation layer======
    in_question_repres = []
    in_passage_repres = []
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable(
                "word_embedding", trainable=word_vec_trainable,
                initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

        # in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, in_question_words_soft)  # [batch_size, question_len, word_dim]
        in_question_word_repres = tx.utils.soft_sequence_embedding(
            self.word_embedding, in_question_words_soft)
        in_passage_word_repres = tf.nn.embedding_lookup(
            self.word_embedding, in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(in_question_words_soft)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim]
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)    # [batch_size, passage_len, dim]

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    mask = tf.sequence_mask(passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(
                in_question_repres, input_dim, options.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(
                in_passage_repres, input_dim, options.highway_layer_num)

    # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
    # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

    # ========Bilateral Matching=====
    (match_representation, match_dim) = match_utils.bilateral_match_func(
        in_question_repres, in_passage_repres,
        question_lengths, passage_lengths, question_mask, mask,
        input_dim, is_training, options=options)

    # ========Prediction Layer=========
    # match_dim = 4 * self.options.aggregation_lstm_dim
    w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.tanh(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    gold_matrix = tf.one_hot(truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    correct = tf.nn.in_top_k(logits, truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
    self.predictions = tf.argmax(self.prob, 1)

    if is_training:
        tvars = tf.trainable_variables()
        if self.options.lambda_l2 > 0.0:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + self.options.lambda_l2 * l2_loss

        if self.options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if self.options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)

    return {
        "logits": logits,
        "prob": self.prob,
        "loss": self.loss,
        "correct": correct,
        "eval_correct": self.eval_correct,
        "predictions": self.predictions,
    }
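# --- Illustration (not part of the model code) -------------------------------
# The prediction layers above all use softmax cross-entropy against a one-hot
# gold_matrix. A short numpy sketch of the same quantity, useful as a sanity check:
# loss = -log softmax(logits)[true_class], averaged over the batch. The toy
# logits/labels below are made up for illustration.
import numpy as np

def softmax_cross_entropy(logits, labels_one_hot):
    # Log-softmax computed stably, then selected by the one-hot labels.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -(labels_one_hot * log_probs).sum(axis=1)

logits = np.array([[2.0, 0.5, -1.0],
                   [0.1, 0.2, 3.0]])
truth = np.array([0, 2])
gold_matrix = np.eye(3)[truth]            # one-hot, like tf.one_hot(truth, num_classes)
per_example = softmax_cross_entropy(logits, gold_matrix)
print(per_example.mean())                 # same value tf.reduce_mean(...) would give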