def build_network(self):
    """Build the matching network on top of pre-embedded sentence pairs.

    Reads hyper-parameters from ``self.config``, applies (optional) highway
    layers to the two sentence representations, runs bilateral matching, and
    adds a two-layer feed-forward prediction head.

    Side effects (attributes set on self):
        options, highway_layer_num, with_highway, wd, l2_reg — copied config.
        output_features — the matched representation [batch, match_dim].
        estimation      — pre-softmax scores [batch, num_classes].
        pred_probs      — class probabilities.
        logits          — predicted class ids (int32); NOTE(review): despite
                          the name this holds argmax predictions, not logits.

    Assumes self.s1_emb / self.s2_emb are [batch, len, emb_size] and
    self.dropout_keep_prob is a keep-probability — TODO confirm at call site.
    """
    self.options = self.config["options"]
    self.options["batch_size"] = self.batch_size
    self.highway_layer_num = self.options["highway_layer_num"]
    self.with_highway = self.options["with_highway"]
    self.wd = self.config.get("weight_decay", None)  # may be None: no decay
    self.l2_reg = float(self.config["l2_reg"])

    # Input dropout on both sentence embeddings (keep_prob semantics).
    in_question_repres = tf.nn.dropout(self.s1_emb, self.dropout_keep_prob)
    in_passage_repres = tf.nn.dropout(self.s2_emb, self.dropout_keep_prob)

    input_dim = self.emb_size

    # ======Highway layer======
    if self.with_highway:
        with tf.variable_scope(self.scope+"-input_highway"):
            in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, self.highway_layer_num)
            # Share the highway weights between the two sentences.
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, self.highway_layer_num)

    # ========Bilateral Matching=====
    with tf.variable_scope(self.scope+"-bilateral_matching"):
        (match_representation, match_dim) = match_utils.bilateral_match_func(
            in_question_repres, in_passage_repres,
            self.sent1_token_len, self.sent2_token_len,
            self.sent1_token_mask, self.sent2_token_mask,
            input_dim, self.config["mode"],
            options=self.options, dropout_rate=self.dropout_keep_prob)
        self.output_features = match_representation

    #========Prediction Layer=========
    with tf.variable_scope(self.scope+"-prediction"):
        # match_dim = 4 * self.options.aggregation_lstm_dim
        w_0 = tf.get_variable("w_0", [match_dim, match_dim/2], dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [match_dim/2], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [match_dim/2, self.num_classes], dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [self.num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        # Dropout between the two dense layers; dropout_keep_prob is already
        # a keep-probability, so it is passed through unchanged.
        logits = tf.nn.dropout(logits, (self.dropout_keep_prob))
        self.estimation = tf.matmul(logits, w_1) + b_1

    self.pred_probs = tf.contrib.layers.softmax(self.estimation)
    self.logits = tf.cast(tf.argmax(self.pred_probs, -1), tf.int32)
    # Register L2 regularization for all non-bias variables under this scope.
    match_utils.add_reg_without_bias(self.scope)
def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
    """Build the full train/eval graph: embeddings, matching, loss, optimizer.

    Args:
        num_classes: number of output classes.
        word_vocab: vocabulary with `word_vecs` / `word_dim`; skipped if None.
        char_vocab: char vocabulary; used when `options.with_char` is set.
        is_training: static Python bool — selects dropout and optimizer setup.
        global_step: variable passed to `apply_gradients` and EMA.

    Side effects: sets self.prob, self.loss, self.eval_correct,
    self.predictions and (training only) self.train_op.

    Assumes self.in_question_words / self.in_passage_words etc. were created
    elsewhere (e.g. in __init__) — TODO confirm.
    """
    options = self.options
    # ======word representation layer======
    in_question_repres = []  # word and char
    in_passage_repres = []  # word and char
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            # Frozen embeddings are pinned to CPU to save GPU memory.
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

        in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words)  # [batch_size, question_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(self.in_question_words)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    if options.with_char and char_vocab is not None:
        input_shape = tf.shape(self.in_question_chars)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        q_char_len = input_shape[2]
        input_shape = tf.shape(self.in_passage_chars)
        passage_len = input_shape[1]
        p_char_len = input_shape[2]
        char_dim = char_vocab.word_dim
        self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)

        # Flatten words into the batch dim so the char LSTM runs per word.
        in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars)  # [batch_size, question_len, q_char_len, char_dim]
        in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
        question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
        # NOTE(review): "quesiton" is a long-standing typo for "question";
        # kept as-is since it is a purely local name.
        quesiton_char_mask = tf.sequence_mask(question_char_lengths, q_char_len, dtype=tf.float32)  # [batch_size*question_len, q_char_len]
        in_question_char_repres = tf.multiply(in_question_char_repres, tf.expand_dims(quesiton_char_mask, axis=-1))

        in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars)  # [batch_size, passage_len, p_char_len, char_dim]
        in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
        passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
        passage_char_mask = tf.sequence_mask(passage_char_lengths, p_char_len, dtype=tf.float32)  # [batch_size*passage_len, p_char_len]
        in_passage_char_repres = tf.multiply(in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1))

        (question_char_outputs_fw, question_char_outputs_bw, _) = layer_utils.my_lstm_layer(
            in_question_char_repres, options.char_lstm_dim,
            input_lengths=question_char_lengths, scope_name="char_lstm", reuse=False,
            is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
        # fw: take output at the true last step; bw: step 0 holds the full
        # backward summary.
        question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(question_char_outputs_fw, question_char_lengths - 1)
        question_char_outputs_bw = question_char_outputs_bw[:, 0, :]
        question_char_outputs = tf.concat(axis=1, values=[question_char_outputs_fw, question_char_outputs_bw])
        question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, 2*options.char_lstm_dim])

        # Same char LSTM (reuse=True) applied to the passage side.
        (passage_char_outputs_fw, passage_char_outputs_bw, _) = layer_utils.my_lstm_layer(
            in_passage_char_repres, options.char_lstm_dim,
            input_lengths=passage_char_lengths, scope_name="char_lstm", reuse=True,
            is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
        passage_char_outputs_fw = layer_utils.collect_final_step_of_lstm(passage_char_outputs_fw, passage_char_lengths - 1)
        passage_char_outputs_bw = passage_char_outputs_bw[:, 0, :]
        passage_char_outputs = tf.concat(axis=1, values=[passage_char_outputs_fw, passage_char_outputs_bw])
        passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, 2*options.char_lstm_dim])

        in_question_repres.append(question_char_outputs)
        in_passage_repres.append(passage_char_outputs)
        input_dim += 2*options.char_lstm_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim] # concat word and char
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim] # concat word and char

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, options.highway_layer_num)
            # Share highway weights between question and passage.
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, options.highway_layer_num)

    # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
    # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

    # ========Bilateral Matching=====
    (match_representation, match_dim) = match_utils.bilateral_match_func(
        in_question_repres, in_passage_repres,
        self.question_lengths, self.passage_lengths, question_mask, mask,
        input_dim, is_training, options=options)

    #========Prediction Layer=========
    # match_dim = 4 * self.options.aggregation_lstm_dim
    w_0 = tf.get_variable("w_0", [match_dim, match_dim/2], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [match_dim/2], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [match_dim/2, num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.tanh(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))

    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
    self.predictions = tf.argmax(self.prob, 1)

    # Everything below is optimizer construction — eval graphs stop here.
    if not is_training:
        return

    tvars = tf.trainable_variables()
    if self.options.lambda_l2 > 0.0:
        # L2-regularize only weight matrices (ndims > 1 excludes biases).
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + self.options.lambda_l2 * l2_loss

    if self.options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

    grads = layer_utils.compute_gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
    # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    if self.options.with_moving_average:
        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_ops = [self.train_op, variables_averages_op]
        self.train_op = tf.group(*train_ops)
def __init__(self, num_classes, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None,
             dropout_rate=0.5, learning_rate=0.001, optimize_type='adam', lambda_l2=1e-5,
             with_word=True, with_char=True, with_POS=True, with_NER=True,
             char_lstm_dim=20, context_lstm_dim=100, aggregation_lstm_dim=200,
             is_training=True, filter_layer_threshold=0.2, MP_dim=50,
             context_layer_num=1, aggregation_layer_num=1, fix_word_vec=False,
             with_filter_layer=True, with_highway=False,
             with_lex_features=False, lex_dim=100, word_level_MP_dim=-1,
             sep_endpoint=False, end_model_combine=False,
             with_match_highway=False, with_aggregation_highway=False, highway_layer_num=1,
             with_lex_decomposition=False, lex_decompsition_dim=-1,
             with_left_match=True, with_right_match=True,
             with_full_match=True, with_maxpool_match=True,
             with_attentive_match=True, with_max_attentive_match=True,
             with_dep=True, with_image=False, image_with_hypothesis_only=False,
             with_img_full_match=True, with_img_maxpool_match=False,
             with_img_attentive_match=True, with_img_max_attentive_match=True,
             image_context_layer=True, img_dim=100):
    """Build the full matching graph (placeholders, matching, loss, train op).

    The graph optionally combines word, character-LSTM, POS, NER, dependency
    and image features for a (premise, hypothesis) pair, runs bilateral
    matching, and attaches a softmax classifier plus the selected optimizer.

    Fixes applied in this revision (behavior-affecting only where noted):
      * `tf.arg_max` -> `tf.argmax` (deprecated alias, removed in later TF;
        the sibling `create_model_graph` already uses `tf.argmax`).
      * The `img_dim` parameter is now forwarded to `bilateral_match_func`
        instead of a hard-coded 100 (identical behavior for the default).
      * Removed eval-time scaling by `(1 - dropout_rate)`: `tf.nn.dropout`
        is *inverted* dropout (it scales kept units by 1/keep_prob during
        training), so additionally multiplying by keep_prob at eval time
        double-compensated and systematically shrank eval activations.

    Side effects: creates all input placeholders on self, plus self.prob,
    self.loss, self.eval_correct, self.predictions and self.train_op.
    """
    # ======word representation layer======
    in_question_repres = []  # premise feature blocks (concatenated later)
    in_question_dep_cons = []  # premise dependency connections
    in_passage_repres = []  # hypothesis feature blocks
    in_passage_dep_cons = []  # hypothesis dependency connections
    self.question_lengths = tf.placeholder(tf.int32, [None])
    self.passage_lengths = tf.placeholder(tf.int32, [None])
    self.truth = tf.placeholder(tf.int32, [None])  # [batch_size]
    input_dim = 0

    # ---- word embedding ----
    if with_word and word_vocab is not None:
        self.in_question_words = tf.placeholder(tf.int32, [None, None])  # [batch_size, question_len]
        self.in_passage_words = tf.placeholder(tf.int32, [None, None])  # [batch_size, passage_len]
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if fix_word_vec:
            # Frozen embeddings live on CPU to save GPU memory.
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable(
                "word_embedding", trainable=word_vec_trainable,
                initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

        in_question_word_repres = tf.nn.embedding_lookup(
            self.word_embedding, self.in_question_words)  # [batch_size, question_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(
            self.word_embedding, self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(self.in_question_words)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    # ---- dependency features ----
    # NOTE(review): uses word_vocab.parser.typesize, so with_dep=True
    # requires a word_vocab carrying a parser — confirm at call sites.
    if with_dep:
        self.in_question_dependency = tf.placeholder(
            tf.float32, [None, None, word_vocab.parser.typesize])  # [batch_size, question_len, dep_dim]
        self.in_passage_dependency = tf.placeholder(
            tf.float32, [None, None, word_vocab.parser.typesize])  # [batch_size, passage_len, dep_dim]
        self.in_question_dep_con = tf.placeholder(tf.int32, [None, None])  # [batch_size, question_len]
        self.in_passage_dep_con = tf.placeholder(tf.int32, [None, None])  # [batch_size, passage_len]

        # Dependency representation is the same as the data input.
        in_question_dep_repres = self.in_question_dependency
        in_passage_dep_repres = self.in_passage_dependency

        # Embed the dependency sequences with a small shared LSTM.
        with tf.variable_scope('dep_lstm'):
            dep_lstm_cell = tf.contrib.rnn.BasicLSTMCell(20)
            if is_training:
                dep_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                    dep_lstm_cell, output_keep_prob=(1 - dropout_rate))
            dep_lstm_cell = tf.contrib.rnn.MultiRNNCell([dep_lstm_cell])

            question_dep_outputs = my_rnn.dynamic_rnn(
                dep_lstm_cell, in_question_dep_repres,
                sequence_length=self.question_lengths,
                dtype=tf.float32)[0]  # [batch_size, question_len, 20]
            # Reuse the same LSTM weights for the passage side.
            tf.get_variable_scope().reuse_variables()
            passage_dep_outputs = my_rnn.dynamic_rnn(
                dep_lstm_cell, in_passage_dep_repres,
                sequence_length=self.passage_lengths,
                dtype=tf.float32)[0]  # [batch_size, passage_len, 20]

            in_question_repres.append(question_dep_outputs)
            in_passage_repres.append(passage_dep_outputs)
            input_dim += 20

        # Dependency connections are forwarded unchanged to the matcher.
        in_question_dep_cons = self.in_question_dep_con
        in_passage_dep_cons = self.in_passage_dep_con

    # ---- image features ----
    out_image_feats = None
    if with_image:
        self.image_feats = tf.placeholder(tf.float32, [None, 49, 512])  # [batch_size, 49 regions, 512]
        image_feats = tf.reshape(self.image_feats, [-1, 512])
        # Project region features to context_lstm_dim.
        w_0 = tf.get_variable("image_w_0", [512, context_lstm_dim], dtype=tf.float32)
        b_0 = tf.get_variable("image_b_0", [context_lstm_dim], dtype=tf.float32)
        out_image_feats = tf.matmul(image_feats, w_0) + b_0
        if is_training:
            # Inverted dropout: no compensation needed at eval time.
            out_image_feats = tf.nn.dropout(out_image_feats, (1 - dropout_rate))
        out_image_feats = tf.reshape(out_image_feats, [-1, 49, context_lstm_dim])

    # ---- POS embedding ----
    if with_POS and POS_vocab is not None:
        self.in_question_POSs = tf.placeholder(tf.int32, [None, None])  # [batch_size, question_len]
        self.in_passage_POSs = tf.placeholder(tf.int32, [None, None])  # [batch_size, passage_len]
        self.POS_embedding = tf.get_variable(
            "POS_embedding", initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32)

        in_question_POS_repres = tf.nn.embedding_lookup(
            self.POS_embedding, self.in_question_POSs)  # [batch_size, question_len, POS_dim]
        in_passage_POS_repres = tf.nn.embedding_lookup(
            self.POS_embedding, self.in_passage_POSs)  # [batch_size, passage_len, POS_dim]
        in_question_repres.append(in_question_POS_repres)
        in_passage_repres.append(in_passage_POS_repres)

        input_shape = tf.shape(self.in_question_POSs)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_POSs)
        passage_len = input_shape[1]
        input_dim += POS_vocab.word_dim

    # ---- NER embedding ----
    if with_NER and NER_vocab is not None:
        self.in_question_NERs = tf.placeholder(tf.int32, [None, None])  # [batch_size, question_len]
        self.in_passage_NERs = tf.placeholder(tf.int32, [None, None])  # [batch_size, passage_len]
        self.NER_embedding = tf.get_variable(
            "NER_embedding", initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32)

        in_question_NER_repres = tf.nn.embedding_lookup(
            self.NER_embedding, self.in_question_NERs)  # [batch_size, question_len, NER_dim]
        in_passage_NER_repres = tf.nn.embedding_lookup(
            self.NER_embedding, self.in_passage_NERs)  # [batch_size, passage_len, NER_dim]
        in_question_repres.append(in_question_NER_repres)
        in_passage_repres.append(in_passage_NER_repres)

        input_shape = tf.shape(self.in_question_NERs)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_NERs)
        passage_len = input_shape[1]
        input_dim += NER_vocab.word_dim

    # ---- character LSTM ----
    if with_char and char_vocab is not None:
        self.question_char_lengths = tf.placeholder(tf.int32, [None, None])  # [batch_size, question_len]
        self.passage_char_lengths = tf.placeholder(tf.int32, [None, None])  # [batch_size, passage_len]
        self.in_question_chars = tf.placeholder(tf.int32, [None, None, None])  # [batch_size, question_len, q_char_len]
        self.in_passage_chars = tf.placeholder(tf.int32, [None, None, None])  # [batch_size, passage_len, p_char_len]

        input_shape = tf.shape(self.in_question_chars)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        q_char_len = input_shape[2]
        input_shape = tf.shape(self.in_passage_chars)
        passage_len = input_shape[1]
        p_char_len = input_shape[2]
        char_dim = char_vocab.word_dim

        self.char_embedding = tf.get_variable(
            "char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)

        # Flatten words into the batch dim so the char LSTM runs per word.
        in_question_char_repres = tf.nn.embedding_lookup(
            self.char_embedding, self.in_question_chars)  # [batch_size, question_len, q_char_len, char_dim]
        in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
        question_char_lengths = tf.reshape(self.question_char_lengths, [-1])

        in_passage_char_repres = tf.nn.embedding_lookup(
            self.char_embedding, self.in_passage_chars)  # [batch_size, passage_len, p_char_len, char_dim]
        in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
        passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])

        with tf.variable_scope('char_lstm'):
            char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(char_lstm_dim)
            if is_training:
                char_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                    char_lstm_cell, output_keep_prob=(1 - dropout_rate))
            char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell])

            question_char_outputs = my_rnn.dynamic_rnn(
                char_lstm_cell, in_question_char_repres,
                sequence_length=question_char_lengths,
                dtype=tf.float32)[0]  # [batch_size*question_len, q_char_len, char_lstm_dim]
            # NOTE(review): [:, -1, :] takes the padded last step; for words
            # shorter than q_char_len, dynamic_rnn zero-fills past the true
            # length, so this yields zeros instead of the final state.
            # Kept as-is to preserve trained-model behavior; the gather-at-
            # (length-1) approach in create_model_graph is the correct one.
            question_char_outputs = question_char_outputs[:, -1, :]
            question_char_outputs = tf.reshape(
                question_char_outputs, [batch_size, question_len, char_lstm_dim])

            # Reuse the same LSTM weights for the passage side.
            tf.get_variable_scope().reuse_variables()
            passage_char_outputs = my_rnn.dynamic_rnn(
                char_lstm_cell, in_passage_char_repres,
                sequence_length=passage_char_lengths,
                dtype=tf.float32)[0]  # [batch_size*passage_len, p_char_len, char_lstm_dim]
            passage_char_outputs = passage_char_outputs[:, -1, :]
            passage_char_outputs = tf.reshape(
                passage_char_outputs, [batch_size, passage_len, char_lstm_dim])

            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)
            input_dim += char_lstm_dim

    # Concatenate all enabled feature blocks along the feature axis.
    in_question_repres = tf.concat(in_question_repres, 2)  # [batch_size, question_len, dim]
    in_passage_repres = tf.concat(in_passage_repres, 2)  # [batch_size, passage_len, dim]

    if is_training:
        # Inverted dropout — tf.nn.dropout already rescales kept units by
        # 1/keep_prob, so eval mode needs no compensating multiply (the old
        # `tf.multiply(x, 1 - dropout_rate)` eval branch double-scaled).
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate))

    mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(
                in_question_repres, input_dim, highway_layer_num)
            # Share highway weights between the two sentences.
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(
                in_passage_repres, input_dim, highway_layer_num)

    # ========Bilateral Matching=====
    (match_representation, match_dim) = match_utils.bilateral_match_func(
        out_image_feats, in_question_repres, in_passage_repres,
        in_question_dep_cons, in_passage_dep_cons,
        self.question_lengths, self.passage_lengths, question_mask, mask,
        MP_dim, input_dim, with_filter_layer, context_layer_num,
        context_lstm_dim, is_training, dropout_rate, with_match_highway,
        aggregation_layer_num, aggregation_lstm_dim, highway_layer_num,
        with_aggregation_highway, with_lex_decomposition, lex_decompsition_dim,
        with_full_match, with_maxpool_match, with_attentive_match,
        with_max_attentive_match, with_left_match, with_right_match,
        with_dep=False, with_image=with_image, with_mean_aggregation=True,
        image_with_hypothesis_only=image_with_hypothesis_only,
        with_img_attentive_match=with_img_attentive_match,
        with_img_full_match=with_img_full_match,
        with_img_maxpool_match=with_img_maxpool_match,
        with_img_max_attentive_match=with_img_max_attentive_match,
        image_context_layer=image_context_layer,
        img_dim=img_dim)  # FIX: forward the parameter (was hard-coded 100)

    #========Prediction Layer=========
    w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.tanh(logits)
    if is_training:
        # Inverted dropout — see note above; no eval-time scaling.
        logits = tf.nn.dropout(logits, (1 - dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))

    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
    # FIX: tf.arg_max is a deprecated alias of tf.argmax (removed in TF 2).
    self.predictions = tf.argmax(self.prob, 1)

    # ---- optimizer ----
    if optimize_type == 'adadelta':
        clipper = 50
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
        tvars = tf.trainable_variables()
        # L2-regularize only weight matrices (ndims > 1 excludes biases).
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    elif optimize_type == 'sgd':
        # Track the global step for learning-rate decay.
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        min_lr = 0.000001
        self._lr_rate = tf.maximum(
            min_lr,
            tf.train.exponential_decay(learning_rate, self.global_step, 30000, 0.98))
        self.train_op = tf.train.GradientDescentOptimizer(
            learning_rate=self._lr_rate).minimize(self.loss)
    elif optimize_type == 'ema':
        tvars = tf.trainable_variables()
        train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)
        # Maintain exponential moving averages of all trainable variables
        # and fold the update into the returned train op.
        ema = tf.train.ExponentialMovingAverage(decay=0.9999)
        maintain_averages_op = ema.apply(tvars)
        with tf.control_dependencies([train_op]):
            self.train_op = tf.group(maintain_averages_op)
    elif optimize_type == 'adam':
        clipper = 50
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        tvars = tf.trainable_variables()
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def _build(self, in_passage_words, passage_lengths, in_question_words_soft,
           question_lengths, truth):
    """Build a matching graph over a *soft* (distributional) question input.

    Unlike the id-based graphs elsewhere in this file, the question side is
    given as soft token distributions and embedded via
    `tx.utils.soft_sequence_embedding` (tx is presumably Texar — confirm);
    the passage side is a regular id lookup.

    Args:
        in_passage_words: int ids [batch_size, passage_len].
        passage_lengths: int lengths [batch_size].
        in_question_words_soft: soft token weights; assumed
            [batch_size, question_len, vocab_size] — TODO confirm.
        question_lengths: int lengths [batch_size].
        truth: an int in [0 .. num_classes) indicating entailment.

    Returns:
        dict with keys "logits", "prob", "loss", "correct", "eval_correct",
        "predictions".
    """
    num_classes = self.num_classes
    word_vocab = self.word_vocab
    is_training = self.is_training
    global_step = self.global_step
    options = self.options

    # ======word representation layer======
    in_question_repres = []
    in_passage_repres = []
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            # Frozen embeddings are pinned to CPU to save GPU memory.
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable(
                "word_embedding", trainable=word_vec_trainable,
                initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

        #in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, in_question_words_soft) # [batch_size, question_len, word_dim]
        # Soft lookup: expectation over the embedding table under the given
        # token distribution (differentiable w.r.t. the soft input).
        in_question_word_repres = tx.utils.soft_sequence_embedding(
            self.word_embedding, in_question_words_soft)
        in_passage_word_repres = tf.nn.embedding_lookup(
            self.word_embedding, in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(in_question_words_soft)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    in_question_repres = tf.concat(
        axis=2, values=in_question_repres)  # [batch_size, question_len, dim]
    in_passage_repres = tf.concat(
        axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    mask = tf.sequence_mask(passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(
                in_question_repres, input_dim, options.highway_layer_num)
            # Share highway weights between question and passage.
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(
                in_passage_repres, input_dim, options.highway_layer_num)

    # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
    # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

    # ========Bilateral Matching=====
    (match_representation, match_dim) = match_utils.bilateral_match_func(
        in_question_repres, in_passage_repres,
        question_lengths, passage_lengths, question_mask, mask,
        input_dim, is_training, options=options)

    #========Prediction Layer=========
    # match_dim = 4 * self.options.aggregation_lstm_dim
    w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.tanh(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    gold_matrix = tf.one_hot(truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))

    correct = tf.nn.in_top_k(logits, truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
    self.predictions = tf.argmax(self.prob, 1)

    # Optimizer construction only happens in training graphs.
    if is_training:
        tvars = tf.trainable_variables()
        if self.options.lambda_l2 > 0.0:
            # L2-regularize only weight matrices (ndims > 1 excludes biases).
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + self.options.lambda_l2 * l2_loss

        if self.options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.options.learning_rate)

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if self.options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)

    return {
        "logits": logits,
        "prob": self.prob,
        "loss": self.loss,
        "correct": correct,
        "eval_correct": self.eval_correct,
        "predictions": self.predictions,
    }