def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
    """Build the BiMPM-style graph: word/char representation, optional highway layer,
    bilateral matching, and a two-layer prediction head with cross-entropy loss."""
    options = self.options
    # ======word representation layer======
    in_question_repres = []  # word and char
    in_passage_repres = []  # word and char
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

        in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words)  # [batch_size, question_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(self.in_question_words)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    if options.with_char and char_vocab is not None:
        input_shape = tf.shape(self.in_question_chars)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        q_char_len = input_shape[2]
        input_shape = tf.shape(self.in_passage_chars)
        passage_len = input_shape[1]
        p_char_len = input_shape[2]

        char_dim = char_vocab.word_dim
        self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs),
                                              dtype=tf.float32)

        in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars)  # [batch_size, question_len, q_char_len, char_dim]
        in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
        question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
        question_char_mask = tf.sequence_mask(question_char_lengths, q_char_len, dtype=tf.float32)  # [batch_size*question_len, q_char_len]
        in_question_char_repres = tf.multiply(in_question_char_repres, tf.expand_dims(question_char_mask, axis=-1))

        in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars)  # [batch_size, passage_len, p_char_len, char_dim]
        in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
        passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
        passage_char_mask = tf.sequence_mask(passage_char_lengths, p_char_len, dtype=tf.float32)  # [batch_size*passage_len, p_char_len]
        in_passage_char_repres = tf.multiply(in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1))

        (question_char_outputs_fw, question_char_outputs_bw, _) = layer_utils.my_lstm_layer(
            in_question_char_repres, options.char_lstm_dim, input_lengths=question_char_lengths,
            scope_name="char_lstm", reuse=False, is_training=is_training,
            dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
        question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(question_char_outputs_fw,
                                                                          question_char_lengths - 1)
        question_char_outputs_bw = question_char_outputs_bw[:, 0, :]
        question_char_outputs = tf.concat(axis=1, values=[question_char_outputs_fw, question_char_outputs_bw])
        question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, 2 * options.char_lstm_dim])

        (passage_char_outputs_fw, passage_char_outputs_bw, _) = layer_utils.my_lstm_layer(
            in_passage_char_repres, options.char_lstm_dim, input_lengths=passage_char_lengths,
            scope_name="char_lstm", reuse=True, is_training=is_training,
            dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
        passage_char_outputs_fw = layer_utils.collect_final_step_of_lstm(passage_char_outputs_fw,
                                                                         passage_char_lengths - 1)
        passage_char_outputs_bw = passage_char_outputs_bw[:, 0, :]
        passage_char_outputs = tf.concat(axis=1, values=[passage_char_outputs_fw, passage_char_outputs_bw])
        passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, 2 * options.char_lstm_dim])

        in_question_repres.append(question_char_outputs)
        in_passage_repres.append(passage_char_outputs)
        input_dim += 2 * options.char_lstm_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim]  # concat word and char
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]  # concat word and char

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim,
                                                                 options.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim,
                                                                options.highway_layer_num)

    # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
    # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

    # ========Bilateral Matching=====
    (match_representation, match_dim) = match_utils.bilateral_match_func(
        in_question_repres, in_passage_repres,
        self.question_lengths, self.passage_lengths, question_mask, mask, input_dim,
        is_training, options=options)

    # ========Prediction Layer=========
    # match_dim = 4 * self.options.aggregation_lstm_dim
    w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.tanh(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
    self.predictions = tf.argmax(self.prob, 1)

    if not is_training:
        return

    tvars = tf.trainable_variables()
    if self.options.lambda_l2 > 0.0:
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + self.options.lambda_l2 * l2_loss

    if self.options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

    grads = layer_utils.compute_gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
    # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    if self.options.with_moving_average:
        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_ops = [self.train_op, variables_averages_op]
        self.train_op = tf.group(*train_ops)
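# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original code): the method
# above assumes the enclosing model class has already created the placeholders
# it reads (self.in_question_words, self.in_passage_words, self.question_lengths,
# self.passage_lengths, self.truth, and the char inputs when options.with_char
# is set). With such a hypothetical wrapper, a TF1 training step might look like:
#
#   model = SentenceMatchModel(options)          # hypothetical wrapper class
#   model.create_model_graph(num_classes=3, word_vocab=word_vocab,
#                            char_vocab=char_vocab, is_training=True,
#                            global_step=tf.train.get_or_create_global_step())
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       _, loss = sess.run([model.train_op, model.loss], feed_dict=feed_dict)
# ---------------------------------------------------------------------------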
def create_siameseLSTM_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
    """Build a siamese BiLSTM matching graph: shared context BiLSTMs over both sentences,
    mean/end pooling, siameseLSTM_match_func, and a two-layer prediction head."""
    options = self.options
    # ======word representation layer======
    in_question_repres = []
    in_passage_repres = []
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.embedding = tf.placeholder(tf.float32, shape=word_vocab.word_vecs.shape)
            self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=self.embedding,
                                                  dtype=tf.float32)  # tf.constant(word_vocab.word_vecs)

        in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words)  # [batch_size, question_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(self.in_question_words)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim]
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    passage_mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim,
                                                                 options.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim,
                                                                options.highway_layer_num)

    # ======BiLSTM context layer======
    for i in range(options.context_layer_num):  # support multiple context layers
        with tf.variable_scope('bilstm-layer-{}'.format(i)):
            # contextual lstm for both passage and question
            in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
            (question_context_representation_fw, question_context_representation_bw,
             in_question_repres) = layer_utils.my_lstm_layer(
                 in_question_repres, options.context_lstm_dim, input_lengths=self.question_lengths,
                 scope_name="context_represent", reuse=False, is_training=is_training,
                 dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)

            # Encode the second sentence, using the same LSTM weights.
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
            (passage_context_representation_fw, passage_context_representation_bw,
             in_passage_repres) = layer_utils.my_lstm_layer(
                 in_passage_repres, options.context_lstm_dim, input_lengths=self.passage_lengths,
                 scope_name="context_represent", reuse=True, is_training=is_training,
                 dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)

    if options.lstm_out_type == 'mean':
        question_context_representation_fw = layer_utils.collect_mean_step_of_lstm(question_context_representation_fw)
        question_context_representation_bw = layer_utils.collect_mean_step_of_lstm(question_context_representation_bw)
        passage_context_representation_fw = layer_utils.collect_mean_step_of_lstm(passage_context_representation_fw)
        passage_context_representation_bw = layer_utils.collect_mean_step_of_lstm(passage_context_representation_bw)
    elif options.lstm_out_type == 'end':
        question_context_representation_fw = layer_utils.collect_final_step_of_lstm(
            question_context_representation_fw, self.question_lengths - 1)
        question_context_representation_bw = question_context_representation_bw[:, 0, :]
        passage_context_representation_fw = layer_utils.collect_final_step_of_lstm(
            passage_context_representation_fw, self.passage_lengths - 1)
        passage_context_representation_bw = passage_context_representation_bw[:, 0, :]

    question_context_outputs = tf.concat(axis=1, values=[question_context_representation_fw,
                                                         question_context_representation_bw])
    passage_context_outputs = tf.concat(axis=1, values=[passage_context_representation_fw,
                                                        passage_context_representation_bw])

    (match_representation, match_dim) = match_utils.siameseLSTM_match_func(question_context_outputs,
                                                                           passage_context_outputs,
                                                                           options.context_lstm_dim)

    # ========Prediction Layer=========
    w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.nn.relu(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    self.predictions = tf.argmax(self.prob, 1)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

    if not is_training:
        return

    tvars = tf.trainable_variables()
    if self.options.lambda_l1 > 0.0:
        l1_loss = tf.add_n([tf.contrib.layers.l1_regularizer(self.options.lambda_l1)(v)
                            for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + l1_loss
    if self.options.lambda_l2 > 0.0:
        # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        l2_loss = tf.add_n([tf.contrib.layers.l2_regularizer(self.options.lambda_l2)(v)
                            for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + l2_loss

    if self.options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

    grads = layer_utils.compute_gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
    # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    if self.options.with_moving_average:
        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_ops = [self.train_op, variables_averages_op]
        self.train_op = tf.group(*train_ops)
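# ---------------------------------------------------------------------------
# Note (sketch, not part of the original code): because `word_embedding` above
# is initialized from the `self.embedding` placeholder rather than from a
# constant, the pretrained vectors must be fed once when the variables are
# initialized, e.g.:
#
#   sess.run(tf.global_variables_initializer(),
#            feed_dict={model.embedding: word_vocab.word_vecs})
# ---------------------------------------------------------------------------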
def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, lemma_vocab=None, is_training=True, global_step=None):
    """Build the knowledge-enhanced matching graph: word/char/lemma (KG) embeddings,
    BiLSTM encoders, KG-augmented co-attention with RESCAL relation scores,
    aggregation BiLSTMs, and a prediction head with an added KG-embedding loss term."""
    options = self.options

    # ======word representation layer======
    with tf.variable_scope("Input_Embedding_Layer"):
        if word_vocab is not None:
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                      initializer=tf.constant(word_vocab.word_vecs),
                                                      dtype=tf.float32)
                # self.kg_embedding = tf.get_variable("kg", trainable=True, regularizer=regularizer,
                #                                     initializer=tf.constant(lemma_vocab.word_vecs), dtype=tf.float32)
                # NOTE: `initializer` is expected to be defined elsewhere in this module.
                self.kg_embedding = tf.get_variable("kg",
                                                    shape=(lemma_vocab.word_vecs.shape[0], options.kg_dim),
                                                    initializer=initializer, trainable=True, dtype=tf.float32)

            c_emb = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words)
            q_emb = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words)
            c_kg_emb = tf.nn.embedding_lookup(self.kg_embedding, self.in_passage_words_lemma)
            q_kg_emb = tf.nn.embedding_lookup(self.kg_embedding, self.in_question_words_lemma)
            if is_training:
                c_emb = tf.nn.dropout(c_emb, 1 - self.dropout)
                q_emb = tf.nn.dropout(q_emb, 1 - self.dropout)
                c_kg_emb = tf.nn.dropout(c_kg_emb, 1 - self.dropout)
                q_kg_emb = tf.nn.dropout(q_kg_emb, 1 - self.dropout)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]

        if options.with_char and char_vocab is not None:
            input_shape = tf.shape(self.in_question_chars)
            batch_size = input_shape[0]
            q_char_len = input_shape[2]
            input_shape = tf.shape(self.in_passage_chars)
            p_char_len = input_shape[2]

            char_dim = char_vocab.word_dim
            self.char_embedding = tf.get_variable("char_embedding",
                                                  initializer=tf.constant(char_vocab.word_vecs),
                                                  dtype=tf.float32)

            in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars)  # [batch_size, question_len, q_char_len, char_dim]
            in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
            question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
            question_char_mask = tf.sequence_mask(question_char_lengths, q_char_len, dtype=tf.float32)  # [batch_size*question_len, q_char_len]
            in_question_char_repres = tf.multiply(in_question_char_repres,
                                                  tf.expand_dims(question_char_mask, axis=-1))

            in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars)  # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            passage_char_mask = tf.sequence_mask(passage_char_lengths, p_char_len, dtype=tf.float32)  # [batch_size*passage_len, p_char_len]
            in_passage_char_repres = tf.multiply(in_passage_char_repres,
                                                 tf.expand_dims(passage_char_mask, axis=-1))

            question_char_outputs = conv(in_question_char_repres, self.options.char_lstm_dim, bias=True,
                                         activation=tf.nn.tanh, kernel_size=5, name="char_conv", reuse=False)
            question_char_outputs = tf.reduce_max(question_char_outputs, axis=1)
            question_char_outputs = tf.reshape(question_char_outputs,
                                               [batch_size, question_len, options.char_lstm_dim])

            passage_char_outputs = conv(in_passage_char_repres, self.options.char_lstm_dim, bias=True,
                                        activation=tf.nn.tanh, kernel_size=5, name="char_conv", reuse=True)
            passage_char_outputs = tf.reduce_max(passage_char_outputs, axis=1)
            passage_char_outputs = tf.reshape(passage_char_outputs,
                                              [batch_size, passage_len, options.char_lstm_dim])

            c_emb = tf.concat([c_emb, passage_char_outputs], axis=2)
            q_emb = tf.concat([q_emb, question_char_outputs], axis=2)

    c_mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    q_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    with tf.variable_scope("Embedding_Encoder_Layer"):
        q_emb = tf.multiply(q_emb, tf.expand_dims(q_mask, axis=-1))
        c_emb = tf.multiply(c_emb, tf.expand_dims(c_mask, axis=-1))
        q_kg_emb = tf.multiply(q_kg_emb, tf.expand_dims(tf.cast(q_mask, tf.float32), axis=-1))
        c_kg_emb = tf.multiply(c_kg_emb, tf.expand_dims(tf.cast(c_mask, tf.float32), axis=-1))

        (q_fw, q_bw, q) = layer_utils.my_lstm_layer(q_emb, options.context_lstm_dim,
                                                    input_lengths=self.question_lengths,
                                                    scope_name="context_represent", reuse=False,
                                                    is_training=is_training, dropout_rate=self.dropout,
                                                    use_cudnn=options.use_cudnn)
        (c_fw, c_bw, c) = layer_utils.my_lstm_layer(c_emb, options.context_lstm_dim,
                                                    input_lengths=self.passage_lengths,
                                                    scope_name="context_represent", reuse=True,
                                                    is_training=is_training, dropout_rate=self.dropout,
                                                    use_cudnn=options.use_cudnn)
        q = tf.multiply(q, tf.expand_dims(q_mask, axis=-1))
        c = tf.multiply(c, tf.expand_dims(c_mask, axis=-1))
        if is_training:
            q = tf.nn.dropout(q, 1 - self.dropout)
            c = tf.nn.dropout(c, 1 - self.dropout)

    with tf.variable_scope('co-att', reuse=tf.AUTO_REUSE):
        s = tf.einsum("abd,acd->abc", c, q)
        # cRq, loss = Complex(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim, options.relation_dim, loss_type='factorization')
        # cRq, loss, r = Analogy(c_kg_emb, q_kg_emb, c_mask, q_mask, options.scalar_dim,
        #                        options.kg_dim, options.relation_dim, loss_type='factorization')
        # cRq, loss = DisMult(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim, options.relation_dim, loss_type='factorization')
        cRq, r = Rescal(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim, options.relation_dim)
        # if is_training:
        v = tf.get_variable("v", [1, 1, 1, options.relation_dim], dtype=tf.float32)
        score = tf.reduce_sum(cRq * v, axis=-1)
        s = s + options.lamda1 * score
        s = mask_relevancy_matrix(s, q_mask, c_mask)
        self.v = v

        s_q = tf.nn.softmax(s, dim=1)
        q2c = tf.einsum("abd,abc->acd", c, s_q)
        q2c_kg = tf.einsum("abd,abc->acd", c_kg_emb, s_q)
        q2c_kg_r = tf.einsum("abcr,abc->acr", cRq, s_q)

        s_c = tf.nn.softmax(s, dim=2)
        c2q = tf.einsum("abd,acb->acd", q, s_c)
        c2q_kg = tf.einsum("abd,acb->acd", q_kg_emb, s_c)
        c2q_kg_r = tf.einsum("abcr,abc->abr", cRq, s_c)

    with tf.variable_scope("Model_Encoder_Layer"):
        passage_inputs = tf.concat([c2q, c, c2q * c, c - c2q, c_kg_emb, c2q_kg, c2q_kg_r], axis=2)
        question_inputs = tf.concat([q2c, q, q2c * q, q - q2c, q_kg_emb, q2c_kg, q2c_kg_r], axis=2)
        passage_inputs = tf.layers.dense(inputs=passage_inputs, units=2 * options.context_lstm_dim,
                                         activation=tf.nn.relu, use_bias=True, name='pro', reuse=False)
        question_inputs = tf.layers.dense(inputs=question_inputs, units=2 * options.context_lstm_dim,
                                          activation=tf.nn.relu, use_bias=True, name='pro', reuse=True)
        question_inputs = tf.multiply(question_inputs, tf.expand_dims(q_mask, axis=-1))
        passage_inputs = tf.multiply(passage_inputs, tf.expand_dims(c_mask, axis=-1))

        (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
            question_inputs, options.aggregation_lstm_dim, input_lengths=self.question_lengths,
            scope_name='aggregate_layer', reuse=False, is_training=is_training,
            dropout_rate=self.dropout, use_cudnn=options.use_cudnn)
        question_inputs = cur_aggregation_representation
        # question_outputs_vec = tf.concat([fw_rep, bw_rep], axis=1)

        (fw_rep, bw_rep, cur_aggregation_representation) = layer_utils.my_lstm_layer(
            passage_inputs, options.aggregation_lstm_dim, input_lengths=self.passage_lengths,
            scope_name='aggregate_layer', reuse=True, is_training=is_training,
            dropout_rate=self.dropout, use_cudnn=options.use_cudnn)
        passage_inputs = cur_aggregation_representation

        question_inputs = tf.multiply(question_inputs, tf.expand_dims(q_mask, axis=-1))
        passage_inputs = tf.multiply(passage_inputs, tf.expand_dims(c_mask, axis=-1))
        if is_training:
            question_inputs = tf.nn.dropout(question_inputs, 1 - self.dropout)
            passage_inputs = tf.nn.dropout(passage_inputs, 1 - self.dropout)

        passage_outputs_mean = tf.div(tf.reduce_sum(passage_inputs, 1),
                                      tf.expand_dims(tf.cast(self.passage_lengths, tf.float32), -1))
        question_outputs_mean = tf.div(tf.reduce_sum(question_inputs, 1),
                                       tf.expand_dims(tf.cast(self.question_lengths, tf.float32), -1))
        passage_outputs_max = tf.reduce_max(passage_inputs, axis=1)
        question_outputs_max = tf.reduce_max(question_inputs, axis=1)
        passage_outputs_att = soft_attention_with_kg(passage_inputs, c_kg_emb, c2q_kg_r, c_mask,
                                                     options.att_dim, scope="soft_att", reuse=False)
        question_outputs_att = soft_attention_with_kg(question_inputs, q_kg_emb, q2c_kg_r, q_mask,
                                                      options.att_dim, scope="soft_att", reuse=True)

        question_outputs = tf.concat([question_outputs_max, question_outputs_mean, question_outputs_att], axis=1)
        passage_outputs = tf.concat([passage_outputs_max, passage_outputs_mean, passage_outputs_att], axis=1)
        match_representation = tf.concat(axis=1, values=[question_outputs, passage_outputs])

    # ========Prediction Layer=========
    match_dim = int(match_representation.shape[1])
    w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    if is_training:
        match_representation = tf.nn.dropout(match_representation, (1 - self.dropout))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.nn.relu(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - self.dropout))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    self.predictions = tf.argmax(self.prob, 1)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

    if not is_training:
        return

    if options.loss_type == 'logistic':
        matrix = self.matrix * 2 - 1
        matrix = mask_relevancy_4dmatrix(matrix, q_mask, c_mask)
        score = -1 * tf.log(tf.nn.sigmoid(matrix * cRq))
    else:
        score = self.matrix - cRq
        score = 0.5 * score * score
        score = mask_relevancy_4dmatrix(score, q_mask, c_mask)
    KGE_loss = tf.reduce_sum(score, axis=-1)

    self.loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    self.loss = self.loss + options.lamda2 * tf.reduce_sum(tf.layers.flatten(KGE_loss))

    tvars = tf.trainable_variables()
    if self.options.lambda_l2 > 0.0:
        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if 'embedding' not in v.name])
        self.loss = self.loss + self.options.lambda_l2 * l2_loss

    if self.options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adagard':
        optimizer = tf.train.AdagradOptimizer(learning_rate=self.options.learning_rate)

    grads = layer_utils.compute_gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
    # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    if self.options.with_moving_average:
        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_ops = [self.train_op, variables_averages_op]
        self.train_op = tf.group(*train_ops)
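# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not the repo's Rescal implementation):
# the co-attention bias above expects `cRq` with shape
# [batch, passage_len, question_len, relation_dim], i.e. one bilinear
# (RESCAL-style) score per relation for every passage/question lemma pair.
# A minimal version of that contraction could look like:
#
#   # R: [relation_dim, kg_dim, kg_dim] trainable relation matrices
#   # c_kg_emb: [batch, passage_len, kg_dim], q_kg_emb: [batch, question_len, kg_dim]
#   cR  = tf.einsum("bpk,rkl->bprl", c_kg_emb, R)      # apply each relation matrix
#   cRq = tf.einsum("bprl,bql->bpqr", cR, q_kg_emb)    # score against question lemmas
# ---------------------------------------------------------------------------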
def create_mpcnn_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
    """Build the multi-perspective CNN (MP-CNN) matching graph: block-A/block-B
    convolution features, mpcnn_match_func, and a two-layer prediction head."""
    options = self.options
    # ======word representation layer======
    in_question_repres = []
    in_passage_repres = []
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.embedding = tf.placeholder(tf.float32, shape=word_vocab.word_vecs.shape)
            self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=self.embedding,
                                                  dtype=tf.float32)  # tf.constant(word_vocab.word_vecs)

        in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words)  # [batch_size, question_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(self.in_question_words)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(self.in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim]
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim,
                                                                 options.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim,
                                                                options.highway_layer_num)

    in_question_repres = tf.expand_dims(in_question_repres, -1)  # [batch_size, question_len, word_dim, 1]
    in_passage_repres = tf.expand_dims(in_passage_repres, -1)  # [batch_size, passage_len, word_dim, 1]

    # ======Multi-perspective CNN Matching======
    filter_sizes = options.filter_sizes
    num_filters = options.num_filters
    poolings = list([tf.reduce_max, tf.reduce_min, tf.reduce_mean])[:options.num_poolings]

    W1 = [tf.get_variable("W1_%s" % i,
                          initializer=tf.truncated_normal([filter_sizes[i], input_dim, 1, num_filters[0]],
                                                          stddev=0.1),
                          dtype=tf.float32) for i in range(len(filter_sizes))]
    b1 = [tf.get_variable("b1_%s" % i,
                          initializer=tf.constant(0.01, shape=[num_filters[0]]),
                          dtype=tf.float32) for i in range(len(filter_sizes))]

    W2 = [tf.get_variable("W2_%s" % i,
                          initializer=tf.truncated_normal([filter_sizes[i], input_dim, 1, num_filters[1]],
                                                          stddev=0.1),
                          dtype=tf.float32) for i in range(len(filter_sizes) - 1)]
    b2 = [tf.get_variable("b2_%s" % i,
                          initializer=tf.constant(0.01, shape=[num_filters[1], input_dim]),
                          dtype=tf.float32) for i in range(len(filter_sizes) - 1)]

    sent1_blockA = layer_utils.build_block_A(in_question_repres, filter_sizes, poolings, W1, b1,
                                             is_training)  # len(poolings) * len(filter_sizes) * [batch_size, 1, num_filters_A]
    sent2_blockA = layer_utils.build_block_A(in_passage_repres, filter_sizes, poolings, W1, b1,
                                             is_training)  # len(poolings) * len(filter_sizes) * [batch_size, 1, num_filters_A]
    sent1_blockB = layer_utils.build_block_B(in_question_repres, filter_sizes, poolings, W2, b2,
                                             is_training)  # (len(poolings)-1) * (len(filter_sizes)-1) * [batch_size, embed_size, num_filters_B]
    sent2_blockB = layer_utils.build_block_B(in_passage_repres, filter_sizes, poolings, W2, b2,
                                             is_training)  # (len(poolings)-1) * (len(filter_sizes)-1) * [batch_size, embed_size, num_filters_B]

    (match_representation, match_dim) = match_utils.mpcnn_match_func(sent1_blockA, sent2_blockA,
                                                                     sent1_blockB, sent2_blockB,
                                                                     poolings, filter_sizes, num_filters)

    # ========Prediction Layer=========
    w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.nn.relu(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    self.predictions = tf.argmax(self.prob, 1)
    gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    correct = tf.nn.in_top_k(logits, self.truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

    if not is_training:
        return

    if options.with_f1_metric:
        # acc, acc_op = tf.metrics.accuracy(labels=self.truth, predictions=self.predictions)
        precision, pre_op = tf.metrics.precision(labels=self.truth, predictions=self.predictions)
        recall, rec_op = tf.metrics.recall(labels=self.truth, predictions=self.predictions)
        f1 = 2 * precision * recall / (precision + recall + 1e-6)
        self.loss = self.loss - 0.1 * tf.reduce_mean(f1)

    tvars = tf.trainable_variables()
    if self.options.lambda_l1 > 0.0:
        l1_loss = tf.add_n([tf.contrib.layers.l1_regularizer(self.options.lambda_l1)(v)
                            for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + l1_loss
    if self.options.lambda_l2 > 0.0:
        # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        l2_loss = tf.add_n([tf.contrib.layers.l2_regularizer(self.options.lambda_l2)(v)
                            for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + l2_loss

    if self.options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
    elif self.options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

    grads = layer_utils.compute_gradients(self.loss, tvars)
    grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
    # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    if self.options.with_moving_average:
        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_ops = [self.train_op, variables_averages_op]
        self.train_op = tf.group(*train_ops)
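# ---------------------------------------------------------------------------
# Configuration sketch (hypothetical values, not from the original repo): the
# MP-CNN block above expects list-valued options, e.g.
#
#   options.filter_sizes = [1, 2, 3]   # n-gram widths for the conv filters
#   options.num_filters  = [64, 32]    # num_filters_A (block A), num_filters_B (block B)
#   options.num_poolings = 3           # use max, min and mean pooling
# ---------------------------------------------------------------------------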
def _build(self, in_passage_words, passage_lengths, in_question_words_soft, question_lengths, truth):
    """truth: an int in [0, num_classes) indicating the entailment label."""
    num_classes = self.num_classes
    word_vocab = self.word_vocab
    is_training = self.is_training
    global_step = self.global_step
    options = self.options

    # ======word representation layer======
    in_question_repres = []
    in_passage_repres = []
    input_dim = 0
    if word_vocab is not None:
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=tf.constant(word_vocab.word_vecs),
                                                  dtype=tf.float32)

        # in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, in_question_words_soft)  # [batch_size, question_len, word_dim]
        in_question_word_repres = tx.utils.soft_sequence_embedding(self.word_embedding, in_question_words_soft)
        in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, in_passage_words)  # [batch_size, passage_len, word_dim]
        in_question_repres.append(in_question_word_repres)
        in_passage_repres.append(in_passage_word_repres)

        input_shape = tf.shape(in_question_words_soft)
        batch_size = input_shape[0]
        question_len = input_shape[1]
        input_shape = tf.shape(in_passage_words)
        passage_len = input_shape[1]
        input_dim += word_vocab.word_dim

    in_question_repres = tf.concat(axis=2, values=in_question_repres)  # [batch_size, question_len, dim]
    in_passage_repres = tf.concat(axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

    if is_training:
        in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
        in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

    mask = tf.sequence_mask(passage_lengths, passage_len, dtype=tf.float32)  # [batch_size, passage_len]
    question_mask = tf.sequence_mask(question_lengths, question_len, dtype=tf.float32)  # [batch_size, question_len]

    # ======Highway layer======
    if options.with_highway:
        with tf.variable_scope("input_highway"):
            in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim,
                                                                 options.highway_layer_num)
            tf.get_variable_scope().reuse_variables()
            in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim,
                                                                options.highway_layer_num)

    # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
    # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

    # ========Bilateral Matching=====
    (match_representation, match_dim) = match_utils.bilateral_match_func(
        in_question_repres, in_passage_repres,
        question_lengths, passage_lengths, question_mask, mask, input_dim,
        is_training, options=options)

    # ========Prediction Layer=========
    # match_dim = 4 * self.options.aggregation_lstm_dim
    w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)], dtype=tf.float32)
    b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
    w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes], dtype=tf.float32)
    b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

    # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
    logits = tf.matmul(match_representation, w_0) + b_0
    logits = tf.tanh(logits)
    if is_training:
        logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
    logits = tf.matmul(logits, w_1) + b_1

    self.prob = tf.nn.softmax(logits)
    gold_matrix = tf.one_hot(truth, num_classes, dtype=tf.float32)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))
    correct = tf.nn.in_top_k(logits, truth, 1)
    self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
    self.predictions = tf.argmax(self.prob, 1)

    if is_training:
        tvars = tf.trainable_variables()
        if self.options.lambda_l2 > 0.0:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + self.options.lambda_l2 * l2_loss

        if self.options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if self.options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)

    return {
        "logits": logits,
        "prob": self.prob,
        "loss": self.loss,
        "correct": correct,
        "eval_correct": self.eval_correct,
        "predictions": self.predictions,
    }
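# ---------------------------------------------------------------------------
# Note (sketch, not from the original code): `tx.utils.soft_sequence_embedding`
# above embeds a *soft* question, i.e. `in_question_words_soft` is expected to
# be a distribution over the vocabulary of shape
# [batch_size, question_len, vocab_size], and the result is the
# probability-weighted sum of word vectors -- roughly equivalent to:
#
#   in_question_word_repres = tf.einsum("bqv,vd->bqd",
#                                       in_question_words_soft,
#                                       self.word_embedding)
# ---------------------------------------------------------------------------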