def __init__(self, template_vocab=None, word_vocab=None, char_vocab=None,
             POS_vocab=None, NER_vocab=None, options=None, mode='ce_train'):
    """Build the template-aware question-generation graph.

    here 'mode', whose value can be:
    'ce_train', 'rl_train', 'evaluate', 'evaluate_bleu', 'decode'.
    it is different from 'mode_gen' in generator_utils.py;
    value of 'mode_gen' can be 'ce_train', 'loss', 'greedy' or 'sample'.

    NOTE(review): this body previously carried unresolved merge-conflict
    duplicates (old and new call variants side by side, delimited by ###).
    The template-aware variants are kept here, consistent with the
    template_vocab parameter and the template_words/template_lengths
    placeholders used below.
    """
    self.mode = mode
    # is_training controls whether to use dropout
    is_training = True if mode in ('ce_train', ) else False

    self.options = options
    self.word_vocab = word_vocab
    self.template_vocab = template_vocab

    # create placeholders
    self.create_placeholders(options)

    # create encoder
    if options.two_sent_inputs:
        # take two sentences as inputs
        self.encoder = matching_encoder_utils.MatchingEncoder(
            self, options, word_vocab=word_vocab, char_vocab=char_vocab,
            POS_vocab=POS_vocab, NER_vocab=NER_vocab)
    else:
        # take one sentence as input
        self.encoder = encoder_utils.SeqEncoder(
            self, options, word_vocab=word_vocab, char_vocab=char_vocab,
            POS_vocab=POS_vocab, NER_vocab=NER_vocab)

    # encode the input instance
    self.encode_dim, self.encode_hiddens, self.init_decoder_state = \
        self.encoder.encode(is_training=is_training)

    # project to phrase representation
    if options.with_phrase_projection:
        phrase_projection_layer = \
            phrase_projection_layer_utils.PhraseProjectionLayer(self)
        self.phrase_representations = \
            phrase_projection_layer.project_to_phrase_representation(
                self.encode_hiddens)
        self.encode_dim = 2 * self.encode_dim
    else:
        self.phrase_representations = self.encode_hiddens
        self.phrase_idx = self.in_passage_words
        self.phrase_lengths = self.passage_lengths
    # NOTE(review): on the with_phrase_projection path, self.phrase_idx and
    # self.phrase_lengths are assumed to be set elsewhere (presumably in
    # create_placeholders) -- confirm.
    phrase_length_max = tf.shape(self.phrase_idx)[1]
    self.phrase_mask = tf.sequence_mask(self.phrase_lengths,
                                        phrase_length_max,
                                        dtype=tf.float32)

    loss_weights = tf.sequence_mask(
        self.answer_lengths, options.max_answer_len,
        dtype=tf.float32)  # [batch_size, gen_steps]

    with variable_scope.variable_scope("generator"):
        # create the template-aware copy/coverage attention generator
        self.generator = generator_utils.CovCopyAttenGen(
            self, options, word_vocab, template_vocab)
        # calculate encoder_features
        self.encoder_features = self.generator.calculate_encoder_features(
            self.phrase_representations, self.encode_dim)

        if mode == 'decode':
            self.context_t_1 = tf.placeholder(
                tf.float32, [None, self.encode_dim],
                name='context_t_1')  # [batch_size, encode_dim]
            self.coverage_t_1 = tf.placeholder(
                tf.float32, [None, None],
                name='coverage_t_1')  # [batch_size, passage_len]
            self.word_t = tf.placeholder(tf.int32, [None],
                                         name='word_t')  # [batch_size]
            (self.state_t, self.context_t, self.coverage_t,
             self.attn_dist_t, self.p_gen_t, self.ouput_t,
             self.topk_log_probs, self.topk_ids, self.greedy_prediction,
             self.multinomial_prediction) = self.generator.decode_mode(
                 word_vocab, self.template_words, self.template_lengths,
                 options.beam_size, self.init_decoder_state,
                 self.context_t_1, self.coverage_t_1, self.word_t,
                 self.phrase_representations, self.encoder_features,
                 self.phrase_idx, self.phrase_mask)
            # not building training op for this mode
            return
        elif mode == 'evaluate_bleu':
            # NOTE(review): this branch still uses the pre-template
            # train_mode signature -- update before using this mode.
            _, _, self.greedy_words = self.generator.train_mode(
                word_vocab, self.encode_dim, self.phrase_representations,
                self.encoder_features, self.phrase_idx, self.phrase_mask,
                self.init_decoder_state, self.gen_input_words,
                self.in_answer_words, loss_weights, mode_gen='greedy')
            # not building training op for this mode
            return
        elif mode in ('ce_train', 'evaluate', ):
            self.accu, self.loss, _ = self.generator.train_mode(
                word_vocab, self.template_words, self.template_lengths,
                self.encode_dim, self.phrase_representations,
                self.encoder_features, self.phrase_idx, self.phrase_mask,
                self.init_decoder_state, self.gen_input_words,
                self.in_answer_words, loss_weights, mode_gen='ce_train')
            if mode == 'evaluate':
                return  # not building training op for evaluation
        elif mode == 'rl_train':
            # NOTE(review): the RL branches still use the pre-template
            # train_mode signature -- confirm before use.
            _, self.loss, _ = self.generator.train_mode(
                word_vocab, self.encode_dim, self.phrase_representations,
                self.encoder_features, self.phrase_idx, self.phrase_mask,
                self.init_decoder_state, self.gen_input_words,
                self.in_answer_words, loss_weights, mode_gen='loss')
            # reuse the generator variables for sampling/greedy rollouts
            tf.get_variable_scope().reuse_variables()
            _, _, self.sampled_words = self.generator.train_mode(
                word_vocab, self.encode_dim, self.phrase_representations,
                self.encoder_features, self.phrase_idx, self.phrase_mask,
                self.init_decoder_state, self.gen_input_words,
                self.in_answer_words, None, mode_gen='sample')
            _, _, self.greedy_words = self.generator.train_mode(
                word_vocab, self.encode_dim, self.phrase_representations,
                self.encoder_features, self.phrase_idx, self.phrase_mask,
                self.init_decoder_state, self.gen_input_words,
                self.in_answer_words, None, mode_gen='greedy')
        elif mode == 'rl_train_for_phrase':
            _, self.loss, _ = self.generator.train_mode(
                word_vocab, self.encode_dim, self.phrase_representations,
                self.encoder_features, self.phrase_idx, self.phrase_mask,
                self.init_decoder_state, self.gen_input_words,
                self.in_answer_words, loss_weights, mode_gen='loss')

    # Build the training op. The two optimizer branches shared identical
    # L2/clipping code, so only the optimizer choice differs here.
    clipper = 50
    if options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=options.learning_rate)
    elif options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=options.learning_rate)
    else:
        assert False, '%s not supported optimize type' % options.optimize_type
    tvars = tf.trainable_variables()
    if options.lambda_l2 > 0.0:
        # L2-regularize weight matrices only (ndims > 1 skips biases)
        l2_loss = tf.add_n(
            [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + options.lambda_l2 * l2_loss
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab, char_vocab=None, POS_vocab=None, NER_vocab=None, options=None, \
    has_ref=True, is_training=True):
    """Build the graph-based multi-candidate reading-comprehension model.

    Encodes passage and question separately, builds entity representations
    from start/end positions, optionally refines them with a GRN or GCN
    graph encoder, matches each refinement step against the question, and
    integrates all matching results into a distribution over candidates.
    """
    # is_training controls whether to use dropout and update parameters
    self.is_training = is_training
    # has_ref distinguish 'dev' evaluation from 'final test' evaluation
    self.has_ref = has_ref
    self.options = options
    self.word_vocab = word_vocab

    # separately encode passage and question (question reuses embeddings)
    self.passage_encoder = encoder_utils.SeqEncoder(
        options, word_vocab, POS_vocab=POS_vocab, NER_vocab=NER_vocab)
    self.question_encoder = encoder_utils.SeqEncoder(
        options, word_vocab, POS_vocab=POS_vocab, NER_vocab=NER_vocab,
        embed_reuse=True)
    with tf.variable_scope('passage'):
        passage_dim, passage_repre, passage_mask = \
            self.passage_encoder.encode(is_training=is_training)
    with tf.variable_scope('question'):
        question_dim, question_repre, question_mask = \
            self.question_encoder.encode(is_training=is_training)

    # modeling entities: each entity is represented by the concatenation of
    # the passage states at its start and end positions
    self.entity_starts = tf.placeholder(tf.int32, [None, None],
                                        'entity_starts')
    self.entity_ends = tf.placeholder(tf.int32, [None, None], 'entity_ends')
    self.entity_lengths = tf.placeholder(tf.int32, [None], 'entity_lengths')
    batch_size = tf.shape(self.entity_starts)[0]
    entity_len_max = tf.shape(self.entity_starts)[1]
    entity_mask = tf.sequence_mask(self.entity_lengths, entity_len_max,
                                   dtype=tf.float32)  # [batch, entity]
    entity_st_rep = operation_utils.collect_node(
        passage_repre, self.entity_starts)  # [batch, entity, rep_dim]
    entity_ed_rep = operation_utils.collect_node(
        passage_repre, self.entity_ends)  # [batch, entity, rep_dim]
    entity_rep = tf.concat([entity_st_rep, entity_ed_rep],
                           axis=2)  # [batch, entity, rep_dim * 2]
    entity_dim = passage_dim * 2

    # full-question vector: first state concatenated with the last state
    qfull_st_rep = question_repre[:, 0, :]  # [batch, rep_dim]
    qfull_ed_rep = operation_utils.collect_final_step(
        question_repre,
        self.question_encoder.sequence_lengths - 1)  # [batch, rep_dim]
    qfull_rep = tf.concat([qfull_st_rep, qfull_ed_rep],
                          axis=1)  # [batch, rep_dim * 2]
    qfull_dim = question_dim * 2

    matching_results = []
    rst_seq = self.perform_matching(entity_rep, entity_dim, entity_mask,
                                    question_repre, qfull_rep, question_dim,
                                    question_mask, scope_name='seq_match',
                                    options=options, is_training=is_training)
    matching_results.append(rst_seq)

    # encode entity representation with GRN
    if options.with_grn or options.with_gcn:
        # merge question representation into passage
        q4p_rep = tf.tile(
            tf.expand_dims(qfull_rep, 1),  # [batch, 1, rep_dim * 2]
            [1, entity_len_max, 1])  # [batch, entity, rep_dim * 2]
        entity_rep = tf.concat([entity_rep, q4p_rep], axis=2)
        entity_dim = entity_dim + qfull_dim
        # compress before going to GRN
        merge_w = tf.get_variable('merge_w', [entity_dim, options.merge_dim])
        merge_b = tf.get_variable('merge_b', [options.merge_dim])
        entity_rep = tf.reshape(entity_rep, [-1, entity_dim])
        entity_rep = tf.matmul(entity_rep, merge_w) + merge_b
        entity_rep = tf.reshape(
            entity_rep, [batch_size, entity_len_max, options.merge_dim])
        entity_rep = entity_rep * tf.expand_dims(entity_mask, axis=-1)
        entity_dim = options.merge_dim
        # main part: encoding
        scope_name = 'GRN' if options.with_grn else 'GCN'
        with tf.variable_scope(scope_name):
            self.edges = tf.placeholder(tf.int32, [None, None, None],
                                        'edges')
            self.edges_mask = tf.placeholder(tf.float32, [None, None, None],
                                             'edges_mask')
            if options.with_grn:
                print("With Graph recurrent network as the graph encoder")
                self.graph_encoder = graph_encoder_utils.GraphEncoder(
                    entity_rep, entity_mask, entity_dim, self.edges,
                    self.edges_mask, is_training=is_training,
                    options=options)
            else:
                print("With GCN as the graph encoder")
                self.graph_encoder = gcn_encoder_utils.GCNEncoder(
                    entity_rep, entity_mask, entity_dim, self.edges,
                    self.edges_mask, is_training=is_training,
                    options=options)
        # match the entity states of every graph-encoding step against the
        # question, collecting one matching result per step
        for i in range(options.num_grn_step):
            if options.grn_rep_type == 'hidden':
                entity_grn_rep = self.graph_encoder.grn_historys[
                    i]  # [batch, entity, grn_dim]
                entity_grn_dim = options.grn_dim
            elif options.grn_rep_type == 'hidden_embed':
                entity_grn_rep = tf.concat(
                    [self.graph_encoder.grn_historys[i], entity_rep],
                    2)  # [batch, entity, grn_dim + merge_dim]
                entity_grn_dim = options.grn_dim + entity_dim
            else:
                assert False, '%s not supported yet' % options.grn_rep_type
            if options.with_multi_perspective:
                assert entity_grn_dim == question_dim
            rst_grn = self.perform_matching(entity_grn_rep, entity_grn_dim,
                                            entity_mask, question_repre,
                                            qfull_rep, question_dim,
                                            question_mask,
                                            scope_name='grn%d_match' % i,
                                            options=options,
                                            is_training=is_training)
            matching_results.append(rst_grn)

    self.candidates = tf.placeholder(
        tf.int32, [None, None, None],
        'candidates')  # [batch, c_num, c_occur]
    self.candidates_len = tf.placeholder(tf.float32, [None],
                                         'candidates_len')  # [batch]
    self.candidates_occur_mask = tf.placeholder(
        tf.float32, [None, None, None],
        'candidates_occur_mask')  # [batch, c_num, c_occur]

    # matching_results: list of [batch, cands]
    self.attn_dist = self.perform_integration(matching_results,
                                              scope_name='integration',
                                              options=options,
                                              is_training=is_training)
    cand_num = tf.shape(self.candidates)[1]
    self.topk_probs, self.topk_ids = tf.nn.top_k(self.attn_dist, k=cand_num,
                                                 name='topK')
    self.out = tf.argmax(self.attn_dist, axis=-1, output_type=tf.int32)
    if not has_ref:
        return

    self.ref = tf.placeholder(tf.int32, [None], 'ref')
    self.accu = tf.reduce_sum(
        tf.cast(tf.equal(self.out, self.ref), dtype=tf.float32))
    # NOTE(review): attn_dist is assumed strictly positive here; otherwise
    # tf.log yields -inf/NaN. Sibling models clip with a small epsilon
    # before tf.log -- consider doing the same. TODO confirm.
    xent = -tf.reduce_sum(
        tf.one_hot(self.ref, cand_num) * tf.log(self.attn_dist), axis=-1)
    self.loss = tf.reduce_mean(xent)
    if not is_training:
        return

    with tf.variable_scope("training_op"), tf.device("/gpu:1"):
        if options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=options.learning_rate)
        elif options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=options.learning_rate)
        # was: options.__dict__.has_key("max_grad_norm") -- dict.has_key()
        # was removed in Python 3; hasattr() is the equivalent check and
        # matches the sibling models in this file.
        clipper = 50 if not hasattr(options,
                                    "max_grad_norm") else options.max_grad_norm
        print("Max gradient norm {}".format(clipper))
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2-regularize weight matrices only (ndims > 1 skips biases)
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab, char_vocab, pos_vocab, edgelabel_vocab,
             options, mode='train'):
    """Build the graph-encoder relation-classification model.

    Encodes the sentence sequentially, refines it with a graph encoder over
    the (forest) dependency structure, pools the two entity spans from the
    graph states, and classifies the pair into one of
    options.num_relations relations.
    """
    # the value of 'mode' can be: 'train', 'evaluate'
    self.mode = mode
    # is_training controls whether to use dropout
    is_training = True if mode in ('train', ) else False
    self.options = options
    self.word_vocab = word_vocab
    self.char_vocab = char_vocab
    self.pos_vocab = pos_vocab

    # sequential encoder that can take 0 LSTM layers
    self.encoder = encoder_utils.SeqEncoder(options, word_vocab, char_vocab,
                                            pos_vocab)
    word_repres, word_dim, sentence_repres, sentence_dim, seq_mask = \
        self.encoder.encode(is_training=is_training)

    # encode the input instance
    # encoder.graph_hidden [batch, node_num, vsize]
    # encoder.graph_cell [batch, node_num, vsize]
    self.graph_encoder = graph_encoder_utils.GraphEncoder(
        options, word_repres, word_dim, sentence_repres, sentence_dim,
        seq_mask, edgelabel_vocab, is_training=is_training)

    # collect placeholders (exposed so the feeder can fill them)
    self.sentence_words = self.encoder.sentence_words
    self.sentence_lengths = self.encoder.sentence_lengths
    if options.with_char:
        self.sentence_chars = self.encoder.sentence_chars
        self.sentence_chars_lengths = self.encoder.sentence_chars_lengths
    if options.with_POS:
        self.sentence_POSs = self.encoder.sentence_POSs
    self.in_neigh_indices = self.graph_encoder.in_neighbor_indices
    self.in_neigh_edges = self.graph_encoder.in_neighbor_edges
    self.in_neigh_mask = self.graph_encoder.in_neighbor_mask
    self.out_neigh_indices = self.graph_encoder.out_neighbor_indices
    self.out_neigh_edges = self.graph_encoder.out_neighbor_edges
    self.out_neigh_mask = self.graph_encoder.out_neighbor_mask
    if options.forest_prob_aware and options.forest_type != '1best':
        # edge probabilities are only meaningful for forest (n-best) inputs
        self.in_neigh_prob = self.graph_encoder.in_neighbor_prob
        self.out_neigh_prob = self.graph_encoder.out_neighbor_prob
    self.entity_indices = tf.placeholder(tf.int32, [None, None, None],
                                         name="entity_indices")
    self.entity_indices_mask = tf.placeholder(tf.float32,
                                              [None, None, None],
                                              name="entity_indices_mask")

    # collect inputs for final classifier
    final_repres = self.graph_encoder.graph_hiddens
    final_shape = tf.shape(final_repres)
    batch_size = final_shape[0]
    sentence_size_max = final_shape[1]
    # [batch, 2, indices, sentence_dim]
    entity_repres = collect_by_indices(final_repres, self.entity_indices)
    entity_repres = entity_repres * tf.expand_dims(self.entity_indices_mask,
                                                   axis=-1)
    # [batch, 2, sentence_dim]  -- mean-pool each entity span
    entity_repres = tf.reduce_mean(entity_repres, axis=2)
    # [batch, 2*sentence_dim]
    h_final = tf.reshape(entity_repres, [batch_size, 2 * sentence_dim])

    ### regarding Zhang et al., EMNLP 2018
    #h_sent = tf.reduce_max(final_repres, axis=1)
    #hsent_loss = None
    #if options.lambda_l2_hsent > 0.0:
    #    hsent_loss = tf.reduce_mean(
    #        tf.reduce_sum(h_sent * h_sent, axis=-1), axis=-1)
    #h_s = tf.reduce_max(
    #    range_repres(final_repres, sentence_size_max, self.sbj_starts, self.sbj_ends),
    #    axis=1)
    #h_o = tf.reduce_max(
    #    range_repres(final_repres, sentence_size_max, self.obj_starts, self.obj_ends),
    #    axis=1)
    #h_final = tf.concat([h_sent, h_s, h_o], axis=1) # [batch, sentence_dim*3]
    #h_final = tf.layers.dense(h_final, options.ffnn_size, name="ffnn_1", activation=tf.nn.relu) # [batch, ffnn_size]
    #h_final = tf.layers.dense(h_final, options.ffnn_size, name="ffnn_2", activation=tf.nn.relu) # [batch, ffnn_size]

    ## [batch, class_num] -- clip so the tf.log below cannot see zeros
    self.distribution = _clip_and_normalize(
        tf.layers.dense(h_final, options.num_relations, name="ffnn_out",
                        activation=tf.nn.softmax), 1.0e-6)
    self.rsts = tf.argmax(self.distribution, axis=-1, output_type=tf.int32)

    ## calculating accuracy
    self.refs = tf.placeholder(tf.int32, [
        None,
    ])
    self.accu = tf.reduce_sum(
        tf.cast(tf.equal(self.rsts, self.refs), dtype=tf.float32))

    ## calculating loss
    # xent: [batch]
    xent = -tf.reduce_sum(tf.one_hot(self.refs, options.num_relations) *
                          tf.log(self.distribution),
                          axis=-1)
    self.loss = tf.reduce_mean(xent)

    if mode != 'train':
        print('Return from here, just evaluate')
        return

    #if options.lambda_l2_hsent > 0.0:
    #    self.loss += hsent_loss * options.lambda_l2_hsent

    clipper = 5
    tvars = tf.trainable_variables()
    if options.lambda_l2 > 0.0:
        # L2-regularize weight matrices only (ndims > 1 skips biases)
        l2_loss = tf.add_n(
            [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss += options.lambda_l2 * l2_loss

    # optional learning-rate decay schedules, driven by options.decay
    if hasattr(options, "decay") and options.decay != "none":
        global_step = tf.Variable(0, trainable=False)
        if options.decay == 'piece':
            # 11 rates (each 0.9x the previous) over 10 boundaries
            values, bounds = [
                options.learning_rate,
            ], []
            for i in range(10):
                values.append(values[-1] * 0.9)
                bounds.append(options.trn_bch_num * 10 * i)
            learning_rate = tf.train.piecewise_constant(
                global_step, bounds, values)
        elif options.decay == 'poly':
            decay_steps = options.trn_bch_num * options.max_epochs
            learning_rate = tf.train.polynomial_decay(
                options.learning_rate, global_step, decay_steps,
                end_learning_rate=0.1 * options.learning_rate, power=0.5)
        elif options.decay == 'cos':
            decay_steps = options.trn_bch_num * options.max_epochs
            learning_rate = tf.train.cosine_decay(options.learning_rate,
                                                  global_step, decay_steps,
                                                  alpha=0.1)
        else:
            assert False, 'not supported'
    else:
        global_step = None
        learning_rate = options.learning_rate

    if options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
    elif options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    else:
        assert False, 'not supported optimize type'

    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      clipper)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)
    extra_train_ops = []
    train_ops = [train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab_enc, word_vocab_dec, char_vocab,
             Edgelabel_vocab, options=None, mode='ce_train'):
    """Build the graph-to-sequence generation model with dual attention
    over the graph encoder states and the source sequence states.
    """
    # here 'mode', whose value can be:
    # 'ce_train', 'rl_train', 'evaluate', 'evaluate_bleu', 'decode'.
    # it is different from 'mode_gen' in generator_utils.py
    # value of 'mode_gen' can be ['ce_loss', 'rl_loss', 'greedy' or 'sample']
    self.mode = mode
    # is_training controls whether to use dropout
    is_training = True if mode in ('ce_train', ) else False
    self.options = options
    self.word_vocab_enc = word_vocab_enc
    self.word_vocab_dec = word_vocab_dec
    self.create_placeholders(options)

    # encode the input instance
    # encoder.graph_hidden [batch, node_num, vsize]
    # encoder.graph_cell [batch, node_num, vsize]
    with tf.variable_scope('graph_encoder'):
        self.encoder = graph_encoder_utils.GraphEncoder(
            word_vocab=word_vocab_enc,
            edge_label_vocab=Edgelabel_vocab,
            char_vocab=char_vocab,
            is_training=is_training,
            options=options,
            device_str='/gpu:1')

    with tf.variable_scope('src_encoder'), tf.device('/gpu:1'):
        self.src_encoder = encoder_utils.SeqEncoder(
            self, options, word_vocab=word_vocab_enc, char_vocab=char_vocab)
        self.src_hidden_dim, self.src_hiddens, self.src_decinit = \
            self.src_encoder.encode(is_training=is_training)
        self.src_mask = self.src_encoder.passage_mask

    # ============== Choices of attention memory ================
    if options.attention_type == 'hidden':
        self.encoder_dim = options.neighbor_vector_dim
        self.encoder_states = self.encoder.graph_hiddens
    elif options.attention_type == 'hidden_cell':
        self.encoder_dim = options.neighbor_vector_dim * 2
        self.encoder_states = tf.concat(
            [self.encoder.graph_hiddens, self.encoder.graph_cells], 2)
    elif options.attention_type == 'hidden_embed':
        self.encoder_dim = options.neighbor_vector_dim + options.node_dim
        self.encoder_states = tf.concat([
            self.encoder.graph_hiddens, self.encoder.node_representations
        ], 2)
    else:
        assert False, '%s not supported yet' % options.attention_type

    # ============== Choices of initializing decoder state =============
    if options.way_init_decoder == 'src':
        new_c, new_h = self.src_decinit.c, self.src_decinit.h
    elif options.way_init_decoder == 'zero':
        new_c = tf.zeros(
            [self.encoder.batch_size, options.gen_hidden_size])
        new_h = tf.zeros(
            [self.encoder.batch_size, options.gen_hidden_size])
    elif options.way_init_decoder == 'avg':
        new_c = tf.reduce_mean(self.encoder.graph_cells, axis=1)
        new_h = tf.reduce_mean(self.encoder.graph_hiddens, axis=1)
    elif options.way_init_decoder == 'root':
        # node 0 is assumed to be the graph root -- TODO confirm
        new_c = self.encoder.graph_cells[:, 0, :]
        new_h = self.encoder.graph_hiddens[:, 0, :]
    else:
        assert False, 'way to initial decoder (%s) not supported' % options.way_init_decoder
    self.init_decoder_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)

    # prepare src-side input for decoder
    # prepare AMR-side input for decoder
    self.nodes = self.encoder.passage_nodes
    self.nodes_num = self.encoder.passage_nodes_size
    if options.with_char:
        self.nodes_chars = self.encoder.passage_nodes_chars
        self.nodes_chars_num = self.encoder.passage_nodes_chars_size
    self.nodes_mask = self.encoder.passage_nodes_mask

    self.in_neigh_indices = self.encoder.passage_in_neighbor_indices
    self.in_neigh_edges = self.encoder.passage_in_neighbor_edges
    self.in_neigh_mask = self.encoder.passage_in_neighbor_mask
    self.out_neigh_indices = self.encoder.passage_out_neighbor_indices
    self.out_neigh_edges = self.encoder.passage_out_neighbor_edges
    self.out_neigh_mask = self.encoder.passage_out_neighbor_mask

    loss_weights = tf.sequence_mask(
        self.answer_len, options.max_answer_len,
        dtype=tf.float32)  # [batch_size, gen_steps]

    with variable_scope.variable_scope("generator"):
        # create generator
        self.generator = generator_utils.CovCopyAttenGen(
            self, options, word_vocab_dec)
        # calculate encoder_features
        with variable_scope.variable_scope("encoder_feats"):
            self.encoder_features = self.generator.calculate_encoder_features(
                self.encoder_states, self.encoder_dim)
        with variable_scope.variable_scope("src_feats"):
            self.src_features = self.generator.calculate_encoder_features(
                self.src_hiddens, self.src_hidden_dim)

        if mode == 'decode':
            self.context_encoder_t_1 = tf.placeholder(
                tf.float32, [None, self.encoder_dim],
                name='context_encoder_t_1')  # [batch_size, encoder_dim]
            self.context_src_t_1 = tf.placeholder(
                tf.float32, [None, self.src_hidden_dim],
                name='context_src_t_1')  # [batch_size, src_dim]
            self.coverage_t_1 = tf.placeholder(
                tf.float32, [None, None],
                name='coverage_t_1')  # [batch_size, encoder_dim]
            self.word_t = tf.placeholder(tf.int32, [None],
                                         name='word_t')  # [batch_size]
            (self.state_t, self.context_encoder_t, self.context_src_t,
             self.coverage_t, self.attn_dist_t, self.p_gen_t, self.ouput_t,
             self.topk_log_probs, self.topk_ids, self.greedy_prediction,
             self.multinomial_prediction) = self.generator.decode_mode(
                 word_vocab_dec, options.beam_size, self.init_decoder_state,
                 self.context_encoder_t_1, self.context_src_t_1,
                 self.coverage_t_1, self.word_t, self.encoder_states,
                 self.encoder_features, self.nodes, self.nodes_mask,
                 self.src_hiddens, self.src_features, self.src_mask)
            # not building training op for this mode
            return
        elif mode == 'evaluate_bleu':
            _, _, self.greedy_words = self.generator.train_mode(
                word_vocab_dec, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                loss_weights, mode_gen='greedy')
            # not building training op for this mode
            return
        elif mode in ('ce_train', 'evaluate', ):
            self.accu, self.loss, _ = self.generator.train_mode(
                word_vocab_dec, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.src_hidden_dim, self.src_hiddens, self.src_features,
                self.src_mask, self.init_decoder_state, self.answer_inp,
                self.answer_ref, loss_weights, mode_gen='ce_loss')
            if mode == 'evaluate':
                return  # not building training op for evaluation
        elif mode == 'rl_train':
            # NOTE(review): these calls use a shorter train_mode signature
            # (no src-side arguments) than the 'ce_train' branch -- verify
            # against generator_utils before using this mode.
            _, self.loss, _ = self.generator.train_mode(
                word_vocab_dec, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                loss_weights, mode_gen='rl_loss')
            # reuse the generator variables for sampling/greedy rollouts
            tf.get_variable_scope().reuse_variables()
            _, _, self.sampled_words = self.generator.train_mode(
                word_vocab_dec, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                None, mode_gen='sample')
            _, _, self.greedy_words = self.generator.train_mode(
                word_vocab_dec, self.encoder_dim, self.encoder_states,
                self.encoder_features, self.nodes, self.nodes_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                None, mode_gen='greedy')

    # build the training op
    if options.optimize_type == 'adadelta':
        clipper = 50
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=options.learning_rate)
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2-regularize weight matrices only (ndims > 1 skips biases)
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    elif options.optimize_type == 'adam':
        clipper = 50
        optimizer = tf.train.AdamOptimizer(
            learning_rate=options.learning_rate)
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2-regularize weight matrices only (ndims > 1 skips biases)
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab=None, char_vocab=None, POS_vocab=None,
             feat_vocab=None, action_vocab=None, options=None,
             mode='ce_train'):
    """Build the action-sequence (soft) generation model with dual
    attention over the input sentence and the concept sequence.

    here 'mode', whose value can be:
    'ce_train', 'rl_train', 'evaluate', 'evaluate_bleu', 'decode', 'topk'.
    it is different from 'mode_gen' in soft_generator_utils.py;
    value of 'mode_gen' can be 'ce_loss', 'rl_loss', 'greedy', 'sample'
    or 'topk'.
    """
    self.mode = mode
    # is_training controls whether to use dropout
    is_training = True if mode in ('ce_train', ) else False
    self.options = options
    self.word_vocab = word_vocab

    with tf.variable_scope('input_encoder'):
        self.input_encoder = encoder_utils.SeqEncoder(
            options, word_vocab=word_vocab, char_vocab=char_vocab,
            POS_vocab=POS_vocab)
        self.input_hidden_dim, self.input_hiddens, self.input_decinit = \
            self.input_encoder.encode(is_training=is_training)
        self.input_mask = self.input_encoder.passage_mask

    with tf.variable_scope('concept_encoder'):
        # the concept side only uses word-level features, so turn the
        # extra feature channels off on a shallow copy of the options
        options_copy = copy.copy(options)
        options_copy.with_char = False
        options_copy.with_POS = False
        options_copy.with_lemma = False
        self.concept_encoder = encoder_utils.SeqEncoder(
            options_copy, word_vocab=word_vocab, char_vocab=None,
            POS_vocab=None)
        self.concept_hidden_dim, self.concept_hiddens, self.concept_decinit = \
            self.concept_encoder.encode(is_training=is_training)
        self.concept_mask = self.concept_encoder.passage_mask

    # fuse the final states of both encoders into the initial decoder
    # state via a shared linear compression
    cat_c = tf.concat([self.input_decinit.c, self.concept_decinit.c],
                      axis=1)
    cat_h = tf.concat([self.input_decinit.h, self.concept_decinit.h],
                      axis=1)
    compress_w = tf.get_variable(
        'compress_w',
        [self.input_hidden_dim + self.concept_hidden_dim,
         options.gen_hidden_size],
        dtype=tf.float32)
    compress_b = tf.get_variable('compress_b', [options.gen_hidden_size],
                                 dtype=tf.float32)
    cat_c = tf.matmul(cat_c, compress_w) + compress_b
    cat_h = tf.matmul(cat_h, compress_w) + compress_b
    self.init_decoder_state = tf.contrib.rnn.LSTMStateTuple(cat_c, cat_h)

    self.create_placeholders(options)
    gen_loss_mask = tf.sequence_mask(
        self.action_len, options.max_answer_len,
        dtype=tf.float32)  # [batch_size, gen_steps]

    with variable_scope.variable_scope("generator"):
        # create generator
        self.generator = soft_generator_utils.AttnGen(
            self, options, action_vocab, feat_vocab)
        # calculate encoder_features
        with variable_scope.variable_scope("input_feats"):
            self.input_features = self.generator.calculate_encoder_features(
                self.input_hiddens, self.input_hidden_dim)
        with variable_scope.variable_scope("concept_feats"):
            self.concept_features = self.generator.calculate_encoder_features(
                self.concept_hiddens, self.concept_hidden_dim)

        if mode == 'decode':
            # [batch_size, encode_dim]
            self.context_input_t_1 = tf.placeholder(
                tf.float32, [None, self.input_hidden_dim],
                name='context_input_t_1')
            # [batch_size, encode_dim]
            self.context_concept_t_1 = tf.placeholder(
                tf.float32, [None, self.concept_hidden_dim],
                name='context_concept_t_1')
            # [batch_size, feat_num]
            self.featidx_t = tf.placeholder(tf.int32, [None, None],
                                            name='featidx_t')
            # [batch_size]
            self.actionidx_t = tf.placeholder(tf.int32, [None],
                                              name='actionidx_t')
            (self.state_t, self.context_input_t, self.context_concept_t,
             self.ouput_t, self.topk_log_probs, self.topk_ids,
             self.greedy_prediction,
             self.sample_prediction) = self.generator.decode_mode(
                 self.init_decoder_state, self.context_input_t_1,
                 self.context_concept_t_1, self.actionidx_t, self.featidx_t,
                 self.input_hiddens, self.input_features, self.input_mask,
                 self.concept_hiddens, self.concept_features,
                 self.concept_mask)
            # not building training op for this mode
            return
        elif mode == 'evaluate_bleu':
            assert False, 'not in use'
            # unreachable while the assert above stands
            _, _, self.greedy_words = self.generator.train_mode(
                self.input_hidden_dim, self.input_hiddens,
                self.input_features, self.input_mask,
                self.concept_hidden_dim, self.concept_hiddens,
                self.concept_features, self.concept_mask,
                self.init_decoder_state, self.action_inp, self.action_ref,
                self.feats, gen_loss_mask, mode_gen='greedy')
            # not building training op for this mode
            return
        elif mode in ('ce_train', 'evaluate', ):
            self.accu, self.loss, self.sampled_words = \
                self.generator.train_mode(
                    self.input_hidden_dim, self.input_hiddens,
                    self.input_features, self.input_mask,
                    self.concept_hidden_dim, self.concept_hiddens,
                    self.concept_features, self.concept_mask,
                    self.init_decoder_state, self.action_inp,
                    self.action_ref, self.feats, gen_loss_mask,
                    mode_gen='ce_loss')
            if mode == 'evaluate':
                return  # not building training op for evaluation
        elif mode == 'topk':
            # NOTE(review): a two-value unpack, unlike the three-value
            # unpacks above -- train_mode is assumed to return a pair for
            # mode_gen='topk'; confirm against soft_generator_utils.
            self.accu, self.sampled_words = self.generator.train_mode(
                self.input_hidden_dim, self.input_hiddens,
                self.input_features, self.input_mask,
                self.concept_hidden_dim, self.concept_hiddens,
                self.concept_features, self.concept_mask,
                self.init_decoder_state, self.action_inp, self.action_ref,
                self.feats, gen_loss_mask, mode_gen='topk')
            return
        elif mode == 'rl_train':
            assert False, 'not in use'
            # unreachable while the assert above stands
            _, self.loss, _ = self.generator.train_mode(
                self.input_hidden_dim, self.input_hiddens,
                self.input_features, self.input_mask,
                self.concept_hidden_dim, self.concept_hiddens,
                self.concept_features, self.concept_mask,
                self.init_decoder_state, self.action_inp, self.action_ref,
                self.feats, gen_loss_mask, mode_gen='rl_loss')
            tf.get_variable_scope().reuse_variables()
            _, _, self.sampled_words = self.generator.train_mode(
                self.input_hidden_dim, self.input_hiddens,
                self.input_features, self.input_mask,
                self.concept_hidden_dim, self.concept_hiddens,
                self.concept_features, self.concept_mask,
                self.init_decoder_state, self.action_inp, self.action_ref,
                self.feats, gen_loss_mask, mode_gen='sample')
            _, _, self.greedy_words = self.generator.train_mode(
                self.input_hidden_dim, self.input_hiddens,
                self.input_features, self.input_mask,
                self.concept_hidden_dim, self.concept_hiddens,
                self.concept_features, self.concept_mask,
                self.init_decoder_state, self.action_inp, self.action_ref,
                self.feats, gen_loss_mask, mode_gen='greedy')
        else:
            assert False, 'unknown mode'  # was misspelled 'unknow mode'

    # Build the training op. The two optimizer branches shared identical
    # L2/clipping code, so only the optimizer choice differs here.
    clipper = 50
    if options.optimize_type == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=options.learning_rate)
    elif options.optimize_type == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=options.learning_rate)
    else:
        assert False, '%s not supported optimize type' % options.optimize_type
    tvars = tf.trainable_variables()
    if options.lambda_l2 > 0.0:
        # L2-regularize weight matrices only (ndims > 1 skips biases)
        l2_loss = tf.add_n(
            [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
        self.loss = self.loss + options.lambda_l2 * l2_loss
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      clipper)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)
def __init__(self, word_vocab_enc, word_vocab_dec, options=None, mode='ce_train'):
    """Build the dual-encoder (linearized-AMR + source sentence) seq2seq graph.

    Args:
        word_vocab_enc: vocabulary shared by both the linearized-AMR encoder
            and the source-sentence encoder.
        word_vocab_dec: vocabulary used by the decoder/generator.
        options: config namespace; read for flags such as way_init_decoder,
            gen_hidden_size, use_coverage, beam_size, max_answer_len,
            optimize_type, learning_rate, lambda_l2, max_gradient_norm.
        mode: one of 'ce_train', 'rl_train', 'evaluate', 'evaluate_bleu',
            'decode'. It is different from 'mode_gen' in generator_utils.py,
            whose values are ['ce_loss', 'rl_loss', 'greedy' or 'sample'].
    """
    self.mode = mode
    # is_training controls whether to use dropout
    is_training = True if mode in ('ce_train', ) else False

    self.options = options
    self.word_vocab_enc = word_vocab_enc
    self.word_vocab_dec = word_vocab_dec
    self.create_placeholders(options)

    # Encode the two input views with separate sequence encoders.
    # Each encode() returns (hidden_dim, hiddens, decoder-init LSTM state).
    with tf.variable_scope('linamr_encoder'):
        self.linamr_encoder = encoder_utils.SeqEncoder(options, word_vocab=word_vocab_enc)
        self.linamr_hidden_dim, self.linamr_hiddens, self.linamr_decinit = \
            self.linamr_encoder.encode(is_training=is_training)
        self.linamr_words = self.linamr_encoder.in_passage_words
        self.linamr_lengths = self.linamr_encoder.passage_lengths
        self.linamr_mask = self.linamr_encoder.passage_mask

    with tf.variable_scope('src_encoder'):
        self.src_encoder = encoder_utils.SeqEncoder(options, word_vocab=word_vocab_enc)
        self.src_hidden_dim, self.src_hiddens, self.src_decinit = \
            self.src_encoder.encode(is_training=is_training)
        self.src_words = self.src_encoder.in_passage_words
        self.src_lengths = self.src_encoder.passage_lengths
        self.src_mask = self.src_encoder.passage_mask

    # ============== Choices of initializing decoder state =============
    if options.way_init_decoder == 'src':
        new_c, new_h = self.src_decinit.c, self.src_decinit.h
    elif options.way_init_decoder == 'linamr':
        new_c, new_h = self.linamr_decinit.c, self.linamr_decinit.h
    elif options.way_init_decoder == 'zero':
        # BUGFIX: the original referenced self.encoder.batch_size, but no
        # self.encoder is ever created in this model (only linamr_encoder and
        # src_encoder), so this branch raised AttributeError. Both encoders
        # see the same batch; use the src encoder's batch size.
        batch_size = self.src_encoder.batch_size
        new_c = tf.zeros([batch_size, options.gen_hidden_size])
        new_h = tf.zeros([batch_size, options.gen_hidden_size])
    else:
        assert False, 'way to initial decoder (%s) not supported' % options.way_init_decoder
    self.init_decoder_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)

    # Mask over gold-answer positions: [batch_size, gen_steps]
    loss_weights = tf.sequence_mask(self.answer_len, options.max_answer_len, dtype=tf.float32)

    with variable_scope.variable_scope("generator"):
        # create generator
        self.generator = generator_utils.CovAttenGen(self, options, word_vocab_dec,
                is_training=is_training)
        # Pre-compute attention features once per encoder, under separate
        # scopes so the two projections get distinct variables.
        with variable_scope.variable_scope("encoder_feats"):
            self.linamr_features = self.generator.calculate_encoder_features(
                    self.linamr_hiddens, self.linamr_hidden_dim)
        with variable_scope.variable_scope("src_feats"):
            self.src_features = self.generator.calculate_encoder_features(
                    self.src_hiddens, self.src_hidden_dim)

        if mode == 'decode':
            # Placeholders for feeding back one step of beam-search state
            # during incremental decoding.
            self.context_encoder_t_1 = tf.placeholder(tf.float32, [None, self.linamr_hidden_dim],
                    name='context_encoder_t_1')  # [batch_size, encoder_dim]
            self.context_src_t_1 = tf.placeholder(tf.float32, [None, self.src_hidden_dim],
                    name='context_src_t_1')  # [batch_size, src_dim]
            if options.use_coverage:
                self.coverage_t_1 = tf.placeholder(tf.float32, [None, None],
                        name='coverage_t_1')  # [batch_size, encoder_dim]
            else:
                self.coverage_t_1 = None
            self.word_t = tf.placeholder(tf.int32, [None], name='word_t')  # [batch_size]

            (self.state_t, self.context_encoder_t, self.context_src_t, self.coverage_t,
             self.attn_dist_t, self.ouput_t, self.topk_log_probs, self.topk_ids,
             self.greedy_prediction, self.multinomial_prediction) = \
                self.generator.decode_mode(
                    word_vocab_dec, options.beam_size, self.init_decoder_state,
                    self.context_encoder_t_1, self.context_src_t_1,
                    self.coverage_t_1, self.word_t,
                    self.linamr_hiddens, self.linamr_features, self.linamr_mask,
                    self.src_hiddens, self.src_features, self.src_mask)
            # not building training op for this mode
            return
        elif mode == 'evaluate_bleu':
            _, _, self.greedy_words = self.generator.train_mode(word_vocab_dec,
                    self.linamr_hidden_dim, self.linamr_hiddens, self.linamr_features,
                    self.linamr_mask,
                    self.src_hidden_dim, self.src_hiddens, self.src_features, self.src_mask,
                    self.init_decoder_state, self.answer_inp, self.answer_ref, loss_weights,
                    mode_gen='greedy')
            # not building training op for this mode
            return
        elif mode in ('ce_train', 'evaluate', ):
            self.accu, self.loss, _ = self.generator.train_mode(word_vocab_dec,
                    self.linamr_hidden_dim, self.linamr_hiddens, self.linamr_features,
                    self.linamr_mask,
                    self.src_hidden_dim, self.src_hiddens, self.src_features, self.src_mask,
                    self.init_decoder_state, self.answer_inp, self.answer_ref, loss_weights,
                    mode_gen='ce_loss')
            if mode == 'evaluate':
                return  # not building training op for evaluation

    # NOTE(review): any mode other than those above (e.g. 'rl_train') falls
    # through to here without self.loss defined — confirm such modes are
    # never passed to this constructor.
    with tf.device('/gpu:1'):
        if options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=options.learning_rate)
        elif options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=options.learning_rate)
        else:
            # BUGFIX: previously an unknown optimize_type left `optimizer`
            # unbound and crashed later with an opaque NameError.
            assert False, 'unsupported optimize_type: %s' % options.optimize_type
        # BUGFIX: dict.has_key() was removed in Python 3; getattr with a
        # default is equivalent and works on both Python 2 and 3.
        clipper = getattr(options, "max_gradient_norm", 50)
        print("MAX gradient norm {}".format(clipper))
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2-regularize weight matrices only (ndims > 1 skips bias vectors).
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    extra_train_ops = []
    train_ops = [self.train_op] + extra_train_ops
    self.train_op = tf.group(*train_ops)