def __init__(self, model_file_path):
    """Prepare a decode run for the checkpoint at *model_file_path*.

    Creates the decode output tree (decode dir plus ROUGE reference and
    decoded-output subdirectories), loads the vocabulary, starts a
    single-pass decode batcher, and loads the model in eval mode.

    :param model_file_path: path to the saved model checkpoint
    """
    model_name = os.path.basename(model_file_path)
    self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
    self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        # makedirs(exist_ok=True) replaces the racy exists()/mkdir() pair and
        # also succeeds when config.log_root itself does not exist yet.
        os.makedirs(p, exist_ok=True)
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                           batch_size=config.beam_size, single_pass=True)
    # Give the batcher's background threads time to fill their queues.
    time.sleep(15)
    self.model = Model(model_file_path, is_eval=True)
def train_action(opt, logger, writer, train_num):
    """Log the training configuration, build the vocabulary, and run training.

    :param opt: parsed options; ``rl_weight`` is derived as ``1 - mle_weight``
        and ``load_model`` is rewritten to include the embedding type
    :param logger: logger used for all status output
    :param writer: summary writer forwarded to the Train processor
    :param train_num: run identifier forwarded to the Train processor
    """
    try:
        opt.rl_weight = 1 - opt.mle_weight
        if opt.load_model:
            # Checkpoints are stored under a per-embedding-type subdirectory.
            opt.load_model = "/%s/%s" % (opt.word_emb_type, opt.load_model)
        logger.info(u'------Training Setting--------')
        # Fixed "Traing" typo in the log message.
        logger.info("Training Type :%s" % (config.data_type))
        # NOTE(review): `== True` only matches real booleans; a truthy string
        # such as "yes" would not fire these branches — confirm intent.
        if opt.train_mle == True:
            logger.info("Training mle: %s, mle weight: %.2f" %
                        (opt.train_mle, opt.mle_weight))
        if opt.train_rl == True:
            logger.info("Training rl: %s, rl weight: %.2f \n" %
                        (opt.train_rl, opt.rl_weight))
        if opt.word_emb_type == 'bert':
            config.emb_dim = 768  # hidden size used by this embedding type
        if opt.pre_train_emb:
            logger.info('use pre_train_%s vocab_size %s \n' %
                        (opt.word_emb_type, config.vocab_size))
        else:
            logger.info('use %s vocab_size %s \n' %
                        (opt.word_emb_type, config.vocab_size))
        logger.info("intra_encoder: %s intra_decoder: %s \n" %
                    (config.intra_encoder, config.intra_decoder))
        if opt.word_emb_type in ['word2Vec', 'glove', 'FastText']:
            # Pre-trained embeddings ship their own vocabulary file.
            config.vocab_path = config.Data_path + "Embedding/%s/word.vocab" % (
                opt.word_emb_type)
        # config.vocab_size = len(open(config.vocab_path).readlines())
        vocab = Vocab(config.vocab_path, config.vocab_size)
        train_processor = Train(opt, vocab, logger, writer, train_num)
        train_processor.trainIters()
    except KeyError as e:
        print(e)
        # Renamed from `traceback` so the stdlib traceback module is not shadowed.
        tb = sys.exc_info()[2]
        logger.error(sys.exc_info())
        logger.error(tb.tb_lineno)
        logger.error(e)
    logger.info(u'------Training END--------')
def __init__(self, model_file_or_model, vocab=None):
    """Set up evaluation from either a checkpoint path or a built Model.

    :param model_file_or_model: checkpoint path (str) or a Model instance
    :param vocab: optional pre-built Vocab; built from config when omitted
    :raises ValueError: if *model_file_or_model* is neither str nor Model
    """
    # Resolve the vocabulary: build from config unless one was handed in.
    if vocab is None:
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
    else:
        assert isinstance(vocab, Vocab)
        self.vocab = vocab
    # Single-pass batcher over the evaluation split.
    self.batcher = Batcher(config.eval_data_path,
                           self.vocab,
                           mode='eval',
                           batch_size=config.batch_size,
                           single_pass=True)
    # Give the batcher's background threads time to fill their queues.
    time.sleep(15)
    # Accept either a checkpoint path or an already-constructed model.
    if isinstance(model_file_or_model, str):
        self.model = Model(device, model_file_or_model, is_eval=True)
    elif isinstance(model_file_or_model, Model):
        self.model = model_file_or_model
    else:
        raise ValueError("Cannot build model from type %s" %
                         type(model_file_or_model))
def __init__(self, model_file_path, vocab=None):
    """Set up evaluation for the checkpoint at *model_file_path*.

    :param model_file_path: path to the saved model checkpoint
    :param vocab: optional pre-built Vocab; built from config when omitted
    """
    # PEP 8: compare against None with `is not`, not `!=`.
    if vocab is not None:
        self.vocab = vocab
    else:
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                           batch_size=config.batch_size, single_pass=False)
    # Give the batcher's background threads time to fill their queues.
    time.sleep(15)
    model_name = os.path.basename(model_file_path)
    eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
    # Race-free directory creation; also tolerates a missing log_root.
    os.makedirs(eval_dir, exist_ok=True)
    self.summary_writer = tf.compat.v1.summary.FileWriter(eval_dir)
    self.model = Model(model_file_path, is_eval=True)
def __init__(self):
    """Prepare a training run: vocabulary, cycling batcher, log directories,
    and a TensorFlow summary writer."""
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path,
                           self.vocab,
                           mode='train',
                           batch_size=config.batch_size,
                           single_pass=False)
    # Let the batcher's feeder threads warm up before training starts.
    time.sleep(15)
    # Timestamped run directory under the log root, with a model/ subdir.
    run_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    if not os.path.exists(run_dir):
        os.mkdir(run_dir)
    self.model_dir = os.path.join(run_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
    self.summary_writer = tf.summary.FileWriter(run_dir)
def __init__(self):
    """Build the training pipeline: vocabulary, batcher, run directories,
    and a summary writer."""
    # Vocab(vocab_path="xxx/finished_files/vocab", vocab_size=50000) exposes
    # the `_word_to_id` / `_id_to_word` mappings used throughout the code.
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    # Batcher streams chunked training data ("xxx/finished_files/chunked/train_*")
    # in 'train' mode and cycles forever (single_pass=False).
    self.batcher = Batcher(config.train_data_path,
                           self.vocab,
                           mode='train',
                           batch_size=config.batch_size,
                           single_pass=False)
    time.sleep(15)  # let the batcher's background threads fill their queues
    stamp_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    if not os.path.exists(stamp_dir):
        os.mkdir(stamp_dir)
    self.model_dir = os.path.join(stamp_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
    self.summary_writer = tf.summary.FileWriter(stamp_dir)
def __init__(self, model_file_path=None):
    """Start (or resume) a training run.

    When resuming from a checkpoint, the run directory is recovered from
    the checkpoint path instead of creating a fresh timestamped one.

    :param model_file_path: optional checkpoint path to resume from
    """
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path,
                           self.vocab,
                           mode='train',
                           batch_size=config.batch_size,
                           single_pass=False)
    time.sleep(15)  # allow the batcher's feeder threads to warm up
    if model_file_path:
        # Resume: strip the trailing "/model/model..." suffix to recover
        # the original run directory.
        run_dir = re.sub('/model/model.*', '', model_file_path)
    else:
        # Fresh run: timestamped directory under the log root.
        run_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(run_dir):
            os.mkdir(run_dir)
    self.model_dir = os.path.join(run_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
    self.summary_writer = tf.summary.create_file_writer(run_dir)
def __init__(self, args, model_name=None):
    """Set up training: vocabulary, train/eval batchers, run directories.

    :param args: namespace with data paths, batch_size, and optional
        vocab_path (falls back to config.vocab_path when None)
    :param model_name: optional run-directory name; a timestamped name is
        used when omitted
    """
    self.args = args
    vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
    # (removed leftover debug prints of args.vocab_path / vocab)
    self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
    self.batcher = Batcher(args.train_data_path, self.vocab, mode='train',
                           batch_size=args.batch_size, single_pass=False,
                           args=args)
    self.eval_batcher = Batcher(args.eval_data_path, self.vocab, mode='eval',
                                batch_size=args.batch_size, single_pass=True,
                                args=args)
    # Give the batchers' background threads time to fill their queues.
    time.sleep(15)
    if model_name is None:
        self.train_dir = os.path.join(config.log_root,
                                      'train_%d' % (int(time.time())))
    else:
        self.train_dir = os.path.join(config.log_root, model_name)
    # Race-free creation of the run directory and its model/ subdir.
    os.makedirs(self.train_dir, exist_ok=True)
    self.model_dir = os.path.join(self.train_dir, 'model')
    os.makedirs(self.model_dir, exist_ok=True)
print("Finished writing vocab file") # In[51]: word = wvmodel.wv.index2entity[25] vector = wvmodel.wv.vectors[25] print(word) # print(vector) # In[52]: import torch from data_util.data import Vocab vocab_size = len(wvmodel.vocab) + 1 vocab = Vocab('Embedding/category/word2Vec/word.vocab', vocab_size) embed_size = 300 weight = torch.zeros(vocab_size, embed_size) for i in range(len(vocab._id_to_word.keys())): try: vocab_word = vocab._id_to_word[i + 4] w2vec_word = w2vec.wv.index2entity[i] except Exception as e: continue if i + 4 > vocab_size: break # print(vocab_word,w2vec_word) weight[i + 4, :] = torch.from_numpy(w2vec.wv.vectors[i]) embedding = torch.nn.Embedding.from_pretrained(weight)
def __init__(self, vocab_path, model_path, model):
    """Hold a vocabulary, a checkpoint path, and a pre-built model.

    :param vocab_path: path to the vocabulary file
    :param model_path: path to the model checkpoint
    :param model: an already-constructed model instance
    """
    # Load the vocabulary once; the model itself is supplied pre-built.
    self.vocab = Vocab(vocab_path)
    self.model_path = model_path
    self.model = model
class Train(object):
    """Trainer mixing maximum-likelihood (MLE) and self-critical
    reinforcement-learning (RL) objectives for abstractive summarization,
    following https://arxiv.org/pdf/1705.04304.pdf.
    """

    def __init__(self, opt):
        # Vocabulary and an endlessly-cycling training batcher.
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        self.opt = opt
        # Cache the special-token ids used throughout decoding.
        self.start_id = self.vocab.word2id(data.START_DECODING)
        self.end_id = self.vocab.word2id(data.STOP_DECODING)
        self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        # Give the batcher's background threads time to fill their queues.
        time.sleep(5)

    def save_model(self, iter):
        """Checkpoint model + optimizer state; stores `iter + 1` so a
        resumed run continues at the next iteration."""
        save_path = config.save_model_path + "/%07d.tar" % iter
        T.save(
            {
                "iter": iter + 1,
                "model_dict": self.model.state_dict(),
                "trainer_dict": self.trainer.state_dict()
            }, save_path)

    def setup_train(self):
        """Build model + Adam optimizer, optionally restoring a checkpoint.

        :return: the iteration to resume from (0 for a fresh run)
        """
        self.model = Model()
        self.model = get_cuda(self.model)
        self.trainer = T.optim.Adam(self.model.parameters(), lr=config.lr)
        start_iter = 0
        if self.opt.load_model is not None:
            load_model_path = os.path.join(config.save_model_path,
                                           self.opt.load_model)
            checkpoint = T.load(load_model_path)
            start_iter = checkpoint["iter"]
            self.model.load_state_dict(checkpoint["model_dict"])
            self.trainer.load_state_dict(checkpoint["trainer_dict"])
            print("Loaded model at " + load_model_path)
        if self.opt.new_lr is not None:
            # Rebuild the optimizer with the override lr, discarding the
            # restored optimizer state.
            self.trainer = T.optim.Adam(self.model.parameters(),
                                        lr=self.opt.new_lr)
        return start_iter

    def train_batch_MLE(self, enc_out, enc_hidden, enc_padding_mask, ct_e,
                        extra_zeros, enc_batch_extend_vocab, batch):
        '''Compute the negative log-likelihood loss for the given batch.

        To reduce exposure bias, the previously *sampled* token is fed as
        decoder input with probability 0.25 instead of the ground-truth label.

        :param enc_out: encoder outputs for all time steps
            (batch_size, length_input_sequence, 2*hidden_size)
        :param enc_hidden: tuple (h, c) of final encoder states; each of
            shape (batch_size, hidden_size)
        :param enc_padding_mask: (batch_size, length_input_sequence) mask
            with 0 for pad tokens and 1 for others
        :param ct_e: encoder context vector for time_step=0
            (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
        :param extra_zeros: tensor used to extend the vocab distribution for
            the pointer mechanism
        :param enc_batch_extend_vocab: input batch that stores OOV ids
        :param batch: batch object
        :return: scalar mean MLE loss over the batch
        '''
        dec_batch, max_dec_len, dec_lens, target_batch = get_dec_data(
            batch)  # Get input and target batches for training the decoder
        step_losses = []
        s_t = (enc_hidden[0], enc_hidden[1])  # Decoder hidden states (h, c)
        x_t = get_cuda(T.LongTensor(len(enc_out)).fill_(
            self.start_id))  # First decoder input: [START] for every example
        prev_s = None  # Used for intra-decoder attention (section 2.2 in https://arxiv.org/pdf/1705.04304.pdf)
        sum_temporal_srcs = None  # Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
        for t in range(min(max_dec_len, config.max_dec_steps)):
            use_gound_truth = get_cuda((T.rand(len(enc_out)) > 0.25)).long(
            )  # Per-example flag: 1 -> feed ground truth, 0 -> feed own previous sample
            x_t = use_gound_truth * dec_batch[:, t] + (
                1 - use_gound_truth
            ) * x_t  # Mix teacher forcing with the model's previous sample
            x_t = self.model.embeds(x_t)
            final_dist, s_t, ct_e, sum_temporal_srcs, prev_s = self.model.decoder(
                x_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
                enc_batch_extend_vocab, sum_temporal_srcs, prev_s)
            target = target_batch[:, t]
            log_probs = T.log(final_dist + config.eps)  # eps guards log(0)
            step_loss = F.nll_loss(log_probs,
                                   target,
                                   reduction="none",
                                   ignore_index=self.pad_id)
            step_losses.append(step_loss)
            x_t = T.multinomial(final_dist, 1).squeeze(
            )  # Sample words from final distribution which can be used as input in next time step
            is_oov = (x_t >= config.vocab_size
                      ).long()  # Mask indicating whether sampled word is OOV
            x_t = (1 - is_oov) * x_t.detach() + (
                is_oov) * self.unk_id  # Replace OOVs with [UNK] token
        losses = T.sum(
            T.stack(step_losses, 1), 1
        )  # Unnormalized loss for each example in the batch; (batch_size,)
        batch_avg_loss = losses / dec_lens  # Length-normalized losses; (batch_size,)
        mle_loss = T.mean(batch_avg_loss)  # Average batch loss
        return mle_loss

    def train_batch_RL(self, enc_out, enc_hidden, enc_padding_mask, ct_e,
                       extra_zeros, enc_batch_extend_vocab, article_oovs,
                       greedy):
        '''Generate sentences from the decoder entirely from sampled tokens.
        The resulting sentences are used for ROUGE-based reward evaluation.

        :param enc_out: encoder outputs for all time steps
            (batch_size, length_input_sequence, 2*hidden_size)
        :param enc_hidden: tuple (h, c) of final encoder states; each of
            shape (batch_size, hidden_size)
        :param enc_padding_mask: (batch_size, length_input_sequence) mask
            with 0 for pad tokens and 1 for others
        :param ct_e: encoder context vector for time_step=0
            (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
        :param extra_zeros: tensor used to extend the vocab distribution for
            the pointer mechanism
        :param enc_batch_extend_vocab: input batch that stores OOV ids
        :param article_oovs: batch containing the list of OOVs per example
        :param greedy: if True, greedy (argmax) sampling; else multinomial
        :return: (decoded_strs, log_probs) — decoded sentences and, for
            multinomial sampling, per-sentence normalized log probabilities
        '''
        s_t = enc_hidden  # Decoder hidden states
        x_t = get_cuda(T.LongTensor(len(enc_out)).fill_(
            self.start_id))  # First decoder input: [START] for every example
        prev_s = None  # Used for intra-decoder attention (section 2.2 in https://arxiv.org/pdf/1705.04304.pdf)
        sum_temporal_srcs = None  # Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
        inds = []  # Stores sampled indices for each time step
        decoder_padding_mask = []  # Stores padding masks of generated samples
        log_probs = []  # Stores log probabilites of generated samples
        mask = get_cuda(
            T.LongTensor(len(enc_out)).fill_(1)
        )  # 1 => [STOP] not yet emitted for this example, 0 otherwise
        for t in range(config.max_dec_steps):
            x_t = self.model.embeds(x_t)
            probs, s_t, ct_e, sum_temporal_srcs, prev_s = self.model.decoder(
                x_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
                enc_batch_extend_vocab, sum_temporal_srcs, prev_s)
            if greedy is False:
                multi_dist = Categorical(probs)
                x_t = multi_dist.sample()  # perform multinomial sampling
                log_prob = multi_dist.log_prob(x_t)
                log_probs.append(log_prob)
            else:
                _, x_t = T.max(probs, dim=1)  # perform greedy sampling
            x_t = x_t.detach()
            inds.append(x_t)
            mask_t = get_cuda(T.zeros(
                len(enc_out)))  # Padding mask of batch for current time step
            mask_t[
                mask == 1] = 1  # If [STOP] is not encountered till previous time step, mask_t = 1 else mask_t = 0
            # Boolean-sum trick: both conditions hold exactly when the sum is 2.
            mask[
                (mask == 1) + (x_t == self.end_id) == 2] = 0  # If [STOP] is first emitted now, clear the mask from the next step on
            decoder_padding_mask.append(mask_t)
            is_oov = (x_t >= config.vocab_size
                      ).long()  # Mask indicating whether sampled word is OOV
            x_t = (1 - is_oov) * x_t + (
                is_oov) * self.unk_id  # Replace OOVs with [UNK] token
        inds = T.stack(inds, dim=1)
        decoder_padding_mask = T.stack(decoder_padding_mask, dim=1)
        if greedy is False:  # If multinomial based sampling, compute log probabilites of sampled words
            log_probs = T.stack(log_probs, dim=1)
            log_probs = log_probs * decoder_padding_mask  # Not considering sampled words with padding mask = 0
            lens = T.sum(decoder_padding_mask,
                         dim=1)  # Length of sampled sentence
            log_probs = T.sum(
                log_probs, dim=1
            ) / lens  # (bs,) # compute normalized log probability of a sentence
        decoded_strs = []
        for i in range(len(enc_out)):
            id_list = inds[i].cpu().numpy()
            oovs = article_oovs[i]
            S = data.outputids2words(
                id_list, self.vocab,
                oovs)  # Generate sentence corresponding to sampled words
            try:
                end_idx = S.index(data.STOP_DECODING)
                S = S[:end_idx]  # Truncate at the first [STOP] token
            except ValueError:
                S = S  # No [STOP] emitted: keep the full sequence
            if len(
                    S
            ) < 2:  # If the sentence has fewer than 2 words, replace it with
                # "xxx"; avoids sentences like "." which make ROUGE throw
                S = ["xxx"]
            S = " ".join(S)
            decoded_strs.append(S)
        return decoded_strs, log_probs

    def reward_function(self, decoded_sents, original_sents):
        """ROUGE-L F1 reward for each decoded/reference sentence pair.

        Falls back to per-pair scoring (with a 0.0 score for pairs that still
        fail) when batch scoring raises.

        :return: FloatTensor of per-example ROUGE-L F1 scores (on GPU if used)
        """
        rouge = Rouge()
        try:
            scores = rouge.get_scores(decoded_sents, original_sents)
        except Exception:
            print(
                "Rouge failed for multi sentence evaluation.. Finding exact pair"
            )
            scores = []
            for i in range(len(decoded_sents)):
                try:
                    score = rouge.get_scores(decoded_sents[i],
                                             original_sents[i])
                except Exception:
                    print("Error occured at:")
                    print("decoded_sents:", decoded_sents[i])
                    print("original_sents:", original_sents[i])
                    # Neutral reward for the failing pair.
                    score = [{"rouge-l": {"f": 0.0}}]
                scores.append(score[0])
        rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
        rouge_l_f1 = get_cuda(T.FloatTensor(rouge_l_f1))
        return rouge_l_f1

    # def write_to_file(self, decoded, max, original, sample_r, baseline_r, iter):
    #     with open("temp.txt", "w") as f:
    #         f.write("iter:"+str(iter)+"\n")
    #         for i in range(len(original)):
    #             f.write("dec: "+decoded[i]+"\n")
    #             f.write("max: "+max[i]+"\n")
    #             f.write("org: "+original[i]+"\n")
    #             f.write("Sample_R: %.4f, Baseline_R: %.4f\n\n"%(sample_r[i].item(), baseline_r[i].item()))

    def train_one_batch(self, batch, iter):
        """Run one optimization step on *batch* using the weighted
        MLE + self-critical RL objective.

        :return: (mle_loss_value, batch_reward) as plain Python numbers
        """
        enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, context = get_enc_data(
            batch)
        enc_batch = self.model.embeds(
            enc_batch)  # Get embeddings for encoder input
        enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
        # -------------------------------Summarization-----------------------
        if self.opt.train_mle == "yes":  # perform MLE training
            mle_loss = self.train_batch_MLE(enc_out, enc_hidden,
                                            enc_padding_mask, context,
                                            extra_zeros,
                                            enc_batch_extend_vocab, batch)
        else:
            mle_loss = get_cuda(T.FloatTensor([0]))
        # --------------RL training-----------------------------------------------------
        if self.opt.train_rl == "yes":  # perform reinforcement learning training
            # multinomial sampling (with gradients, for the policy term)
            sample_sents, RL_log_probs = self.train_batch_RL(
                enc_out,
                enc_hidden,
                enc_padding_mask,
                context,
                extra_zeros,
                enc_batch_extend_vocab,
                batch.art_oovs,
                greedy=False)
            with T.autograd.no_grad():
                # greedy sampling serves as the self-critical baseline
                greedy_sents, _ = self.train_batch_RL(enc_out,
                                                      enc_hidden,
                                                      enc_padding_mask,
                                                      context,
                                                      extra_zeros,
                                                      enc_batch_extend_vocab,
                                                      batch.art_oovs,
                                                      greedy=True)
            sample_reward = self.reward_function(sample_sents,
                                                 batch.original_abstracts)
            baseline_reward = self.reward_function(greedy_sents,
                                                   batch.original_abstracts)
            # if iter%200 == 0:
            #     self.write_to_file(sample_sents, greedy_sents, batch.original_abstracts, sample_reward, baseline_reward, iter)
            rl_loss = -(
                sample_reward - baseline_reward
            ) * RL_log_probs  # Self-critic policy gradient training (eq 15 in https://arxiv.org/pdf/1705.04304.pdf)
            rl_loss = T.mean(rl_loss)
            batch_reward = T.mean(sample_reward).item()
        else:
            rl_loss = get_cuda(T.FloatTensor([0]))
            batch_reward = 0
        # ------------------------------------------------------------------------------------
        self.trainer.zero_grad()
        (self.opt.mle_weight * mle_loss +
         self.opt.rl_weight * rl_loss).backward()
        self.trainer.step()
        return mle_loss.item(), batch_reward

    def trainIters(self):
        """Main training loop: iterate batches up to config.max_iterations,
        logging averaged loss/reward every 50 iterations and checkpointing
        every 5000."""
        # NOTE: `iter` shadows the builtin here; kept for compatibility.
        iter = self.setup_train()
        count = mle_total = r_total = 0
        while iter <= config.max_iterations:
            batch = self.batcher.next_batch()
            try:
                mle_loss, r = self.train_one_batch(batch, iter)
            except KeyboardInterrupt:
                print(
                    "-------------------Keyboard Interrupt------------------")
                exit(0)
            mle_total += mle_loss
            r_total += r
            count += 1
            iter += 1
            if iter % 50 == 0:
                mle_avg = mle_total / count
                r_avg = r_total / count
                logger.info("iter:" + str(iter) + " mle_loss:" +
                            "%.3f" % mle_avg + " reward:" + "%.4f" % r_avg)
                count = mle_total = r_total = 0
            if iter % 5000 == 0:
                self.save_model(iter)
class Validate(object):
    """Runs beam-search decoding over a validation split and reports the
    corpus-average ROUGE-L F1 for a saved checkpoint."""

    def __init__(self, data_path, batch_size=config.batch_size):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # Single-pass eval batcher: next_batch() returns None when exhausted.
        self.batcher = Batcher(data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=batch_size,
                               single_pass=True)
        # Give the batcher's background threads time to fill their queues.
        time.sleep(5)

    def setup_valid(self):
        """Build the model and load weights from config.load_model_path."""
        self.model = Model()
        self.model = get_cuda(self.model)
        checkpoint = T.load(config.load_model_path)
        self.model.load_state_dict(checkpoint["model_dict"])

    def validate_batch(self):
        """Decode every validation batch with beam search and print the
        checkpoint name with its average ROUGE-L F1 score."""
        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(data.START_DECODING)
        end_id = self.vocab.word2id(data.STOP_DECODING)
        unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        decoded_sents = []
        original_sents = []
        rouge = Rouge()
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, c_t_1 = get_enc_data(
                batch)
            with T.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
            with T.autograd.no_grad():
                pred_ids = beam_search_on_batch(enc_hidden, enc_out,
                                                enc_padding_mask, c_t_1,
                                                extra_zeros,
                                                enc_batch_extend_vocab,
                                                self.model, start_id, end_id,
                                                unk_id)
            for i in range(len(pred_ids)):
                decoded_words = data.outputids2words(pred_ids[i], self.vocab,
                                                     batch.art_oovs[i])
                if len(decoded_words) < 2:
                    # Too-short outputs (e.g. ".") make the ROUGE scorer throw.
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                tar = batch.original_abstracts[i]
                original_sents.append(tar)
            batch = self.batcher.next_batch()
        load_file = config.load_model_path.split("/")[-1]
        scores = rouge.get_scores(decoded_sents, original_sents, avg=True)
        rouge_l = scores["rouge-l"]["f"]
        print(load_file, "rouge_l:", "%.4f" % rouge_l)
from data_util.batcher import Batcher
from data_util.data import Vocab
import numpy as np
import json
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np  # NOTE(review): duplicate of the earlier numpy import
import os
import pandas as pd
import re
import seaborn as sns
#import nltk
#from nltk.corpus import stopwords

# Module-level pipeline objects.  NOTE(review): `config` is used below but is
# not imported in this visible chunk — presumably imported elsewhere; verify.
vocab = Vocab(config.vocab_path, config.vocab_size)
batcher = Batcher(config.train_data_path,
                  vocab,
                  mode='train',
                  batch_size=config.batch_size,
                  single_pass=False)
batches = 1


def google_encoder_metric(abstract_sents, article_sents):
    """Embed sentences with the Universal Sentence Encoder (TF Hub).

    NOTE(review): the visible body only initializes the session; the actual
    similarity computation appears truncated in this chunk — confirm against
    the full file before relying on this function.
    """
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
    rotation = 90  # presumably a plot label rotation for the heatmap below
    # Flatten the nested list of article sentences into a single list.
    flatten = lambda l: [item for article_sents in l for item in article_sents]
    article_sentences = flatten(article_sents)
    with tf.Session() as sess:
        # TF1-style initialization required by hub.Module.
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        #article_sentences = article_sentences[:3]
        #abstract_sents = abstract_sents[:2]
class BeamSearch(object):
    """Beam-search decoder for a saved checkpoint: decodes the test split,
    writes reference/decoded files for ROUGE, and runs ROUGE evaluation."""

    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        # Output tree: decode dir plus ROUGE reference/decoded subdirs.
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # batch_size=beam_size: each batch repeats one example per beam.
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        # Give the batcher's background threads time to fill their queues.
        time.sleep(15)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        """Sort hypotheses by average log probability, best first."""
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        """Decode the whole dataset, write ROUGE files, then run ROUGE."""
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            with torch.no_grad():
                best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            print("===============SUMMARY=============")
            print(' '.join(decoded_words))
            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        """Run beam search on one example and return the best hypothesis.

        :param batch: batch should have only one example (repeated beam_size
            times by the decode-mode batcher)
        :return: the highest-scoring completed (or partial) hypothesis
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0, sent_lens = \
            get_input_from_batch(batch, use_cuda)
        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        if config.use_maxpool_init_ctx:
            c_t_0 = max_encoder_output

        gamma = None
        if config.is_sentence_filtering:
            # Sentence-level gating over encoder outputs.
            gamma, sent_dists = self.model.sentence_filterer(
                encoder_outputs, sent_lens)
        # Hierarchical step: section-level encoding of the reduced state.
        section_outputs, section_hidden = self.model.section_encoder(s_t_0)
        s_t_0 = self.model.section_reduce_state(section_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            # Map extended-vocab (OOV) ids back to [UNK] for the embedding lookup.
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            # Collect every beam's state/context into batched tensors.
            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)
            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0),
                     torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, section_outputs,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, gamma)

            # Keep 2*beam_size candidates per beam so enough survive the
            # [STOP]-filtering below.
            topk_log_probs, topk_ids = torch.topk(final_dist,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            # On the first step all beams are identical, so expand only one.
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size *
                               2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    # Completed hypotheses count only after min_dec_steps.
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break
            steps += 1

        if len(results) == 0:
            # Nothing finished: fall back to the best partial hypotheses.
            results = beams

        beams_sorted = self.sort_beams(results)
        return beams_sorted[0]
class Main(object):
    """Trainer that mixes maximum-likelihood and self-critical RL objectives,
    with batched beam-search decoding for periodic evaluation."""

    def __init__(self):
        # Vocabulary and training batcher (infinite stream: single_pass=False).
        self.vocab = Vocab(VOCAB_PATH, VOCAB_SIZE)
        self.batcher = Batcher(TRAIN_DATA_PATH, self.vocab, mode='train', batch_size=BATCH_SIZE, single_pass=False)
        # Cache the special-token ids used throughout decoding.
        self.start_id = self.vocab.word2id(data.START_DECODING)
        self.end_id = self.vocab.word2id(data.STOP_DECODING)
        self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        self.model = MyModel().to(DEVICE)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR)

    def beamSearch(self, enc_hid, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab):
        """Batched beam search: one Beam object per example; finished beams are
        dropped from the batch each step. Returns one word-id list per example."""
        batch_size = len(enc_hid[0])
        # beam_idx tracks which original examples are still being decoded.
        beam_idx = torch.LongTensor(list(range(batch_size)))
        beams = [Beam(self.start_id, self.end_id, self.unk_id, (enc_hid[0][i], enc_hid[1][i]), ct_e[i])
                 for i in range(batch_size)]
        n_rem = batch_size
        sum_exp = None   # accumulated attention scores (intra-temporal attention state)
        prev_s = None    # previous decoder states (intra-decoder attention state)
        for t in range(MAX_DEC_STEPS):
            # Gather decoder inputs from every still-active beam and flatten
            # (remaining(rem), beam) into one batch dimension.
            x_t = torch.stack(
                [beam.getTokens() for beam in beams if beam.done == False]
            ).contiguous().view(-1)
            x_t = self.model.embeds(x_t)
            dec_h = torch.stack(
                [beam.hid_h for beam in beams if beam.done == False]
            ).contiguous().view(-1, HIDDEN_DIM)
            dec_c = torch.stack(
                [beam.hid_c for beam in beams if beam.done == False]
            ).contiguous().view(-1, HIDDEN_DIM)
            ct_e = torch.stack(
                [beam.context for beam in beams if beam.done == False]
            ).contiguous().view(-1, 2 * HIDDEN_DIM)
            if sum_exp is not None:
                sum_exp = torch.stack(
                    [beam.sum_exp for beam in beams if beam.done == False]
                ).contiguous().view(-1, enc_out.size(1))
            if prev_s is not None:
                prev_s = torch.stack(
                    [beam.prev_s for beam in beams if beam.done == False]
                )
                # try:
                prev_s = prev_s.contiguous().view(-1, t + 1, HIDDEN_DIM)
                # print(prev_s.shape)
                # except:
                #     continue
            s_t = (dec_h, dec_c)
            # Tile encoder-side tensors so each beam slot sees its example's encoding.
            enc_out_beam = enc_out[beam_idx].view(n_rem, -1).repeat(1, BEAM_SIZE).view(-1, enc_out.size(1), enc_out.size(2))
            enc_pad_mask_beam = enc_padding_mask[beam_idx].repeat(1, BEAM_SIZE).view(-1, enc_padding_mask.size(1))
            extra_zeros_beam = None
            if extra_zeros is not None:
                extra_zeros_beam = extra_zeros[beam_idx].repeat(1, BEAM_SIZE).view(-1, extra_zeros.size(1))
            enc_extend_vocab_beam = enc_batch_extend_vocab[beam_idx].repeat(1, BEAM_SIZE).view(-1, enc_batch_extend_vocab.size(1))
            # print(enc_out_beam.shape)
            # try:
            final_dist, (dec_h, dec_c), ct_e, sum_exp, prev_s = self.model.decoder(
                x_t, s_t, enc_out_beam, enc_pad_mask_beam, ct_e,
                extra_zeros_beam, enc_extend_vocab_beam, sum_exp, prev_s)
            # except:
            #     continue
            # print(prev_s.shape)
            # Un-flatten back to (remaining, beam, ...) so each Beam can advance.
            final_dist = final_dist.view(n_rem, BEAM_SIZE, -1)
            dec_h = dec_h.view(n_rem, BEAM_SIZE, -1)
            dec_c = dec_c.view(n_rem, BEAM_SIZE, -1)
            ct_e = ct_e.view(n_rem, BEAM_SIZE, -1)
            if sum_exp is not None:
                sum_exp = sum_exp.view(n_rem, BEAM_SIZE, -1)  # rem, beam, n_seq
            if prev_s is not None:
                prev_s = prev_s.view(n_rem, BEAM_SIZE, -1, HIDDEN_DIM)  # rem, beam, t
                # print("prev_s",prev_s.shape)
            active = []
            for i in range(n_rem):
                b = beam_idx[i].item()
                beam = beams[b]
                if beam.done:
                    continue
                sum_exp_i = prev_s_i = None
                if sum_exp is not None:
                    sum_exp_i = sum_exp[i]
                if prev_s is not None:
                    prev_s_i = prev_s[i]
                beam.advance(final_dist[i], (dec_h[i], dec_c[i]), ct_e[i], sum_exp_i, prev_s_i)
                if beam.done == False:
                    active.append(b)
            # print(len(active))
            if len(active) == 0:
                break
            beam_idx = torch.LongTensor(active)
            n_rem = len(beam_idx)
        predicted_words = []
        for beam in beams:
            predicted_words.append(beam.getBest())
        # print(len(predicted_words[0]))
        return predicted_words

    # Fetch encoder-side tensors for a batch.
    def getEncData(self, batch):
        """Convert numpy batch fields to device tensors for the encoder,
        plus the pointer-generator extras (extended vocab, extra zeros)."""
        batch_size = len(batch.enc_lens)
        enc_batch = torch.from_numpy(batch.enc_batch).long()
        enc_padding_mask = torch.from_numpy(batch.enc_padding_mask).float()
        enc_lens = batch.enc_lens
        # Initial attention context is all zeros.
        ct_e = torch.zeros(batch_size, 2 * HIDDEN_DIM)
        enc_batch = enc_batch.to(DEVICE)
        enc_padding_mask = enc_padding_mask.to(DEVICE)
        ct_e = ct_e.to(DEVICE)
        enc_batch_extend_vocab = None
        if batch.enc_batch_extend_vocab is not None:
            enc_batch_extend_vocab = torch.from_numpy(batch.enc_batch_extend_vocab).long()
            enc_batch_extend_vocab = enc_batch_extend_vocab.to(DEVICE)
        extra_zeros = None
        if batch.max_art_oovs > 0:
            # Room in the output distribution for in-article OOV words.
            extra_zeros = torch.zeros(batch_size, batch.max_art_oovs)
            extra_zeros = extra_zeros.to(DEVICE)
        return enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e

    # Fetch decoder-side tensors for a batch.
    def getDecData(self, batch):
        """Convert numpy decoder inputs/targets to device tensors."""
        dec_batch = torch.from_numpy(batch.dec_batch).long()
        dec_lens = batch.dec_lens
        max_dec_len = np.max(dec_lens)
        dec_lens = torch.from_numpy(batch.dec_lens).float()
        target_batch = torch.from_numpy(batch.target_batch).long()
        dec_batch = dec_batch.to(DEVICE)
        dec_lens = dec_lens.to(DEVICE)
        target_batch = target_batch.to(DEVICE)
        return dec_batch, max_dec_len, dec_lens, target_batch

    # Maximum-likelihood training step.
    def trainMLEStep(self, batch):
        """One teacher-forced (with scheduled sampling) MLE step; returns the
        length-normalized mean NLL loss over the batch."""
        enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = self.getEncData(batch)
        # print(enc_lens)
        enc_batch = self.model.embeds(enc_batch)
        # Feed the embedded inputs straight into the encoder.
        enc_output, enc_hidden = self.model.encoder(enc_batch, enc_lens)
        # print(enc_output.shape)
        dec_batch, max_dec_len, dec_lens, target_batch = self.getDecData(batch)
        total_loss = 0
        s_t = (enc_hidden[0], enc_hidden[1])
        x_t = torch.LongTensor(len(enc_output)).fill_(self.start_id).to(DEVICE)
        # print(x_t.shape)
        prev_s = None
        sum_exp = None
        # Decode up to max_dec_len steps, one word per step.
        for t in range(min(max_dec_len, MAX_DEC_STEPS)):
            # print(max_dec_len)
            # Scheduled sampling: with prob ~0.75 use the ground-truth token,
            # otherwise keep the model's own previous sample.
            choice = (torch.rand(len(enc_output)) > 0.25).long().to(DEVICE)
            x_t = choice * dec_batch[:, t] + (1 - choice) * x_t
            x_t = self.model.embeds(x_t)
            # print(x_t.shape)
            final_dist, s_t, ct_e, sum_exp, prev_s = self.model.decoder(
                x_t, s_t, enc_output, enc_padding_mask, ct_e, extra_zeros,
                enc_batch_extend_vocab, sum_exp, prev_s)
            # print(prev_s.shape)
            target = target_batch[:, t]
            # print(target_batch.shape)
            # EPS guards log() against zero probabilities.
            log_probs = torch.log(final_dist + EPS)
            step_loss = F.nll_loss(log_probs, target, reduction="none", ignore_index=self.pad_id)
            total_loss = total_loss + step_loss
            # Sample the next input; map extended-vocab (OOV) ids back to UNK.
            x_t = torch.multinomial(final_dist, 1).squeeze()
            is_oov = (x_t >= VOCAB_SIZE).long()
            x_t = (1 - is_oov) * x_t.detach() + (is_oov) * self.unk_id
        batch_avg_loss = total_loss / dec_lens
        loss = torch.mean(batch_avg_loss)
        return loss

    # Reinforcement-learning rollout.
    def trainRLStep(self, enc_output, enc_hidden, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab, article_oovs, type):
        """Roll out one full decode per example, either multinomial sampling
        (type == "sample") or greedy argmax. Returns the decoded strings and,
        for the sampled rollout, the length-averaged log-probabilities."""
        s_t = enc_hidden
        x_t = torch.LongTensor(len(enc_output)).fill_(self.start_id).to(DEVICE)
        prev_s = None
        sum_exp = None
        inds = []
        decoder_padding_mask = []
        log_probs = []
        # mask[i] == 1 while example i has not yet emitted END.
        mask = torch.LongTensor(len(enc_output)).fill_(1).to(DEVICE)
        for t in range(MAX_DEC_STEPS):
            x_t = self.model.embeds(x_t)
            probs, s_t, ct_e, sum_exp, prev_s = self.model.decoder(
                x_t, s_t, enc_output, enc_padding_mask, ct_e, extra_zeros,
                enc_batch_extend_vocab, sum_exp, prev_s)
            if type == "sample":
                # Sample from the output distribution.
                multi_dist = Categorical(probs)
                # print(multi_dist)
                x_t = multi_dist.sample()
                # print(x_t.shape)
                log_prob = multi_dist.log_prob(x_t)
                log_probs.append(log_prob)
            else:
                # greedy sample
                _, x_t = torch.max(probs, dim=1)
            x_t = x_t.detach()
            inds.append(x_t)
            # mask_t marks which positions at step t are still before END.
            mask_t = torch.zeros(len(enc_output)).to(DEVICE)
            mask_t[mask == 1] = 1
            # Turn the running mask off once END is produced.
            mask[(mask == 1) + (x_t == self.end_id) == 2] = 0
            decoder_padding_mask.append(mask_t)
            # If the sampled id exceeds the fixed vocab (article OOV), feed UNK next.
            is_oov = (x_t >= VOCAB_SIZE).long()
            x_t = (1 - is_oov) * x_t + (is_oov) * self.unk_id
        inds = torch.stack(inds, dim=1)
        decoder_padding_mask = torch.stack(decoder_padding_mask, dim=1)
        if type == "sample":
            log_probs = torch.stack(log_probs, dim=1)
            # Zero-out log-probs past END (padding positions).
            log_probs = log_probs * decoder_padding_mask
            lens = torch.sum(decoder_padding_mask, dim=1)
            # Length-averaged log-probability (Eq. 15, log p term).
            log_probs = torch.sum(log_probs, dim=1) / lens
            # print(log_prob.shape)
        decoded_strs = []
        # Convert the output ids back into words.
        for i in range(len(enc_output)):
            id_list = inds[i].cpu().numpy()
            oovs = article_oovs[i]
            S = data.outputids2words(id_list, self.vocab, oovs)  # Generate sentence corresponding to sampled words
            try:
                end_idx = S.index(data.STOP_DECODING)
                S = S[:end_idx]
            except ValueError:
                S = S
            # ROUGE scoring needs a non-trivial string; use a placeholder otherwise.
            if len(S) < 2:
                S = ["xxx"]
            S = " ".join(S)
            decoded_strs.append(S)
        return decoded_strs, log_probs

    def rewardFunction(self, decoded_sents, original_sents):
        """Per-example ROUGE-L F1 between decoded and reference sentences."""
        rouge = Rouge()
        scores = rouge.get_scores(decoded_sents, original_sents)
        rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
        rouge_l_f1 = (torch.FloatTensor(rouge_l_f1)).to(DEVICE)
        return rouge_l_f1

    # Evaluate on the test set using beamSearch.
    def test(self):
        """Decode up to ~10 test batches with beam search and print accumulated
        ROUGE statistics."""
        # time.sleep(5)
        batcher = Batcher(TEST_DATA_PATH, self.vocab, mode='test', batch_size=BATCH_SIZE, single_pass=True)
        batch = batcher.next_batch()
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        count = 0
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = self.getEncData(batch)
            with torch.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
            with torch.autograd.no_grad():
                pred_ids = self.beamSearch(enc_hidden, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab)
            # print(len(pred_ids[0]))
            for i in range(len(pred_ids)):
                # print('t',pred_ids[i])
                decoded_words = data.outputids2words(pred_ids[i], self.vocab, batch.art_oovs[i])
                # print(decoded_words)
                if len(decoded_words) < 2:
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                abstract = batch.original_abstracts[i]
                article = batch.original_articles[i]
                ref_sents.append(abstract)
                article_sents.append(article)
            # print(decoded_sents)
            batch = batcher.next_batch()
            # NOTE(review): scores are recomputed over the cumulative lists each
            # iteration, and k*_sum is only seeded when count == 1 — if the loop
            # ends after a single batch (count still 0), the prints below raise
            # NameError. Verify intent before relying on fewer than 2 batches.
            scores = rouge.get_scores(decoded_sents, ref_sents, avg=True)
            # Accumulate the per-key score dicts.
            if count == 1:
                k0_sum = scores[KEYS[0]]
                k1_sum = scores[KEYS[1]]
                k2_sum = scores[KEYS[2]]
            if count > 1:
                k0_sum = dict(Counter(Counter(k0_sum) + Counter(scores[KEYS[0]])))
                k1_sum = dict(Counter(Counter(k1_sum) + Counter(scores[KEYS[1]])))
                k2_sum = dict(Counter(Counter(k2_sum) + Counter(scores[KEYS[2]])))
            if count == 10:
                break
            count += 1
            # print(scores)
        print(KEYS[0], end=' ')
        for k in k0_sum:
            print(k, k0_sum[k] / count, end=' ')
        print('\n')
        print(KEYS[1], end=' ')
        for k in k1_sum:
            print(k, k1_sum[k] / count, end=' ')
        print('\n')
        print(KEYS[2], end=' ')
        for k in k2_sum:
            print(k, k2_sum[k] / count, end=' ')
        print('\n')

    def train(self):
        """Main training loop: mixed MLE + self-critical RL objective
        (loss = LAMBDA * mle + (1 - LAMBDA) * rl), with periodic logging
        and test decoding."""
        iter = 1
        count = 0
        total_loss = 0
        total_reward = 0
        while iter <= MAX_ITERATIONS:
            batch = self.batcher.next_batch()
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, context = self.getEncData(batch)
            enc_batch = self.model.embeds(enc_batch)  # Get embeddings for encoder input
            enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
            # print(enc_out.shape)
            # MLE step re-encodes internally from the raw batch.
            mle_loss = self.trainMLEStep(batch)
            sample_sents, RL_log_probs = self.trainRLStep(
                enc_out, enc_hidden, enc_padding_mask, context, extra_zeros,
                enc_batch_extend_vocab, batch.art_oovs, "sample")
            with torch.autograd.no_grad():
                # greedy sampling provides the self-critical baseline
                greedy_sents, _ = self.trainRLStep(
                    enc_out, enc_hidden, enc_padding_mask, context, extra_zeros,
                    enc_batch_extend_vocab, batch.art_oovs, "greedy")
            sample_reward = self.rewardFunction(sample_sents, batch.original_abstracts)
            baseline_reward = self.rewardFunction(greedy_sents, batch.original_abstracts)
            # Self-critical policy-gradient loss (Eq. 15).
            rl_loss = -(sample_reward - baseline_reward) * RL_log_probs
            rl_loss = torch.mean(rl_loss)
            batch_reward = torch.mean(sample_reward).item()
            loss = LAMBDA * mle_loss + (1 - LAMBDA) * rl_loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            # Accumulate running batch loss.
            total_loss += loss.item()
            # Accumulate running batch reward.
            total_reward += batch_reward
            count += 1
            iter += 1
            if iter % PRINT_PER_ITER == 0:
                loss_avg = total_loss / count
                reward_avg = total_reward / count
                total_loss = 0
                print("iter:", iter, "loss:", "%.3f" % loss_avg, "reward:", "%.3f" % reward_avg)
                count = 0
            if iter % TEST_PER_ITER == 0:
                self.test()
class BeamSearch(object):
    """Beam-search decoder that reads pre-pickled batches and writes
    reference/decoded files for ROUGE evaluation."""

    def __init__(self, model_file_path):
        # Decode output goes under config.decode_dir/<model name without extension>.
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.decode_dir, model_name)
        self._decode_dir = os.path.splitext(self._decode_dir)[0]
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            Path(p).mkdir(parents=True, exist_ok=True)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
        #                        batch_size=config.beam_size, single_pass=True)
        # time.sleep(15)
        self.get_batches(config.decode_pk_path)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        """Sort hypotheses best-first by average log-probability."""
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def get_batches(self, path):
        """
        load batches dumped by pickle
        see batch_saver.py for more information
        """
        with open(path, 'rb') as f:
            batches = pickle.load(f, encoding="bytes")
        self.batches = batches
        print("loaded: {}".format(path))

    def if_already_exists(self, idx):
        """True when both the reference and decoded files for idx are on disk."""
        ref_file = os.path.join(self._rouge_ref_dir, "{}_reference.txt".format(idx))
        decoded_file = os.path.join(self._rouge_dec_dir, "{}_decoded.txt".format(idx))
        return os.path.isfile(ref_file) and os.path.isfile(decoded_file)

    def decode(self, file_id_start, file_id_stop):
        """Decode batches with ids in [file_id_start, file_id_stop), skipping
        ids already written; order is shuffled so parallel slow workers can
        pick up each other's leftovers."""
        if file_id_stop > MAX_TEST_ID:
            file_id_stop = MAX_TEST_ID
        # while batch is not None:
        # do this for faster stack CPU machines - to replace those that fail!!
        idx_list = [i for i in range(file_id_start, file_id_stop)]
        random.shuffle(idx_list)
        for idx in idx_list:
            # check if this is written already
            if self.if_already_exists(idx):
                # print("ID {} already exists".format(idx))
                continue
            # batch = self.batcher.next_batch()
            batch = self.batches[idx]

            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, idx,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            print("decoded idx = {}".format(idx))

        print("Finished decoding idx [{},{})".format(file_id_start, file_id_stop))
        # print("Starting ROUGE eval...")
        # results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        # rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        """Run beam search on a single-example batch and return the best
        hypothesis; supports both flat and hierarchical encoders."""
        # batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        if not config.is_hierarchical:
            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
                enc_batch, enc_lens)
            s_t_0 = self.model.reduce_state(encoder_hidden)
        else:
            # Sentence boundaries are located via the '.' token id.
            stop_id = self.vocab.word2id('.')
            enc_sent_pos = get_sent_position(enc_batch, stop_id)
            encoder_outputs, encoder_feature, encoder_hidden, sent_enc_outputs, sent_enc_feature, sent_enc_hidden, sent_enc_padding_mask = \
                self.model.encoder(enc_batch, enc_lens, enc_sent_pos)
            s_t_0, _ = self.model.reduce_state(encoder_hidden, sent_enc_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            # Map extended-vocab (article OOV) ids back to UNK before embedding.
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            # Re-pack each beam's decoder state and context into batched tensors.
            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)
            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            if not config.is_hierarchical:
                final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature,
                    enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage_t_1, steps)
            else:
                final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                    y_t_1, s_t_1, enc_sent_pos, encoder_outputs,
                    encoder_feature, enc_padding_mask, None, sent_enc_outputs,
                    sent_enc_feature, sent_enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage_t_1, steps)

            # Rank candidates by log-probability; keep 2*beam_size so enough
            # survive the STOP filtering below.
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            # On the first step all beams are identical, so only expand one.
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    # Finished hypotheses count only past the minimum length.
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break

            steps += 1

        # If nothing finished within max_dec_steps, fall back to unfinished beams.
        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
class Decoder(object):
    """Greedy/beam decoder over a plain encoder-decoder LSTM pair; writes one
    decoded sentence per input to a text file under args.logs/outputs."""

    def __init__(self):
        self.vocab = Vocab(args.vocab_path, args.vocab_size)
        self.batcher = Batcher(
            args.decode_data_path,
            self.vocab,
            mode='decode',
            batch_size=1,
            single_pass=True)  # support only 1 item at a time
        # Give the batcher's background threads time to fill the queue.
        time.sleep(15)
        vocab_size = self.vocab.size()
        self.beam_size = args.beam_size
        # self.bertClient = BertClient()
        self.encoder = EncoderLSTM(args.hidden_size, self.vocab.size())
        self.decoder = DecoderLSTM(args.hidden_size, self.vocab.size())
        if use_cuda:
            self.encoder = self.encoder.cuda()
            self.decoder = self.decoder.cuda()
        # Prepare the output folder and files
        output_dir = os.path.join(args.logs, "outputs")
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        output_file = os.path.join(output_dir, "decoder_{}.txt".format(args.output_name))
        # NOTE(review): this handle stays open for the object's lifetime and is
        # only flushed, never closed, in decode().
        self.file = open(output_file, "w+")

    def load_model(self, checkpoint_file):
        """Load encoder/decoder weights from a checkpoint and switch to eval mode."""
        print("Loading Checkpoint: ", checkpoint_file)
        checkpoint = torch.load(checkpoint_file)
        encoder_state_dict = checkpoint['encoder_state_dict']
        decoder_state_dict = checkpoint['decoder_state_dict']
        self.encoder.load_state_dict(encoder_state_dict)
        self.decoder.load_state_dict(decoder_state_dict)
        self.encoder.eval()
        self.decoder.eval()
        print("Weights Loaded")

    def decode(self):
        """Decode every batch from the batcher with a hand-rolled beam search
        and append each best sentence to the output file."""
        batch = self.batcher.next_batch()
        count = 0
        while batch is not None:
            # Preparing
            enc_batch, enc_padding_mask, enc_lens = get_input_from_batch(
                batch, use_cuda=use_cuda)
            dec_batch, target_batch, dec_padding_mask, max_dec_len = get_output_from_batch(
                batch, use_cuda=use_cuda)
            batch_size = len(enc_batch)
            proba_list = [0] * batch_size
            # NOTE(review): generated_list holds a single seed sequence — this
            # only lines up with batch_size == 1 (as the batcher enforces).
            generated_list = [[]]
            # Encoding sentences
            outputs, hidden_state = self.encoder(enc_batch)
            x = torch.LongTensor([2])  # start sequence
            if use_cuda:
                x = x.cuda()
            """ Normal Approach """
            #answers = torch.ones((batch_size, args.max_dec_steps), dtype=torch.long)
            #for t in range(max(args.max_dec_steps, max_dec_len)):
            #    output, hidden_state = self.decoder(x, hidden_state)  # Output: batch * vocab_size (prob.)
            #    idx = torch.argmax(output, dim=1)
            #    answers[:, t] = idx.detach()
            #    x = idx
            """ Beam Approach """
            for t in range(max(args.max_dec_steps, max_dec_len) - 1):
                output, hidden_state = self.decoder(x, hidden_state)
                # For each sentence, find b best answers (beam search)
                states = []  # (probab, generated, hidden_state_index)
                for i, each_decode in enumerate(output):
                    prev_proba = proba_list[i]
                    prev_generated = generated_list[i]
                    arr = each_decode.detach().cpu().numpy(
                    )  # log-probab of each word
                    # Indices of the beam_size highest-scoring words, best first.
                    indices = arr.argsort()[-self.beam_size:][::-1]  #index
                    for idx in indices:
                        proba = arr[idx] + proba_list[i]  # new probab and prev
                        generated = prev_generated.copy()
                        generated.append(idx)
                        states.append((proba, generated, i))
                # Sort for the best generated sequence among all
                # (the lambda's x shadows the decoder-input x only inside the key).
                states.sort(key=lambda x: x[0], reverse=True)
                # Variables
                new_proba_list = []
                new_generated = []
                new_hidden = torch.Tensor()
                new_cell = torch.Tensor()
                new_x = torch.LongTensor()
                if use_cuda:
                    new_hidden = new_hidden.cuda()
                    new_cell = new_cell.cuda()
                    new_x = new_x.cuda()
                # Select top b sequences
                for state in states[:self.beam_size]:
                    new_proba_list.append(state[0])
                    new_generated.append(state[1])
                    idx = state[2]
                    # Carry over the LSTM state of the beam this candidate came from.
                    h_0 = hidden_state[0].squeeze(0)[idx].unsqueeze(0)
                    c_0 = hidden_state[1].squeeze(0)[idx].unsqueeze(0)
                    new_hidden = torch.cat((new_hidden, h_0), dim=0)
                    new_cell = torch.cat((new_cell, c_0), dim=0)
                    generated_idx = torch.LongTensor([state[1][-1]])
                    if use_cuda:
                        generated_idx = generated_idx.cuda()
                    new_x = torch.cat((new_x, generated_idx))
                # Save the list
                proba_list = new_proba_list
                generated_list = new_generated
                hidden_state = (new_hidden.unsqueeze(0), new_cell.unsqueeze(0))
                x = new_x
            # Convert from id to word
            # answer = answers[0].numpy()
            answer = new_generated[0]
            sentence = ids2words(answer, self.vocab)
            self.file.write("{}\n".format(sentence))
            print("Writing line #{} to file ...".format(count + 1))
            self.file.flush()
            sys.stdout.flush()
            count += 1
            batch = self.batcher.next_batch()
def __init__(self, data_path, opt, batch_size = 1): self.vocab = Vocab(config.vocab_path, config.vocab_size) self.batcher = Batcher(data_path, self.vocab, mode='eval', batch_size=batch_size, single_pass=True) self.opt = opt
class Evaluate(object):
    """Evaluate a saved model on an eval/test set with beam search and report
    ROUGE scores."""

    def __init__(self, data_path, opt, batch_size=config.batch_size):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='eval',
                               batch_size=batch_size, single_pass=True)
        self.opt = opt
        # Give the batcher's background threads time to fill the queue.
        time.sleep(5)

    def setup_valid(self):
        """Instantiate the model and load the checkpoint named in opt.load_model."""
        self.model = Model()
        self.model = get_cuda(self.model)
        checkpoint = T.load(
            os.path.join(config.save_model_path, self.opt.load_model))
        self.model.load_state_dict(checkpoint["model_dict"])
        # save the light version of picked model
        '''
        print(config.save_model_path,' ','light_'+self.opt.load_model)
        save_path = config.save_model_path + '/light_'+self.opt.load_model
        print(save_path)
        T.save({
            "model_dict": self.model.state_dict(),
        }, save_path)
        exit()
        '''
        #-- end --

    def print_original_predicted(self, decoded_sents, ref_sents, article_sents,
                                 loadfile):
        """Dump article/reference/decoded triples to data/test_<model>.txt."""
        filename = "test_" + loadfile.split(".")[0] + ".txt"
        with open(os.path.join("data", filename), "w") as f:
            for i in range(len(decoded_sents)):
                f.write("article: " + article_sents[i] + "\n")
                f.write("ref: " + ref_sents[i] + "\n")
                f.write("dec: " + decoded_sents[i] + "\n\n")

    def evaluate_batch(self, print_sents=False):
        """Decode every batch with beam search, then print (and for eval runs,
        append to test_rg.txt) the ROUGE results."""
        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(data.START_DECODING)
        end_id = self.vocab.word2id(data.STOP_DECODING)
        unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = get_enc_data(
                batch)
            with T.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
            #-----------------------Summarization----------------------------------------------------
            with T.autograd.no_grad():
                pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask,
                                       ct_e, extra_zeros,
                                       enc_batch_extend_vocab, self.model,
                                       start_id, end_id, unk_id)
            for i in range(len(pred_ids)):
                decoded_words = data.outputids2words(pred_ids[i], self.vocab,
                                                     batch.art_oovs[i])
                # ROUGE needs a non-trivial string; substitute a placeholder.
                if len(decoded_words) < 2:
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                abstract = batch.original_abstracts[i]
                article = batch.original_articles[i]
                ref_sents.append(abstract)
                article_sents.append(article)
            batch = self.batcher.next_batch()
        load_file = self.opt.load_model
        if print_sents:
            self.print_original_predicted(decoded_sents, ref_sents,
                                          article_sents, load_file)
        scores = rouge.get_scores(decoded_sents, ref_sents, avg=True)
        if self.opt.task == "test":
            print(load_file, "scores:", scores)
        else:
            rouge_l = scores["rouge-l"]["f"]
            print(load_file, "rouge_l:", "%.4f" % rouge_l)
            with open("test_rg.txt", "a") as f:
                f.write("\n" + load_file + " - rouge_l: " + str(rouge_l))
                # f.close() is redundant inside the with-block but harmless.
                f.close()
class Evaluate(object):
    """Evaluate a saved model on up to 100 eval batches with beam search and
    report ROUGE scores."""

    def __init__(self, data_path, opt, batch_size=config.batch_size):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='eval',
                               batch_size=batch_size, single_pass=True)
        self.opt = opt
        # Give the batcher's background threads time to fill the queue.
        time.sleep(5)

    def setup_valid(self):
        """Instantiate the model and load the checkpoint saved during training."""
        self.model = Model()
        self.model = get_cuda(self.model)
        checkpoint = torch.load(os.path.join(config.save_model_path, self.opt.load_model))
        self.model.load_state_dict(checkpoint['model_dict'])

    def print_original_predicted(self, decoded_sents, ref_sents, article_sents, loadfile):
        """Dump article/reference/decoded triples to data/test_<model>.txt.

        Fix: the original built the name with
        ``'test_' + loadfile.split('.')[:-1] + '.txt'`` which concatenates a
        str with a list and raises TypeError (the author had flagged this
        line as suspect). Use the stem before the first '.', consistent with
        the sibling Evaluate classes in this file.
        """
        filename = 'test_' + loadfile.split('.')[0] + '.txt'
        with open(os.path.join('data', filename), 'w') as f:
            for i in range(len(decoded_sents)):
                f.write('article' + article_sents[i] + '\n')
                f.write('reference:' + ref_sents[i] + '\n')
                f.write('decoder:' + decoded_sents[i] + '\n')

    def evaluate_batch(self, print_sents=False):
        """Decode batches (capped at 100) with beam search, then print ROUGE.

        For opt.task == 'test' the full score dict is printed; otherwise only
        ROUGE-L F1 is printed.
        """
        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(data.START_DECODING)
        end_id = self.vocab.word2id(data.STOP_DECODING)
        unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        batch_number = 0
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, \
                extra_zeros, ct_e = get_enc_data(batch)
            with torch.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
            with torch.no_grad():
                pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask,
                                       ct_e, extra_zeros, enc_batch_extend_vocab,
                                       self.model, start_id, end_id, unk_id)
            for i in range(len(pred_ids)):
                # outputids2words returns a list of words for one prediction.
                decoded_words = data.outputids2words(pred_ids[i], self.vocab,
                                                     batch.art_oovs[i])
                # ROUGE needs a non-trivial string; substitute a placeholder.
                if len(decoded_words) < 2:
                    decoded_words = 'xxx'
                else:
                    decoded_words = ' '.join(decoded_words)
                decoded_sents.append(decoded_words)
                summary = batch.original_summarys[i]
                article = batch.original_articles[i]
                ref_sents.append(summary)
                article_sents.append(article)
            batch = self.batcher.next_batch()
            batch_number += 1
            # Cap evaluation at 100 batches.
            if batch_number >= 100:
                break
        load_file = self.opt.load_model
        if print_sents:
            self.print_original_predicted(decoded_sents, ref_sents,
                                          article_sents, load_file)
        scores = rouge.get_scores(decoded_sents, ref_sents, avg=True)
        if self.opt.task == 'test':
            print(load_file, 'scores:', scores)
            sys.stdout.flush()
        else:
            rouge_l = scores['rouge-l']['f']
            print(load_file, 'rouge-l:', '%.4f' % rouge_l)
# Sanity check: show the word stored at index 25 of the trained word2vec model.
word = wvmodel.wv.index2entity[25]
vector = wvmodel.wv.vectors[25]
print(word)
# print(vector)

# In[ ]:

import torch
from data_util.data import Vocab

# Build an embedding weight matrix aligned with the project Vocab.
# NOTE(review): this cell reads `wvmodel` here but `w2vec` below — presumably
# they are the same gensim model defined in an earlier cell; confirm before
# running standalone.
vocab_size = len(wvmodel.vocab) + 1
vocab = Vocab('Embedding/main_cat/word2Vec/word.vocab', vocab_size)
embed_size = 300
weight = torch.zeros(vocab_size, embed_size)
# Offset of 4 skips the special tokens at the front of the Vocab id space
# (presumably PAD/UNK/START/STOP — TODO confirm against data_util.data).
for i in range(len(vocab._id_to_word.keys())):
    try:
        vocab_word = vocab._id_to_word[i + 4]
        w2vec_word = w2vec.wv.index2entity[i]
    except Exception as e:
        # Ids missing from either vocabulary keep their zero vector.
        continue
    # NOTE(review): this bound check runs after the lookups above; rows at
    # i + 4 == vocab_size would already have raised/continued.
    if i + 4 > vocab_size:
        break
    # print(vocab_word,w2vec_word)
    weight[i + 4, :] = torch.from_numpy(w2vec.wv.vectors[i])
embedding = torch.nn.Embedding.from_pretrained(weight)
class Evaluate(object):
    """Single-batch summarization runner: decodes the first batch with beam
    search and publishes the summary/OOVs via Batcher class attributes."""

    def __init__(self, data_path, opt, batch_size=config.batch_size):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='eval',
                               batch_size=batch_size, single_pass=True)
        self.opt = opt
        # Give the batcher's background threads time to fill the queue.
        time.sleep(5)

    def setup_valid(self):
        """Instantiate the model and load the checkpoint, mapping tensors onto
        GPU when available, CPU otherwise."""
        self.model = Model()
        self.model = get_cuda(self.model)
        if T.cuda.is_available():
            map_location = T.device('cuda')
        else:
            map_location = T.device('cpu')
        checkpoint = T.load(
            os.path.join(config.save_model_path, self.opt.load_model),
            map_location)
        self.model.load_state_dict(checkpoint["model_dict"])
        # mlflow.pytorch.save_model(self.model,config.save_model_path+'_2')
        # mlflow.pytorch.load_model(config.save_model_path+'_2')

    def print_original_predicted(self, decoded_sents, ref_sents, article_sents,
                                 loadfile):
        """Dump article/reference/decoded triples to data/test_<model>.txt."""
        filename = "test_" + loadfile.split(".")[0] + ".txt"
        with open(os.path.join("data", filename), "w") as f:
            for i in range(len(decoded_sents)):
                f.write("article: " + article_sents[i] + "\n")
                f.write("ref: " + ref_sents[i] + "\n")
                f.write("dec: " + decoded_sents[i] + "\n\n")

    def evaluate_batch(self, print_sents=False):
        """Decode ONE batch with beam search (the loop breaks after the first
        iteration) and stash the first summary and its OOV list on Batcher."""
        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(data.START_DECODING)
        end_id = self.vocab.word2id(data.STOP_DECODING)
        unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = get_enc_data(
                batch)
            with T.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
            #-----------------------Summarization----------------------------------------------------
            with T.autograd.no_grad():
                pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask,
                                       ct_e, extra_zeros,
                                       enc_batch_extend_vocab, self.model,
                                       start_id, end_id, unk_id)
            for i in range(len(pred_ids)):
                decoded_words = data.outputids2words(pred_ids[i],
                                                     self.vocab,
                                                     batch.art_oovs[i])
                # ROUGE needs a non-trivial string; substitute a placeholder.
                if len(decoded_words) < 2:
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                abstract = batch.original_abstracts[i]
                article = batch.original_articles[i]
                ref_sents.append(abstract)
                article_sents.append(article)
                # Keep the OOVs of the last processed example.
                article_art_oovs = batch.art_oovs[i]
            #batch = self.batcher.next_batch()
            break
        load_file = self.opt.load_model  # just a model name
        #if print_sents:
        #    self.print_original_predicted(decoded_sents, ref_sents, article_sents, load_file)
        # Publish results via class attributes for consumption elsewhere.
        Batcher.article_summary = decoded_sents[0]
        Batcher.oovs = " ".join(article_art_oovs)
        # print('Article: ',article_sents[0], '\n==> Summary: [',decoded_sents[0],']\nOut of vocabulary: ', " ".join(article_art_oovs),'\nModel used: ', load_file)
        # NOTE(review): scoring is disabled — scores is the int 0, so the else
        # branch below would raise TypeError on scores["rouge-l"]. Presumably
        # only opt.task == "test" is ever used here; confirm before enabling.
        scores = 0  #rouge.get_scores(decoded_sents, ref_sents, avg = True)
        if self.opt.task == "test":
            print('Done.')
            #print(load_file, "scores:", scores)
        else:
            rouge_l = scores["rouge-l"]["f"]
def main():
    """Build an embedding matrix for the vocabulary and persist it to disk."""
    vocab = Vocab(os.path.join(vocab_path, 'vocab3.txt'))
    embedding_loader = WordVectorLoader(embed_dim=200)
    # Align pretrained vectors with the vocab's word -> id mapping.
    embedding_loader.create_embedding_matrix(embeddings_file_name,
                                             vocab._word_to_id)
    embedding_loader.save_embed_mat(embed_path)
class Train(object):
    """Trainer for a concept-augmented pointer-generator model with an optional
    document-similarity (DS) loss term and an optional REINFORCE phase that
    kicks in after `config.traintimes` steps."""

    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.concept_vocab = Concept_vocab(config.concept_vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               self.concept_vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        # Separate batcher (fixed batch of 500) feeding the DS loss.
        self.ds_batcher = Batcher(config.train_ds_data_path,
                                  self.vocab,
                                  self.concept_vocab,
                                  mode='train',
                                  batch_size=500,
                                  single_pass=False)
        # Give batcher threads time to fill their queues.
        time.sleep(15)
        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        """Checkpoint model sub-modules, optimizer state and the running loss."""
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        """Build model + Adagrad optimizer; optionally resume from a checkpoint.

        Returns (start_iter, start_loss) — zeros for a fresh run.
        """
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = AdagradCustom(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            # Load onto CPU first; optimizer tensors are moved to GPU below.
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                # Coverage fine-tuning restarts the optimizer from scratch.
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def calc_Rouge_1(self, sub, string):
        """ROUGE-1 recall of candidate `sub` against reference `string`
        (token id sequences), via the external Pythonrouge wrapper."""
        new_sub = [str(x) for x in sub]
        new_sub.insert(0, '"')
        new_sub.append('"')
        token_c = ' '.join(new_sub)
        summary = [[token_c]]
        new_string = [str(x) for x in string]
        new_string.insert(0, '"')
        new_string.append('"')
        token_r = ' '.join(new_string)
        reference = [[[token_r]]]
        rouge = Pythonrouge(summary_file_exist=False,
                            summary=summary,
                            reference=reference,
                            n_gram=2,
                            ROUGE_SU4=False,
                            ROUGE_L=False,
                            recall_only=True,
                            stemming=True,
                            stopwords=True,
                            word_level=True,
                            length_limit=True,
                            length=30,
                            use_cf=False,
                            cf=95,
                            scoring_formula='average',
                            resampling=False,
                            samples=10,
                            favor=True,
                            p=0.5)
        score = rouge.calc_score()
        return score['ROUGE-1']

    def calc_Rouge_2_recall(self, sub, string):
        """Hand-rolled ROUGE-2 recall: shared bigrams / reference bigrams."""
        token_c = sub
        token_r = string
        model = []
        ref = []
        # NOTE(review): the second `len(string) == 1` looks like a typo for
        # `len(sub) == 1`; also a length-0 `string` would divide by zero below.
        if len(string) == 1 or len(string) == 1:
            score = 0.0
        else:
            i = 1
            while i < len(string):
                ref.append(str(token_r[i - 1]) + str(token_r[i]))
                i += 1
            i = 1
            while i < len(sub):
                model.append(str(token_c[i - 1]) + str(token_c[i]))
                i += 1
            sam = 0
            i = 0
            for i in range(len(ref)):
                for j in range(len(model)):
                    if ref[i] == model[j]:
                        sam += 1
                        model[j] = '-1'  # mark as consumed (clipped counting)
                        break
            score = sam / float(len(ref))
        return score

    def calc_Rouge_L(self, sub, string):
        """LCS-based ROUGE-L; returns RECALL (lcs / |reference|).

        NOTE(review): the F-measure `score` is computed but never returned —
        returning `rec` may be intentional (recall-only reward) but looks like
        a leftover; confirm which was meant.
        """
        beta = 1.
        token_c = sub
        token_r = string
        # Swap so the DP table is built with the shorter sequence as columns;
        # token_c/token_r keep the original roles for prec/rec.
        if (len(string) < len(sub)):
            sub, string = string, sub
        lengths = [[0 for i in range(0, len(sub) + 1)]
                   for j in range(0, len(string) + 1)]
        for j in range(1, len(sub) + 1):
            for i in range(1, len(string) + 1):
                if (string[i - 1] == sub[j - 1]):
                    lengths[i][j] = lengths[i - 1][j - 1] + 1
                else:
                    lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])
        lcs = lengths[len(string)][len(sub)]
        prec = lcs / float(len(token_c))
        rec = lcs / float(len(token_r))
        if (prec != 0 and rec != 0):
            score = ((1 + beta**2) * prec * rec) / float(rec + beta**2 * prec)
        else:
            score = 0.0
        return rec

    def calc_kl(self, dec, enc):
        """KL divergence KL(enc || dec) after softmax-normalizing both vectors."""
        kl = 0.
        dec = np.exp(dec)
        enc = np.exp(enc)
        all_dec = np.sum(dec)
        all_enc = np.sum(enc)
        for d, c in zip(dec, enc):
            d = d / all_dec
            c = c / all_enc
            kl = kl + c * np.log(c / d)
        return kl

    def calc_euc(self, dec, enc):
        """Sum of element-wise absolute differences (L1 via sqrt(square))."""
        euc = 0.
        for d, c in zip(dec, enc):
            euc = euc + np.sqrt(np.square(d - c))
        #print euc
        return euc

    def ds_loss(self, enc_batch_ds_emb, enc_padding_mask_ds, dec_batch_emb, dec_padding_mask):
        """Document-similarity loss: mean pairwise KL between masked, summed
        decoder-title embeddings and DS-article embeddings."""
        b1, t_k1, emb1 = list(enc_batch_ds_emb.size())
        b2, t_k2, emb2 = list(dec_batch_emb.size())
        # Broadcast the padding masks over the embedding dimension.
        enc_padding_mask_ds = enc_padding_mask_ds.unsqueeze(2).expand(
            b1, t_k1, emb1).contiguous()
        dec_padding_mask = dec_padding_mask.unsqueeze(2).expand(
            b2, t_k2, emb2).contiguous()
        enc_batch_ds_emb = enc_batch_ds_emb * enc_padding_mask_ds
        dec_batch_emb = dec_batch_emb * dec_padding_mask
        # Collapse time dimension: one vector per sequence.
        enc_batch_ds_emb = torch.sum(enc_batch_ds_emb, 1)
        dec_batch_emb = torch.sum(dec_batch_emb, 1)
        dec_title = dec_batch_emb.tolist()
        enc_article = enc_batch_ds_emb.tolist()
        dec_title_len = len(dec_title)
        enc_article_len = len(enc_article)
        dsloss = 0.
        for dec in dec_title:
            for enc in enc_article:
                dsloss = dsloss + self.calc_kl(dec, enc)
        dsloss = dsloss / float(dec_title_len * enc_article_len)
        print(dsloss)
        return dsloss

    def train_one_batch(self, batch, steps, batch_ds):
        """One optimization step: teacher-forced MLE loss, optionally scaled by
        the DS loss, optionally mixed with a self-critical REINFORCE loss
        (sampled vs greedy rollout rewarded by ROUGE-L) once steps > traintimes.

        Returns the scalar loss (old-PyTorch `loss.data[0]` access — NOTE(review):
        this raises on PyTorch >= 0.4; `loss.item()` is the modern equivalent).
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, enc_batch_concept_extend_vocab, concept_p, position, concept_mask, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)
        enc_batch_ds, enc_padding_mask_ds, enc_lens_ds, _, _, _, _, _, _, _, _ = \
            get_input_from_batch(batch_ds, use_cuda)
        self.optimizer.zero_grad()
        encoder_outputs, encoder_hidden, max_encoder_output, enc_batch_ds_emb, dec_batch_emb = self.model.encoder(
            enc_batch, enc_lens, enc_batch_ds, dec_batch)
        if config.DS_train:
            ds_final_loss = self.ds_loss(enc_batch_ds_emb, enc_padding_mask_ds,
                                         dec_batch_emb, dec_padding_mask)
        s_t_1 = self.model.reduce_state(encoder_hidden)
        # Snapshot initial decoder state/context so the RL rollouts below can
        # restart decoding from the same point.
        s_t_0 = s_t_1
        c_t_0 = c_t_1
        if config.use_maxpool_init_ctx:
            c_t_1 = max_encoder_output
            c_t_0 = c_t_1
        # ---- Teacher-forced MLE pass ----
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                'train', y_t_1, s_t_1, encoder_outputs, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab,
                enc_batch_concept_extend_vocab, concept_p, position,
                concept_mask, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)
        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        if config.DS_train:
            ds_final_loss = Variable(torch.FloatTensor([ds_final_loss]),
                                     requires_grad=False)
            ds_final_loss = ds_final_loss.cuda()
            # Scale MLE loss by (pi - DS loss): more-similar doc pairs weigh less.
            loss = (config.pi - ds_final_loss) * torch.mean(batch_avg_loss)
        else:
            loss = torch.mean(batch_avg_loss)
        if steps > config.traintimes:
            # ---- REINFORCE phase: sampled rollout ----
            scores = []
            sample_y = []
            s_t_1 = s_t_0
            c_t_1 = c_t_0
            for di in range(min(max_dec_len, config.max_dec_steps)):
                if di == 0:
                    y_t_1 = dec_batch[:, di]
                    sample_y.append(y_t_1.cpu().numpy().tolist())
                else:
                    sample_latest_tokens = sample_y[-1]
                    # Replace extended-vocab (copied OOV) ids with UNK for input.
                    sample_latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                                            for t in sample_latest_tokens]
                    y_t_1 = Variable(torch.LongTensor(sample_latest_tokens))
                    y_t_1 = y_t_1.cuda()
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    'train', y_t_1, s_t_1, encoder_outputs, enc_padding_mask,
                    c_t_1, extra_zeros, enc_batch_extend_vocab,
                    enc_batch_concept_extend_vocab, concept_p, position,
                    concept_mask, coverage, di)
                sample_select = torch.multinomial(final_dist, 1).view(-1)
                sample_log_probs = torch.gather(
                    final_dist, 1, sample_select.unsqueeze(1)).squeeze()
                sample_y.append(sample_select.cpu().numpy().tolist())
                sample_step_loss = -torch.log(sample_log_probs + config.eps)
                sample_step_mask = dec_padding_mask[:, di]
                sample_step_loss = sample_step_loss * sample_step_mask
                scores.append(sample_step_loss)
            sample_sum_losses = torch.sum(torch.stack(scores, 1), 1)
            sample_batch_avg_loss = sample_sum_losses / dec_lens_var
            sample_y = np.transpose(sample_y).tolist()  # time-major -> batch-major
            # ---- Greedy (baseline) rollout ----
            base_y = []
            s_t_1 = s_t_0
            c_t_1 = c_t_0
            for di in range(min(max_dec_len, config.max_dec_steps)):
                if di == 0:
                    y_t_1 = dec_batch[:, di]
                    base_y.append(y_t_1.cpu().numpy().tolist())
                else:
                    base_latest_tokens = base_y[-1]
                    base_latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                                          for t in base_latest_tokens]
                    y_t_1 = Variable(torch.LongTensor(base_latest_tokens))
                    y_t_1 = y_t_1.cuda()
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = \
                    self.model.decoder(
                        'train', y_t_1, s_t_1, encoder_outputs, enc_padding_mask,
                        c_t_1, extra_zeros, enc_batch_extend_vocab,
                        enc_batch_concept_extend_vocab, concept_p, position,
                        concept_mask, coverage, di)
                base_log_probs, base_ids = torch.topk(final_dist, 1)
                base_y.append(base_ids[:, 0].cpu().numpy().tolist())
            base_y = np.transpose(base_y).tolist()
            refs = dec_batch.cpu().numpy().tolist()
            # NOTE(review): under Python 3, map() is a one-shot iterator — it is
            # exhausted by the first zip below, so `base_rougeL` becomes an empty
            # list. Wrap in list(...) to restore Python-2 behavior.
            sample_dec_lens_var = map(int, dec_lens_var.cpu().numpy().tolist())
            sample_rougeL = [
                self.calc_Rouge_L(sample[:reflen], ref[:reflen])
                for sample, ref, reflen in zip(
                    sample_y, refs, sample_dec_lens_var)
            ]
            base_rougeL = [
                self.calc_Rouge_L(base[:reflen], ref[:reflen])
                for base, ref, reflen in zip(base_y, refs, sample_dec_lens_var)
            ]
            sample_rougeL = Variable(torch.FloatTensor(sample_rougeL),
                                     requires_grad=False)
            base_rougeL = Variable(torch.FloatTensor(base_rougeL),
                                   requires_grad=False)
            sample_rougeL = sample_rougeL.cuda()
            base_rougeL = base_rougeL.cuda()
            # Self-critical reward: (baseline - sample); minimized loss pushes
            # sample probability up when it beats the greedy baseline.
            word_loss = -sample_batch_avg_loss * (base_rougeL - sample_rougeL)
            reinforce_loss = torch.mean(word_loss)
            loss = (1 - config.rein) * loss + config.rein * reinforce_loss
        loss.backward()
        # NOTE(review): clip_grad_norm is the deprecated pre-0.4 name
        # (clip_grad_norm_ in modern PyTorch).
        clip_grad_norm(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm(self.model.reduce_state.parameters(),
                       config.max_grad_norm)
        self.optimizer.step()
        return loss.data[0]

    def trainIters(self, n_iters, model_file_path=None):
        """Main training loop: run until `n_iters`, log every 5 steps,
        flush summaries every 100, checkpoint every 50000."""
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            batch_ds = self.ds_batcher.next_batch()
            loss = self.train_one_batch(batch, iter, batch_ds)
            loss = loss.cpu()
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1
            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 5
            if iter % print_interval == 0:
                print('steps %d , loss: %f' % (iter, loss))
                start = time.time()
            if iter % 50000 == 0:
                self.save_model(running_avg_loss, iter)
class BeamSearch(object):
    """Beam-search decoder for AMI meeting transcripts; writes one decoded
    summary file per test item for later ROUGE evaluation."""

    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        # Output tree: decode_dir/<model_name_without_ext>/{rouge_ref,rouge_dec_dir}
        self._decode_dir = os.path.join(config.decode_dir, model_name)
        self._decode_dir = os.path.splitext(self._decode_dir)[0]
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            Path(p).mkdir(parents=True, exist_ok=True)
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.pad_id = self.vocab.word2id(PAD_TOKEN)
        self.start_id = self.vocab.word2id(START_DECODING)
        self.stop_id = self.vocab.word2id(STOP_DECODING)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        """Best-first ordering by average log-probability."""
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def if_already_exists(self, idx):
        """True when the decoded file for test item `idx` is already on disk."""
        decoded_file = os.path.join(self._rouge_dec_dir,
                                    "file.{}.txt".format(idx))
        return os.path.isfile(decoded_file)

    def decode(self, file_id_start, file_id_stop, ami_id='191209'):
        """Decode test items in [file_id_start, file_id_stop), skipping ones
        already written; IDs are shuffled so parallel workers interleave."""
        print("AMI transcription:", ami_id)
        test_data = load_ami_data(ami_id, 'test')
        # do this for faster stack CPU machines - to replace those that fail!!
        idx_list = [i for i in range(file_id_start, file_id_stop)]
        random.shuffle(idx_list)
        for idx in idx_list:
            # for idx in range(file_id_start, file_id_stop):
            # check if this is written already
            if self.if_already_exists(idx):
                print("ID {} already exists".format(idx))
                continue
            # Run beam search to get best Hypothesis
            best_summary, art_oovs = self.beam_search(test_data, idx)
            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (art_oovs[0] if config.pointer_gen else None))
            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words
            # original_abstract_sents = batch.original_abstracts_sents[0]
            original_abstract_sents = []  # no references written in this mode
            write_for_rouge(original_abstract_sents, decoded_words, idx,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            print("decoded idx = {}".format(idx))
        print("Finished decoding idx [{},{})".format(file_id_start,
                                                     file_id_stop))
        # print("Starting ROUGE eval...")
        # results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        # rouge_log(results_dict, self._decode_dir)

    def beam_search(self, test_data, idx):
        """Standard beam search for one test item.

        Returns (best Beam hypothesis, article OOV lists).
        """
        # batch should have only one example
        # enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
        #         get_input_from_batch(batch, use_cuda)
        enc_pack, art_oovs = get_a_batch_decode(test_data, idx, self.vocab,
                                                config.beam_size,
                                                config.max_enc_steps,
                                                config.max_dec_steps,
                                                self.start_id, self.stop_id,
                                                self.pad_id,
                                                sum_type='short',
                                                use_cuda=use_cuda)
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = enc_pack
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state.forward1(encoder_hidden)
        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()
        # decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []  # finished hypotheses (emitted STOP)
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            # Extended-vocab (copied OOV) ids are fed back as UNK.
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            # Re-batch the per-beam decoder states for one decoder step.
            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)
            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0),
                     torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)
            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)
            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)
            log_probs = torch.log(final_dist)
            # 2x beam_size candidates per beam so enough survive STOP filtering.
            topk_log_probs, topk_ids = torch.topk(log_probs,
                                                  config.beam_size * 2)
            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()
            all_beams = []
            # At step 0 all beams are identical; expand only the first.
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)
                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)
            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    # Only accept finished hypotheses past the min length.
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break
            steps += 1
        if len(results) == 0:
            # Nothing finished within max_dec_steps: fall back to live beams.
            results = beams
        beams_sorted = self.sort_beams(results)
        return beams_sorted[0], art_oovs
class Train(object):
    """Trainer for the (non-hierarchical) pointer-generator on AMI data, with
    periodic validation and early stopping."""

    def __init__(self):
        if config.is_hierarchical:
            raise Exception("Hierarchical PGN-AMI not supported!")
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.pad_id = self.vocab.word2id(PAD_TOKEN)
        self.start_id = self.vocab.word2id(START_DECODING)
        self.stop_id = self.vocab.word2id(STOP_DECODING)
        self.print_interval = config.print_interval
        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        """Checkpoint model sub-modules, optimizer state and the running loss."""
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'iter{}.pt'.format(iter))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        """Build model + Adagrad optimizer; optionally resume from a checkpoint.

        Returns (start_iter, start_loss).
        """
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                # Coverage fine-tuning restarts the optimizer from scratch.
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, ami_data, idx):
        """One teacher-forced optimization step over the batch starting at
        example `idx`; returns the scalar loss."""
        # enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
        #     get_ami_input_from_batch(batch, use_cuda)
        # dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
        #     get_ami_output_from_batch(batch, use_cuda)
        enc_pack, dec_pack = get_a_batch(ami_data, idx, self.vocab,
                                         config.batch_size,
                                         config.max_enc_steps,
                                         config.max_dec_steps, self.start_id,
                                         self.stop_id, self.pad_id,
                                         sum_type='short',
                                         use_cuda=use_cuda)
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = enc_pack
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = dec_pack
        self.optimizer.zero_grad()
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state.forward1(encoder_hidden)
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)
        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        loss.backward()
        clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        """Training loop with shuffling each epoch, periodic checkpointing,
        and early stopping once validation fails to improve `stop_after` times."""
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()
        ami_data = load_ami_data('train')
        valid_data = load_ami_data('valid')
        # make the training data 100
        random.shuffle(valid_data)
        ami_data.extend(valid_data[:6])
        valid_data = valid_data[6:]
        # NOTE(review): this is the number of EXAMPLES, not batches; and idx
        # advances by batch_size, so `idx == num_batches` below can be skipped
        # entirely when batch_size doesn't divide it — confirm intended wrap.
        num_batches = len(ami_data)
        idx = 0
        # validation & stopping
        best_valid_loss = 1000000000
        stop_counter = 0
        while iter < n_iters:
            if idx == 0:
                print("shuffle training data")
                random.shuffle(ami_data)
            loss = self.train_one_batch(ami_data, idx)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            iter += 1
            idx += config.batch_size
            if idx == num_batches:
                idx = 0
            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(
                    str(datetime.now()), iter, loss))
                sys.stdout.flush()
            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)
            if iter % config.eval_every == 0:
                valid_loss = self.run_eval(valid_data)
                print("valid_loss = {:.5f}".format(valid_loss))
                if valid_loss < best_valid_loss:
                    stop_counter = 0
                    best_valid_loss = valid_loss
                    print("VALID better")
                else:
                    stop_counter += 1
                    print(
                        "VALID NOT better, counter = {}".format(stop_counter))
                    if stop_counter == config.stop_after:
                        print("Stop training")
                        return
        print("Finished training!")

    def eval_one_batch(self, eval_data, idx):
        """Teacher-forced loss for ONE validation example (batch size 1),
        without gradient updates; returns the scalar loss."""
        enc_pack, dec_pack = get_a_batch(eval_data, idx, self.vocab, 1,
                                         config.max_enc_steps,
                                         config.max_dec_steps, self.start_id,
                                         self.stop_id, self.pad_id,
                                         sum_type='short',
                                         use_cuda=use_cuda)
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = enc_pack
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = dec_pack
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state.forward1(encoder_hidden)
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros,
                enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist,
                                      dim=1,
                                      index=target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)
        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        return loss.data.item()

    def run_eval(self, eval_data):
        """Average teacher-forced loss over the whole validation set."""
        running_avg_loss, iter = 0, 0
        batch_losses = []
        num_batches = len(eval_data)
        print("valid data size = {}".format(num_batches))
        for idx in range(num_batches):
            loss = self.eval_one_batch(eval_data, idx)
            batch_losses.append(loss)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("#", end="")
            sys.stdout.flush()
        print()
        avg_loss = sum(batch_losses) / len(batch_losses)
        return avg_loss
class Train(object):
    """Trainer for a pointer-generator that optionally uses a hierarchical
    (sentence-level) encoder/decoder, fed by a background Batcher."""

    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        # print("MODE MUST BE train")
        # time.sleep(15)
        self.print_interval = config.print_interval
        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        # self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        """Checkpoint model sub-modules, optimizer state and the running loss."""
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'iter{}.pt'.format(iter))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        """Build model + Adagrad optimizer; optionally resume from a checkpoint.

        Returns (start_iter, start_loss).
        """
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params,
                                 lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                # Coverage fine-tuning restarts the optimizer from scratch.
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):
        """One teacher-forced optimization step (flat or hierarchical path);
        returns the scalar loss."""
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)
        self.optimizer.zero_grad()
        if not config.is_hierarchical:
            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
            s_t_1 = self.model.reduce_state.forward1(encoder_hidden)
        else:
            # Sentence boundaries are recovered from '.' / '[PAD]' token ids.
            stop_id = self.vocab.word2id('.')
            pad_id = self.vocab.word2id('[PAD]')
            enc_sent_pos = get_sent_position(enc_batch, stop_id, pad_id)
            dec_sent_pos = get_sent_position(dec_batch, stop_id, pad_id)
            encoder_outputs, encoder_feature, encoder_hidden, sent_enc_outputs, sent_enc_feature, sent_enc_hidden, sent_enc_padding_mask, sent_lens, seq_lens2 = \
                self.model.encoder(enc_batch, enc_lens, enc_sent_pos)
            s_t_1, sent_s_t_1 = self.model.reduce_state(encoder_hidden, sent_enc_hidden)
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            if not config.is_hierarchical:
                # start = datetime.now()
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(y_t_1, s_t_1,
                                                            encoder_outputs, encoder_feature, enc_padding_mask, c_t_1,
                                                            extra_zeros, enc_batch_extend_vocab, coverage, di)
                # print('NO HIER Time: ',datetime.now() - start)
                # import pdb; pdb.set_trace()
            else:
                # start = datetime.now()
                max_doc_len = enc_batch.size(1)
                # NOTE(review): `sent_s_t_1` is passed in BOTH the word-level and
                # sentence-level state positions (and `s_t_1` is never used here)
                # — confirm against the decoder signature; looks like a slip.
                final_dist, sent_s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(y_t_1, sent_s_t_1,
                                                            encoder_outputs, encoder_feature, enc_padding_mask, seq_lens2,
                                                            sent_s_t_1, sent_enc_outputs, sent_enc_feature, sent_enc_padding_mask, sent_lens,
                                                            max_doc_len, c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                # print('DO HIER Time: ',datetime.now() - start)
                # import pdb; pdb.set_trace()
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)
        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses/dec_lens_var
        loss = torch.mean(batch_avg_loss)
        # start = datatime.now()
        loss.backward()
        # print('{} HIER Time: {}'.format(config.is_hierarchical ,datetime.now() - start))
        # import pdb; pdb.set_trace()
        clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        """Training loop: one batch per iteration from the batcher, periodic
        logging and checkpointing; no validation in this variant."""
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()
        # data_path = "lib/data/batches_train.vocab50000.batch16.pk.bin"
        # with open(data_path, 'rb') as f:
        #     stored_batches = pickle.load(f, encoding="bytes")
        # print("loaded data: {}".format(data_path))
        # num_batches = len(stored_batches)
        while iter < n_iters:
            batch = self.batcher.next_batch()
            # batch_id = iter%num_batches
            # batch = stored_batches[batch_id]
            loss = self.train_one_batch(batch)
            # running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
            iter += 1
            # if iter % 100 == 0:
            #     self.summary_writer.flush()
            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()),
                                                          iter, loss))
                sys.stdout.flush()
            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)
        print("Finished training!")
class Evaluate_pg(object):
    """Policy-gradient evaluation loop for the pointer-generator summarizer.

    Greedily decodes the validation set (no teacher forcing), scores the
    generated summaries with sentence- and/or word-level rewards, and folds
    the per-token NLL losses into REINFORCE-style policy-gradient losses.
    """

    def __init__(self, model_file_path, is_word_level, is_combined, alpha):
        # model_file_path: checkpoint to evaluate.
        # is_word_level / is_combined: choose word-level, sentence-level, or
        #   blended rewards. alpha blends word vs. sentence reward when combined.
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
        #                        batch_size=config.batch_size, single_pass=True)
        self.dataset = DailyMailDataset("val", self.vocab)
        # time.sleep(15)
        model_name = os.path.basename(model_file_path)
        self.is_word_level = is_word_level
        self.is_combined = is_combined
        self.alpha = alpha
        # Per-checkpoint eval directory under the log root.
        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.model = Model(model_file_path, is_eval=True)

    def compute_policy_grads_using_rewards(self, sentence_rewards, word_rewards,
                                           sentence_losses, word_losses,
                                           word_to_sent_ind):
        """Combine rewards with losses into one scalar PG loss per abstract.

        word_to_sent_ind[i][j] maps word j of abstract i to its sentence index,
        letting a word inherit its sentence's reward in the combined mode.
        Returns a list with one summed loss per abstract in the batch.
        """
        if self.is_combined:
            # Blend: alpha * word reward + (1 - alpha) * enclosing-sentence reward.
            # The j < len(word_to_sent_ind[i]) guard drops trailing words that
            # have no sentence mapping.
            pg_losses = [[(self.alpha * word_reward + (1 - self.alpha) *
                           sentence_rewards[i][word_to_sent_ind[i][j]]) * word_losses[i][j]
                          for j, word_reward in enumerate(abstract_rewards)
                          if j < len(word_to_sent_ind[i])]
                         for i, abstract_rewards in enumerate(word_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        elif self.is_word_level:
            # Pure word-level: each token's loss weighted by its own reward.
            pg_losses = [[
                word_reward * word_losses[i][j]
                for j, word_reward in enumerate(abstract_rewards)
                if j < len(word_to_sent_ind[i])
            ] for i, abstract_rewards in enumerate(word_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        else:
            # Pure sentence-level: each sentence's summed loss weighted by its reward.
            pg_losses = [[
                rs * sentence_losses[ri][rsi] for rsi, rs in enumerate(r)
            ] for ri, r in enumerate(sentence_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        return pg_losses

    def compute_pg_loss(self, orig, pred, sentence_losses, split_predictions,
                        word_losses, word_to_sent_ind):
        """Compute rewards for the mode in effect, then delegate to the combiner."""
        sentence_rewards = None
        word_rewards = None
        # First compute the rewards
        if not self.is_word_level or self.is_combined:
            sentence_rewards = get_sentence_rewards(orig, pred)
        if self.is_word_level or self.is_combined:
            word_rewards = get_word_level_rewards(orig, split_predictions)
        pg_losses = self.compute_policy_grads_using_rewards(
            sentence_rewards=sentence_rewards,
            word_rewards=word_rewards,
            sentence_losses=sentence_losses,
            word_losses=word_losses,
            word_to_sent_ind=word_to_sent_ind)
        return pg_losses

    def compute_batched_loss(self, word_losses, orig, pred):
        """Split predictions into sentences, aggregate losses, and score them.

        word_losses: per-token losses per abstract; orig/pred: reference and
        predicted token lists per abstract. Sentences are delimited by ".".
        """
        orig_sum = []
        new_pred = []
        pred_sum = []
        sentence_losses = []
        # Convert the original sum as one single string per article
        for i in range(len(orig)):
            orig_sum.append(' '.join(map(str, orig[i])))
            new_pred.append([])
            pred_sum.append([])
            sentence_losses.append([])
        batch_sent_indices = []
        for i in range(len(pred)):
            sentence = []
            sentence = pred[i]
            losses = word_losses[i]
            sentence_indices = []
            count = 0
            # Walk the prediction sentence-by-sentence; a missing "." means the
            # remainder is treated as one final sentence.
            while len(sentence) > 0:
                try:
                    idx = sentence.index(".")
                except ValueError:
                    idx = len(sentence)
                sentence_indices.extend([count for _ in range(idx)])
                if count > 0:
                    # Each entry is the cumulative prefix up to sentence `count`
                    # (previous prefix + current sentence incl. the ".").
                    new_pred[i].append(new_pred[i][count - 1] + sentence[:idx + 1])
                else:
                    new_pred[i].append(sentence[:idx + 1])
                sentence_losses[i].append(sum(losses[:idx + 1]))
                sentence = sentence[idx + 1:]
                losses = losses[idx + 1:]
                count += 1
            batch_sent_indices.append(sentence_indices)
        for i in range(len(pred)):
            for j in range(len(new_pred[i])):
                pred_sum[i].append(' '.join(map(str, new_pred[i][j])))
        pg_losses = self.compute_pg_loss(orig_sum,
                                         pred_sum,
                                         sentence_losses,
                                         split_predictions=pred,
                                         word_losses=word_losses,
                                         word_to_sent_ind=batch_sent_indices)
        return pg_losses

    def eval_one_batch(self, batch):
        """Greedy-decode one batch and return the mean policy-gradient loss."""
        batch_size = batch.batch_size
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)
        step_losses = []
        output_ids = []
        # Start every sequence with the START token; decoding is greedy
        # (argmax feed-back), not teacher-forced.
        y_t_1 = torch.ones(batch_size, dtype=torch.long) * self.vocab.word2id(
            data.START_DECODING)
        if config.use_gpu:
            y_t_1 = y_t_1.cuda()
        for _ in range(batch_size):
            output_ids.append([])
            step_losses.append([])
        for di in range(min(max_dec_len, config.max_dec_steps)):
            #y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)  #NLL
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            # Move on to the next token
            _, idx = torch.max(final_dist, 1)
            idx = idx.reshape(batch_size, -1).squeeze()
            y_t_1 = idx
            for i, pred in enumerate(y_t_1):
                # NOTE(review): pred.item() is an int vocab id while
                # data.PAD_TOKEN looks like a token constant — if it is a
                # string this comparison is always unequal; verify intent.
                if not pred.item() == data.PAD_TOKEN:
                    output_ids[i].append(pred.item())
            for i, loss in enumerate(step_loss):
                step_losses[i].append(step_loss[i])
        # Obtain the original and predicted summaries
        original_abstracts = batch.original_abstracts_sents
        predicted_abstracts = [
            data.outputids2words(ids, self.vocab, None) for ids in output_ids
        ]
        # Compute the batched loss
        batched_losses = self.compute_batched_loss(step_losses,
                                                   original_abstracts,
                                                   predicted_abstracts)
        losses = torch.stack(batched_losses)
        # Length-normalize per abstract, then average over the batch.
        losses = losses / dec_lens_var
        loss = torch.mean(losses)
        return loss.item()

    def run_eval(self, model_dir, train_iter_id):
        """Evaluate the whole validation set; pickle and return running-avg losses.

        model_dir: directory the loss pickles are written into.
        train_iter_id: training iteration label embedded in the pickle names.
        """
        dataloader = DataLoader(self.dataset,
                                batch_size=config.batch_size,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=create_batch_collate(
                                    self.vocab, config.batch_size))
        running_avg_loss, iter = 0, 0
        start = time.time()
        # batch = self.batcher.next_batch()
        pg_losses = []
        run_avg_losses = []
        for batch in dataloader:
            loss = self.eval_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("Iteration:", iter, " loss:", loss, " Running avg loss:",
                  running_avg_loss)
            iter += 1
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start,
                       running_avg_loss))
                start = time.time()
            pg_losses.append(loss)
            run_avg_losses.append(running_avg_loss)
        # Dump val losses
        pickle.dump(
            pg_losses,
            open(
                os.path.join(model_dir,
                             'val_pg_losses_{}.p'.format(train_iter_id)),
                'wb'))
        pickle.dump(
            run_avg_losses,
            open(
                os.path.join(model_dir,
                             'val_run_avg_losses_{}.p'.format(train_iter_id)),
                'wb'))
        return run_avg_losses
class BeamSearch(object):
    """Beam-search decoder for the pointer-generator model.

    Decodes the test set one example at a time (the batcher replicates each
    example beam_size times), writes reference/decoded files, and runs ROUGE.
    """

    def __init__(self, model_file_path):
        # Create per-checkpoint output directories for ROUGE evaluation.
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        # Give the batcher's background threads time to fill their queues.
        time.sleep(15)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        """Sort hypotheses best-first by length-normalized log-probability."""
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        """Decode every example, write ROUGE files, then run ROUGE eval."""
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        """Run beam search on one example; returns the best Hypothesis.

        batch should have only one example, replicated beam_size times.
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)
        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        if config.use_maxpool_init_ctx:
            # Initialize the decoder context with the max-pooled encoder output.
            c_t_0 = max_encoder_output

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially
        # everything is repeated
        # FIX: xrange -> range (Python 3; matches the other decoders in this file).
        beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context=c_t_0[0],
                      coverage=(coverage_t_0[0] if config.is_coverage else None))
                 for _ in range(config.beam_size)]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            # Map any in-article OOV ids back to UNK for the decoder input.
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)
            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0),
                     torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(y_t_1, s_t_1,
                                                                                    encoder_outputs, enc_padding_mask,
                                                                                    c_t_1, extra_zeros,
                                                                                    enc_batch_extend_vocab,
                                                                                    coverage_t_1)
            # FIX: score beams with log-probabilities, as the sibling BeamSearch
            # implementations in this file do; topk on raw final_dist made
            # avg_log_prob sum probabilities instead of log-probs.
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            # On the first step all beams are identical, so only extend one.
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    # FIX: .data[0] -> .item() (0-dim tensor indexing was removed
                    # in modern PyTorch; siblings already use .item()).
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    # Completed hypotheses only count after min_dec_steps.
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            # Nothing finished within max_dec_steps: fall back to live beams.
            results = beams

        beams_sorted = self.sort_beams(results)
        return beams_sorted[0]
class Train(object):
    """Training driver for the pointer-generator model.

    Owns the train/val batchers, the optimizer/scheduler, checkpointing,
    and the teacher-forced training loop with optional coverage loss.
    """

    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        # Give the batcher's background threads time to fill their queues.
        time.sleep(15)
        self.val_batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                                   batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        # Timestamped run directory plus a model/ subdirectory for checkpoints.
        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        """Checkpoint model/optimizer state as model_<iter>_<unix-time>."""
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        """Build model/optimizer; optionally resume; returns (iter, loss)."""
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)
        self.scheduler = ExponentialLR(self.optimizer, gamma=0.99)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:  # when resuming from a checkpoint
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:  # when coverage is disabled
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    # Optimizer state was loaded onto CPU; move tensors to GPU.
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch, iter):
        """One teacher-forced step; returns the batch's mean loss (float)."""
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        # Greedy picks for the first batch element, printed every 100 iters
        # as a qualitative sanity check.
        words = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            words.append(self.vocab.id2word(final_dist[0].argmax().item()))
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            # print('step_loss',step_loss)
            # print('step_loss.size()',step_loss.size())
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        if iter % 100 == 0:
            print(words)
            print([self.vocab.id2word(idx.item()) for idx in dec_batch[0]])
            print([self.vocab.id2word(idx.item()) for idx in target_batch[0]])

        # Sum over time, normalize by decoder lengths, average over the batch.
        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def eval_one_batch(self, batch):
        """Teacher-forced validation loss for one batch (no gradient step)."""
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        # print(loss)
        # print(type(loss))
        # print(loss.data)
        # print(loss.data.item())
        # return loss.data[0]
        return loss.data.item()

    def trainIters(self, n_iters, model_file_path=None):
        """Main loop: train to n_iters, validating/stepping the LR every 100
        iterations and checkpointing every 1000."""
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, iter)
            val_loss = None
            if iter % 100 == 0:
                val_batch = self.val_batcher.next_batch()
                val_loss = self.eval_one_batch(val_batch)
                # print("val_loss",val_loss)
                # NOTE(review): scheduler stepped every 100 iters here (0.99
                # decay per step) — confirm against the original layout; the
                # collapsed source is ambiguous about this indentation.
                self.scheduler.step()
                print("lr", self.scheduler.get_lr())
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1
            if iter % print_interval == 0:
                if val_loss is not None:
                    print(
                        'steps %d, seconds for %d batch: %.2f , loss: %f , eval_loss: %f'
                        % (iter, print_interval, time.time() - start, loss,
                           val_loss))
                else:
                    print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                          (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 1000 == 0:
                self.save_model(running_avg_loss, iter)
class BeamSearch(object):
    """Beam-search decoder that reads raw op-ed text files instead of a
    preprocessed dataset: each file becomes one Batch (the single Example
    replicated beam_size times), decoded without ROUGE scoring.
    """

    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        '''self.batcher = Batcher(config.oped_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)'''
        self.batches = self.read_opeds(config.oped_data_path, self.vocab,
                                       config.beam_size)
        self.model = Model(model_file_path, is_eval=True)

    def read_opeds(self, config_path, vocab, beam_size):
        """Read each file matching the glob into a beam_size-replicated Batch.

        Text is lowercased, newlines stripped, and punctuation space-padded
        so it tokenizes on whitespace.
        """
        file_list = glob.glob(config_path)
        #file_list = os.listdir(config_path)
        batch_list = []
        for file in file_list:
            with open(file, 'rb') as f:
                text = f.read().lower().decode('utf-8')
                text = re.sub('\n', '', text)
                # Pad punctuation with spaces so tokens split cleanly.
                text = re.sub(r'([.,!?()"])', r' \1 ', text).encode('utf-8')
                print(text)
                ex = Example(text, [], vocab)
                # text = text.split()
                # if len(text) > config.max_enc_steps:
                #     text = text[:config.max_enc_steps]
                # enc_input = [vocab.word2id(w.decode('utf-8')) for w in text]
                # assert(sum(enc_input) != 0)
                # Replicate the example beam_size times — beam search expects
                # one example repeated across the batch dimension.
                enc_input = [ex for _ in range(beam_size)]
                batch = Batch(enc_input, vocab, beam_size)
                batch_list.append(batch)
                print(batch.enc_batch)
        return batch_list

    def sort_beams(self, beams):
        """Sort hypotheses best-first by length-normalized log-probability."""
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        """Beam-search each preloaded batch and write the decoded summaries."""
        start = time.time()
        counter = 0
        for batch in self.batches:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            write_results(decoded_words, counter, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec' % (counter, time.time() - start))
                start = time.time()

        '''print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)'''

    def beam_search(self, batch):
        """Run beam search on one example; returns the best Hypothesis.

        batch should have only one example, replicated beam_size times.
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially
        # everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            # Map any in-article OOV ids back to UNK for the decoder input.
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)
            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0),
                     torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)
            # Beams are scored and extended with log-probabilities.
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            # On the first step all beams are identical, so only extend one.
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    # Completed hypotheses only count after min_dec_steps.
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            # Nothing finished within max_dec_steps: fall back to live beams.
            results = beams

        beams_sorted = self.sort_beams(results)
        return beams_sorted[0]
class BeamSearch(object):
    """Beam-search decoder variant that writes tab-separated results to a
    single text file (uses MARK_GO/MARK_EOS/MARK_UNK token constants; ROUGE
    file output is commented out).
    """

    def __init__(self, model_file_path):
        # Create per-checkpoint output directories (ROUGE dirs are kept even
        # though ROUGE evaluation below is commented out).
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        # Give the batcher's background threads time to fill their queues.
        time.sleep(15)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        """Sort hypotheses best-first by length-normalized log-probability."""
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        """Decode every example and append each result to the result file."""
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()  # in the new architecture this lives in the training decode step
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.MARK_EOS)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]
            original_article = batch.original_articles[0]
            # English-style ROUGE file output:
            # write_for_rouge(original_abstract_sents, decoded_words, counter,
            #                 self._rouge_ref_dir, self._rouge_dec_dir)
            # Chinese-style single-file output:
            self.write_result(original_article, original_abstract_sents,
                              decoded_words, counter)
            counter += 1
            # if counter % 1000 == 0:
            #     print('%d example in %d sec'%(counter, time.time() - start))
            #     start = time.time()
            batch = self.batcher.next_batch()

        # print("Decoder has finished reading dataset for single_pass.")
        # print("Now starting ROUGE eval...")
        # results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        # rouge_log(results_dict, self._decode_dir)

    def write_result(self, original_title, reference_summarization,
                     decoded_words, ex_index):
        """Append one decoded example to the shared result file.

        Args:
            original_title: source article/title string.
            reference_summarization: reference summary string.
            decoded_words: list of decoded token strings (joined without spaces).
            ex_index: int, the index with which to label the example.
        """
        summarization = ''.join(decoded_words)

        # Write to file
        result_file = os.path.join(self._decode_dir, "result.txt")
        # FIX: open in append mode — mode 'w' truncated the file on every call,
        # so only the last decoded example survived the decode loop.
        # NOTE(review): consider an explicit encoding='utf-8' for non-ASCII
        # output; left as platform default to avoid changing behavior further.
        with open(result_file, 'a') as f:
            f.write(original_title + '\t\t' + reference_summarization +
                    '\t\t' + summarization + "\n")
        print("Wrote example %i to file" % ex_index)

    def beam_search(self, batch):
        """Run beam search on one example; returns the best Hypothesis.

        batch should have only one example, replicated beam_size times.
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially
        # everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.MARK_GO)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            # Map any in-article OOV ids back to UNK for the decoder input.
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.MARK_UNK) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))  # latest-token id tensor
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []
            all_context = []
            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)
            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0),
                     torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)
            # Beams are scored and extended with log-probabilities.
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            # On the first step all beams are identical, so only extend one.
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):  # for each distinct hypothesis
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.MARK_EOS):
                    # Completed hypotheses only count after min_dec_steps.
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            # Nothing finished within max_dec_steps: fall back to live beams.
            results = beams

        beams_sorted = self.sort_beams(results)
        return beams_sorted[0]