def __init__(self, model_file_path):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path, self.vocab, mode='eval',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(5)

    eval_dir = os.path.join(config.log_root, 'eval_%d' % (int(time.time())))
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)
    self.summary_writer = tf.summary.FileWriter(eval_dir)

    self.model = Model(model_file_path, is_eval=True)

def __init__(self, opt):
    '''
    opt needs to contain:
        - model_file_path
        - n_best
        - max_token_seq_len
    '''
    self.opt = opt
    self.device = torch.device('cuda' if use_cuda else 'cpu')

    print("Max article len", config.max_article_len)
    model = Model(config.vocab_size, config.vocab_size, config.max_article_len)

    checkpoint = torch.load(opt["model_file_path"],
                            map_location=lambda storage, location: storage)
    # model saved as:
    # state = {
    #     'iter': iter,
    #     'transformer_state_dict': self.model.state_dict(),
    #     'optimizer': self.optimizer.state_dict(),
    #     'current_loss': running_avg_loss
    # }
    model.load_state_dict(checkpoint['transformer_state_dict'])
    print('[Info] Trained model state loaded.')
    # model.word_prob_prj = nn.LogSoftmax(dim=1)

    self.model = model.to(self.device)
    self.model.eval()

    self._decode_dir = os.path.join(
        config.log_root,
        'decode_%s' % (opt["model_file_path"].split("/")[-1]))
    self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        if not os.path.exists(p):
            os.mkdir(p)

    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(15)
    print('[Info] Summarizer object created.')

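# Usage sketch: `opt` must carry the keys named in the docstring above.
# The class name `Summarizer` and the checkpoint path are hypothetical.
# opt = {
#     'model_file_path': 'log/train_1234/model/iter_50000.ckpt',
#     'n_best': 1,
#     'max_token_seq_len': config.max_article_len,
# }
# summarizer = Summarizer(opt)
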
def __init__(self, model_file_path):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(15)

    model_name = os.path.basename(model_file_path)
    eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)
    self.summary_writer = tf.summary.FileWriter(eval_dir)

def __init__(self, opt, vocab, logger, writer, train_num):
    self.vocab = vocab
    self.train_batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                                 batch_size=config.batch_size, single_pass=False)
    self.test_batcher = Batcher(config.test_data_path, self.vocab, mode='eval',
                                batch_size=config.batch_size, single_pass=True)
    self.opt = opt
    self.start_id = self.vocab.word2id(data.START_DECODING)
    self.end_id = self.vocab.word2id(data.STOP_DECODING)
    self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
    self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
    self.logger = logger
    self.writer = writer
    self.train_num = train_num
    time.sleep(5)

def __init__(self, model_file_path, destination_dir):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.encode_data_path, self.vocab, mode='encode',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(5)

    self.output = {}
    self.destination_dir = destination_dir
    self.model = Model(model_file_path, is_eval=True)

def __init__(self, model_file_path, model_type="stem", load_batcher=True):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    if load_batcher:
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)
    self.model = Model(model_file_path, is_eval=True)
    self.model_type = model_type

def __init__(self, opt):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    self.opt = opt
    self.start_id = self.vocab.word2id(data.START_DECODING)
    self.end_id = self.vocab.word2id(data.STOP_DECODING)
    self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
    self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
    time.sleep(5)

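# For reference: the special tokens resolved above are, in the common
# pointer-generator data_util/data.py, defined as below; the exact strings
# are an assumption about this particular fork.
# PAD_TOKEN = '[PAD]'
# UNKNOWN_TOKEN = '[UNK]'
# START_DECODING = '[START]'
# STOP_DECODING = '[STOP]'
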
def load_batches_decode():
    vocab = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.decode_data_path, vocab, mode='decode',
                      batch_size=config.beam_size, single_pass=True)
    batches = [None for _ in range(TEST_DATA_SIZE)]
    for i in range(TEST_DATA_SIZE):
        batch = batcher.next_batch()
        batches[i] = batch
    with open("lib/data/batches_test.vocab{}.beam{}.pk.bin".format(
            vocab.size(), config.beam_size), "wb") as f:
        pickle.dump(batches, f)

def __init__(self, model_file_path):
    self._decode_dir = os.path.join(config.log_root, 'decode_%d' % (int(time.time())))
    self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        if not os.path.exists(p):
            os.mkdir(p)

    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                           batch_size=config.beam_size, single_pass=True)
    time.sleep(15)

    self.model = Model(model_file_path, is_eval=True)

def __init__(self, args, model_name=None):
    self.args = args
    vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
    print(args.vocab_path)
    print(vocab)
    self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
    self.batcher = Batcher(args.train_data_path, self.vocab, mode='train',
                           batch_size=args.batch_size, single_pass=False, args=args)
    self.eval_batcher = Batcher(args.eval_data_path, self.vocab, mode='eval',
                                batch_size=args.batch_size, single_pass=True, args=args)
    time.sleep(15)

    if model_name is None:
        self.train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    else:
        self.train_dir = os.path.join(config.log_root, model_name)
    if not os.path.exists(self.train_dir):
        os.mkdir(self.train_dir)
    self.model_dir = os.path.join(self.train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    # train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    train_dir = './train_log'
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

def load_batches_train():
    vocab = Vocab(config.vocab_path, config.vocab_size)
    # Pre-batch the training split; TRAIN_DATA_SIZE matches the CNN/DailyMail
    # training set.
    batcher = Batcher(config.train_data_path, vocab, mode='train',
                      batch_size=config.batch_size, single_pass=False)
    TRAIN_DATA_SIZE = 287226
    num_batches = int(TRAIN_DATA_SIZE / config.batch_size)
    batches = [None for _ in range(num_batches)]
    for i in tqdm(range(num_batches)):
        batch = batcher.next_batch()
        batches[i] = batch
    with open("lib/data/batches_train.vocab{}.batch{}.pk.bin".format(
            vocab.size(), config.batch_size), "wb") as f:
        pickle.dump(batches, f)

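# Counterpart sketch for reading a pickled batch file back in; the helper
# name is hypothetical, and the path must match the dump format above.
def load_pickled_batches(path):
    with open(path, "rb") as f:
        return pickle.load(f)

# e.g.:
# batches = load_pickled_batches(
#     "lib/data/batches_train.vocab{}.batch{}.pk.bin".format(vocab.size(), config.batch_size))
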
def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    os.makedirs(train_dir, exist_ok=True)
    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
    self.summary_writer = tf.summary.FileWriter(train_dir)

def __init__(self, args):
    self.hparams = hp()
    self.model = Model(self.hparams)
    self.vocab = Vocab(config.vocab_path, self.hparams.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=self.hparams.batch_size, single_pass=False)
    self.args = args
    self.start_id = self.vocab.word2id(data.START_DECODING)
    self.end_id = self.vocab.word2id(data.STOP_DECODING)
    self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
    self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
    time.sleep(3)

def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    # print("MODE MUST BE train")
    # time.sleep(15)
    self.print_interval = config.print_interval

    train_dir = config.train_dir
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    self.model_dir = train_dir
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)

def __init__(self, use_elmo=False, finetune_glove=False):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    self.use_elmo = use_elmo
    self.finetune_glove = finetune_glove
    time.sleep(15)

    # Assumption: create the run directory the way the sibling trainers do
    # (train_dir is otherwise undefined in this scope).
    train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
    self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

def __init__(self, model_file_or_model, vocab=None):
    if vocab is None:
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
    else:
        assert isinstance(vocab, Vocab)
        self.vocab = vocab
    self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                           batch_size=config.batch_size, single_pass=True)
    time.sleep(15)

    if isinstance(model_file_or_model, str):
        self.model = Model(device, model_file_or_model, is_eval=True)
    elif isinstance(model_file_or_model, Model):
        self.model = model_file_or_model
    else:
        raise ValueError("Cannot build model from type %s" % type(model_file_or_model))

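# Usage sketch: the constructor accepts either a checkpoint path or an
# already-built Model. The class name `Evaluator` and the path are hypothetical.
# evaluator = Evaluator('log/train_1234/model/model_50000')
# evaluator = Evaluator(trained_model, vocab=shared_vocab)
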
def __init__(self, args, model_file_path, save_path):
    self.args = args
    model_name = os.path.basename(model_file_path)
    self._decode_dir = os.path.join(config.log_root, save_path, 'decode_%s' % (model_name))
    self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
    self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
    for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        if not os.path.exists(p):
            os.mkdir(p)

    vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
    self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
    self.batcher = Batcher(args.decode_data_path, self.vocab, mode='decode',
                           batch_size=args.beam_size, single_pass=True, args=args)
    time.sleep(15)

    self.model = Model(self.vocab, model_file_path, is_eval=True)

def __init__(self, model_file_path=None):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    if not model_file_path:
        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
    else:
        # Resume into the run directory the checkpoint came from.
        train_dir = re.sub('/model/model.*', '', model_file_path)

    self.model_dir = os.path.join(train_dir, 'model')
    if not os.path.exists(self.model_dir):
        os.mkdir(self.model_dir)
    self.summary_writer = tf.summary.create_file_writer(train_dir)

def __init__(self):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                           batch_size=config.batch_size, single_pass=False)
    time.sleep(15)

    train_dir = os.path.join(config.ouput_root, 'train_%d' % (int(time.time())))
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    self.checkpoint_dir = os.path.join(train_dir, 'checkpoints')
    if not os.path.exists(self.checkpoint_dir):
        os.makedirs(self.checkpoint_dir)

    self.train_summary_writer = tf.summary.create_file_writer(
        os.path.join(train_dir, 'log', 'train'))
    self.eval_summary_writer = tf.summary.create_file_writer(
        os.path.join(train_dir, 'log', 'eval'))

def __init__(self): """ Input: vocab_path = "xxx/finished_files/vocab", vocab_size = 50000 Output: class object: self.vocab --> (dicts `_word_to_id` and `_id_to_word`) """ self.vocab = Vocab(config.vocab_path, config.vocab_size) """ Input: train_data_path = "xxx/finished_files/chunked/train_*", self.vocab: class object, mode = 'train', for training, batch_size = 8, single_pass = False Output: class object: self.vocab, (dicts `_word_to_id` and `_id_to_word`) """ self.batcher = Batcher(config.train_data_path, self.vocab, mode='train', batch_size=config.batch_size, single_pass=False) time.sleep(15) train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time()))) if not os.path.exists(train_dir): os.mkdir(train_dir) self.model_dir = os.path.join(train_dir, 'model') if not os.path.exists(self.model_dir): os.mkdir(self.model_dir) self.summary_writer = tf.summary.FileWriter(train_dir)
def evaluate(self, timestep):
    # Rebuild the eval batcher on each call so the single pass restarts
    # from the beginning of the eval set.
    self.eval_batcher = Batcher(args.eval_data_path, self.vocab, mode='train',
                                batch_size=args.batch_size, single_pass=True)
    time.sleep(15)
    t1 = time.time()
    batch = self.eval_batcher.next_batch()
    running_avg_loss = 0
    while batch is not None:
        loss = self.model(batch)
        loss = loss / args.max_dec_steps
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss)
        batch = self.eval_batcher.next_batch()

    # Save the evaluation score
    time_spent = time.time() - t1
    print("Evaluation Loss: {}, Time: {}s".format(running_avg_loss, time_spent))
    save_running_avg_loss(running_avg_loss, timestep, self.eval_summary_writer)
    sys.stdout.flush()

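# Sketch of the `calc_running_avg_loss` helper used above, modeled on the
# exponential moving average found in pointer-generator repos; the decay
# value is an assumption.
def calc_running_avg_loss(loss, running_avg_loss, decay=0.99):
    if running_avg_loss == 0:  # first batch: seed with the raw loss
        return loss
    return running_avg_loss * decay + (1 - decay) * loss
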
def __init__(self):
    self.vocab = Vocab(args.vocab_path, args.vocab_size)
    sys.stdout.flush()
    self.batcher = Batcher(args.train_data_path, self.vocab, mode='train',
                           batch_size=args.batch_size, single_pass=False)
    time.sleep(15)

    vocab_size = self.vocab.size()
    self.model = BertLSTMModel(args.hidden_size, self.vocab.size(), args.max_dec_steps)
    # self.model = Seq2SeqLSTM(args.hidden_size, self.vocab.size(), args.max_dec_steps)
    if use_cuda:
        self.model = self.model.cuda()
    self.model_optimizer = torch.optim.Adam(self.model.parameters(), lr=args.lr)

    train_logs = os.path.join(args.logs, "train_logs")
    eval_logs = os.path.join(args.logs, "eval_logs")
    self.train_summary_writer = tf.summary.FileWriter(train_logs)
    self.eval_summary_writer = tf.summary.FileWriter(eval_logs)

def __init__(self):
    self.vocab = Vocab(args.vocab_path, args.vocab_size)
    self.batcher = Batcher(args.decode_data_path, self.vocab, mode='decode',
                           batch_size=1,  # supports only one item at a time
                           single_pass=True)
    time.sleep(15)

    vocab_size = self.vocab.size()
    self.beam_size = args.beam_size
    self.bertClient = BertClient()
    self.decoder = DecoderLSTM(args.hidden_size, self.vocab.size())
    if use_cuda:
        self.decoder = self.decoder.cuda()

    # Prepare the output folder and files
    output_dir = os.path.join(args.logs, "outputs")
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    output_file = os.path.join(output_dir, "decoder_{}.txt".format(args.output_name))
    self.file = open(output_file, "w+")

from data_util.data import Vocab
# `config` and `Batcher` are used below; assuming the usual
# pointer_summarizer layout for these imports.
from data_util import config
from data_util.batcher import Batcher

import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
# import nltk
# from nltk.corpus import stopwords

vocab = Vocab(config.vocab_path, config.vocab_size)
batcher = Batcher(config.train_data_path, vocab, mode='train',
                  batch_size=config.batch_size, single_pass=False)
batches = 1


def google_encoder_metric(abstract_sents, article_sents):
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
    rotation = 90
    flatten = lambda l: [item for article_sents in l for item in article_sents]
    article_sentences = flatten(article_sents)
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        # article_sentences = article_sentences[:3]
        # abstract_sents = abstract_sents[:2]

def __init__(self, data_path, opt, batch_size=1):
    self.vocab = Vocab(config.vocab_path, config.vocab_size)
    self.batcher = Batcher(data_path, self.vocab, mode='eval',
                           batch_size=batch_size, single_pass=True)
    self.opt = opt

def test(self):
    # time.sleep(5)
    batcher = Batcher(TEST_DATA_PATH, self.vocab, mode='test',
                      batch_size=BATCH_SIZE, single_pass=True)
    batch = batcher.next_batch()
    decoded_sents = []
    ref_sents = []
    article_sents = []
    rouge = Rouge()
    count = 0
    while batch is not None:
        enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, \
            extra_zeros, ct_e = self.getEncData(batch)
        # No gradients are needed at test time.
        with torch.no_grad():
            enc_batch = self.model.embeds(enc_batch)
            enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
            pred_ids = self.beamSearch(enc_hidden, enc_out, enc_padding_mask,
                                       ct_e, extra_zeros, enc_batch_extend_vocab)
        for i in range(len(pred_ids)):
            # Map output ids (including copied OOVs) back to words.
            decoded_words = data.outputids2words(pred_ids[i], self.vocab, batch.art_oovs[i])
            if len(decoded_words) < 2:
                decoded_words = "xxx"  # placeholder for degenerate outputs
            else:
                decoded_words = " ".join(decoded_words)
            decoded_sents.append(decoded_words)
            ref_sents.append(batch.original_abstracts[i])
            article_sents.append(batch.original_articles[i])
        count += 1
        if count == 10:  # evaluate on the first 10 batches only
            break
        batch = batcher.next_batch()

    # Aggregate results: decoded_sents/ref_sents accumulate across batches,
    # so a single avg=True call gives the mean ROUGE over all decoded examples.
    scores = rouge.get_scores(decoded_sents, ref_sents, avg=True)
    for key in KEYS:
        print(key, end=' ')
        for k in scores[key]:
            print(k, scores[key][k], end=' ')
        print('\n')