Example #1
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, 'eval',
                               config.batch_size, single_pass=True)
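        # give the Batcher's background threads a moment to fill the batch
        # queue before the first next_batch() call (our reading of the
        # recurring sleep-after-Batcher pattern in these examples)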
        time.sleep(5)
        eval_dir = os.path.join(config.log_root, 'eval_%d' % (int(time.time())))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)
Example #2
    def __init__(self, opt):
        '''
        opt needs to contain:
            - model_file_path
            - n_best
            - max_token_seq_len
        '''
        self.opt = opt
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        print("Max article len", config.max_article_len)
        model = Model(config.vocab_size, config.vocab_size,
                      config.max_article_len)

        checkpoint = torch.load(opt["model_file_path"],
                                map_location=lambda storage, location: storage)

        # model saved as:
        # state = {
        #     'iter': iter,
        #     'transformer_state_dict': self.model.state_dict(),
        #     'optimizer': self.optimizer.state_dict(),
        #     'current_loss': running_avg_loss
        # }
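        # i.e. the training side presumably did: torch.save(state, model_file_path)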

        model.load_state_dict(checkpoint['transformer_state_dict'])

        print('[Info] Trained model state loaded.')

        #model.word_prob_prj = nn.LogSoftmax(dim=1)

        self.model = model.to(self.device)

        self.model.eval()

        self._decode_dir = os.path.join(
            config.log_root,
            'decode_%s' % (opt["model_file_path"].split("/")[-1]))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.batch_size,
                               single_pass=True)

        time.sleep(15)

        print('[Info] Summarizer object created.')
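
A minimal driver for the constructor above, assuming the enclosing class is the Summarizer that the final print suggests, and with a hypothetical checkpoint path:

    opt = {
        "model_file_path": "log/train_1234/model/model_50000",  # hypothetical
        "n_best": 1,
        "max_token_seq_len": config.max_dec_steps,  # assumed to mirror the decoder limit
    }
    summarizer = Summarizer(opt)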
Example #3
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)
Example #4
 def __init__(self, opt, vocab, logger, writer, train_num):
     self.vocab = vocab
     self.train_batcher = Batcher(config.train_data_path,
                                  self.vocab,
                                  mode='train',
                                  batch_size=config.batch_size,
                                  single_pass=False)
     self.test_batcher = Batcher(config.test_data_path,
                                 self.vocab,
                                 mode='eval',
                                 batch_size=config.batch_size,
                                 single_pass=True)
     self.opt = opt
     self.start_id = self.vocab.word2id(data.START_DECODING)
     self.end_id = self.vocab.word2id(data.STOP_DECODING)
     self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
     self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
     self.logger = logger
     self.writer = writer
     self.train_num = train_num
     time.sleep(5)
Example #5
    def __init__(self, model_file_path, destination_dir):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.encode_data_path,
                               self.vocab,
                               mode='encode',
                               batch_size=config.batch_size,
                               single_pass=True)
        time.sleep(5)

        self.output = {}
        self.destination_dir = destination_dir
        self.model = Model(model_file_path, is_eval=True)
Example #6
    def __init__(self, model_file_path, model_type="stem", load_batcher=True):

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        if load_batcher:
            self.batcher = Batcher(config.decode_data_path,
                                   self.vocab,
                                   mode='decode',
                                   batch_size=config.beam_size,
                                   single_pass=True)
            time.sleep(15)
        self.model = Model(model_file_path, is_eval=True)
        self.model_type = model_type
Example #7
 def __init__(self, opt):
     self.vocab = Vocab(config.vocab_path, config.vocab_size)
     self.batcher = Batcher(config.train_data_path,
                            self.vocab,
                            mode='train',
                            batch_size=config.batch_size,
                            single_pass=False)
     self.opt = opt
     self.start_id = self.vocab.word2id(data.START_DECODING)
     self.end_id = self.vocab.word2id(data.STOP_DECODING)
     self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
     self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
     time.sleep(5)
Example #8
def load_batches_decode():

    vocab   = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.decode_data_path, vocab, mode='decode',
                      batch_size=config.beam_size, single_pass=True)

    batches = [None for _ in range(TEST_DATA_SIZE)]
    for i in range(TEST_DATA_SIZE):
        batch = batcher.next_batch()
        batches[i] = batch

    with open("lib/data/batches_test.vocab{}.beam{}.pk.bin".format(vocab.size(), config.beam_size), "wb") as f:
        pickle.dump(batches, f)
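
Reloading the cached batches is the mirror image of the dump above; a sketch, assuming the same vocab and beam size as at dump time:

    def load_cached_decode_batches(vocab):
        path = "lib/data/batches_test.vocab{}.beam{}.pk.bin".format(
            vocab.size(), config.beam_size)
        with open(path, "rb") as f:
            return pickle.load(f)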
Example #9
    def __init__(self, model_file_path):
        self._decode_dir = os.path.join(config.log_root, 'decode_%d' % (int(time.time())))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)
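
The rouge_ref/rouge_dec directory pair written above is typically scored with pyrouge after decoding finishes; a sketch, with the filename patterns assumed to follow the usual pointer-generator convention:

    from pyrouge import Rouge155

    r = Rouge155()
    r.model_filename_pattern = '#ID#_reference.txt'    # assumed naming scheme
    r.system_filename_pattern = r'(\d+)_decoded.txt'   # assumed naming scheme
    r.model_dir = decoder._rouge_ref_dir   # decoder: a hypothetical instance
    r.system_dir = decoder._rouge_dec_dir  # of the class above
    results_dict = r.output_to_dict(r.convert_and_evaluate())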
Example #10
    def __init__(self, args, model_name = None):
        self.args = args
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        print(args.vocab_path)
        print(vocab)
        self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
        self.batcher = Batcher(args.train_data_path, self.vocab, mode='train',
                               batch_size=args.batch_size, single_pass=False, args=args)
        self.eval_batcher = Batcher(args.eval_data_path, self.vocab, mode='eval',
                                    batch_size=args.batch_size, single_pass=True, args=args)
        time.sleep(15)

        if model_name is None:
            self.train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        else:
            self.train_dir = os.path.join(config.log_root, model_name)

        if not os.path.exists(self.train_dir):
            os.mkdir(self.train_dir)

        self.model_dir = os.path.join(self.train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #11
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        #train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        train_dir = './train_log'
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #12
def load_batches_train():

    vocab   = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.train_data_path, vocab, mode='train',
                      batch_size=config.batch_size, single_pass=False)

    TRAIN_DATA_SIZE = 287226
    num_batches = int(TRAIN_DATA_SIZE / config.batch_size)
    batches = [None for _ in range(num_batches)]
    for i in tqdm(range(num_batches)):
        batch = batcher.next_batch()
        batches[i] = batch

    with open("lib/data/batches_train.vocab{}.batch{}.pk.bin".format(vocab.size(), config.batch_size), "wb") as f:
        pickle.dump(batches, f)
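
The consuming side can then iterate the pickled batches instead of a live Batcher; a sketch mirroring the dump path above (the helper name is our own):

    def iter_cached_train_batches(vocab):
        path = "lib/data/batches_train.vocab{}.batch{}.pk.bin".format(
            vocab.size(), config.batch_size)
        with open(path, "rb") as f:
            for batch in pickle.load(f):
                yield batch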
Example #13
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.makedirs(train_dir, exist_ok=True)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)
Example #14
 def __init__(self, args):
     self.hparams = hp()
     self.model = Model(self.hparams)
     self.vocab = Vocab(config.vocab_path, self.hparams.vocab_size)
     self.batcher = Batcher(config.train_data_path,
                            self.vocab,
                            mode='train',
                            batch_size=self.hparams.batch_size,
                            single_pass=False)
     self.args = args
     self.start_id = self.vocab.word2id(data.START_DECODING)
     self.end_id = self.vocab.word2id(data.STOP_DECODING)
     self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
     self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
     time.sleep(3)
Example #15
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)

        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        # print("MODE MUST BE train")
        # time.sleep(15)
        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #16
    def __init__(self, use_elmo=False, finetune_glove=False):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        self.use_elmo = use_elmo
        self.finetune_glove = finetune_glove

        time.sleep(15)

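        # note: train_dir is not defined in this snippet; it is presumably a
        # module-level path in the source file (cf. the train_%d dirs above)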
        self.model_dir = os.path.join(train_dir, 'model')

        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)
Example #17
 def __init__(self, model_file_or_model, vocab=None):
     if vocab is None:
         self.vocab = Vocab(config.vocab_path, config.vocab_size)
     else:
         assert isinstance(vocab, Vocab)
         self.vocab = vocab
     self.batcher = Batcher(config.eval_data_path,
                            self.vocab,
                            mode='eval',
                            batch_size=config.batch_size,
                            single_pass=True)
     time.sleep(15)
     if isinstance(model_file_or_model, str):
         self.model = Model(device, model_file_or_model, is_eval=True)
     elif isinstance(model_file_or_model, Model):
         self.model = model_file_or_model
     else:
         raise ValueError("Cannot build model from type %s" %
                          type(model_file_or_model))
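
A quick illustration of the two accepted inputs (the class name and checkpoint path are hypothetical):

    ev = Evaluator('log/train_1234/model/model_50000')  # build Model from a file
    ev2 = Evaluator(ev.model, vocab=ev.vocab)           # reuse a live Model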
Example #18
    def __init__(self, args, model_file_path, save_path):
        self.args = args
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, save_path,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
        self.batcher = Batcher(args.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=args.beam_size,
                               single_pass=True,
                               args=args)
        time.sleep(15)

        self.model = Model(self.vocab, model_file_path, is_eval=True)
Example #19
    def __init__(self, model_file_path=None):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        if not model_file_path:
            train_dir = os.path.join(config.log_root,
                                     'train_%d' % (int(time.time())))
            if not os.path.exists(train_dir):
                os.mkdir(train_dir)
        else:
            train_dir = re.sub('/model/model.*', '', model_file_path)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.create_file_writer(train_dir)
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.output_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)

        self.checkpoint_dir = os.path.join(train_dir, 'checkpoints')
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.train_summary_writer = tf.summary.create_file_writer(
            os.path.join(train_dir, 'log', 'train'))
        self.eval_summary_writer = tf.summary.create_file_writer(
            os.path.join(train_dir, 'log', 'eval'))
    def __init__(self):
        """
        Input:
            vocab_path = "xxx/finished_files/vocab",
            vocab_size = 50000

        Output:
            class object: self.vocab --> (dicts `_word_to_id` and `_id_to_word`)
        """
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
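        # quick sanity round trip (method names taken from the other examples,
        # e.g. Example #4): pad_id = self.vocab.word2id(data.PAD_TOKEN)
        #                   assert self.vocab.id2word(pad_id) == data.PAD_TOKEN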
        """
        Input:
            train_data_path = "xxx/finished_files/chunked/train_*",
            self.vocab: class object,
            mode = 'train', for training,
            batch_size = 8,
            single_pass = False

        Output:
            class object: self.batcher
                (yields Batch objects via next_batch())
        """
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)
    def evaluate(self, timestep):
        self.eval_batcher = Batcher(args.eval_data_path,
                                    self.vocab,
                                    mode='train',
                                    batch_size=args.batch_size,
                                    single_pass=True)
        time.sleep(15)
        t1 = time.time()
        batch = self.eval_batcher.next_batch()
        running_avg_loss = 0
        while batch is not None:
            loss = self.model(batch)
            loss = loss / args.max_dec_steps
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss)
            batch = self.eval_batcher.next_batch()

        # Save the evaluation score
        time_spent = time.time() - t1
        print("Evaluation Loss: {}, Time: {}s".format(running_avg_loss,
                                                      time_spent))
        save_running_avg_loss(running_avg_loss, timestep,
                              self.eval_summary_writer)
        sys.stdout.flush()
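
calc_running_avg_loss is not defined in this snippet; in the pointer-generator codebases it is an exponential moving average, roughly as follows (a sketch, with the summary-writer arguments dropped to match the two-argument call above):

    def calc_running_avg_loss(loss, running_avg_loss, decay=0.99):
        if running_avg_loss == 0:   # first value seeds the average
            return loss
        avg = decay * running_avg_loss + (1 - decay) * loss
        return min(avg, 12)         # clip so early spikes don't dominate plots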
    def __init__(self):
        self.vocab = Vocab(args.vocab_path, args.vocab_size)
        sys.stdout.flush()
        self.batcher = Batcher(args.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=args.batch_size,
                               single_pass=False)
        time.sleep(15)
        vocab_size = self.vocab.size()
        self.model = BertLSTMModel(args.hidden_size, self.vocab.size(),
                                   args.max_dec_steps)
        # self.model = Seq2SeqLSTM(args.hidden_size, self.vocab.size(), args.max_dec_steps)
        if use_cuda:
            self.model = self.model.cuda()

        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=args.lr)

        train_logs = os.path.join(args.logs, "train_logs")
        eval_logs = os.path.join(args.logs, "eval_logs")
        self.train_summary_writer = tf.summary.FileWriter(train_logs)
        self.eval_summary_writer = tf.summary.FileWriter(eval_logs)
Example #24
    def __init__(self):
        self.vocab = Vocab(args.vocab_path, args.vocab_size)
        self.batcher = Batcher(
            args.decode_data_path,
            self.vocab,
            mode='decode',
            batch_size=1,
            single_pass=True)  # support only 1 item at a time
        time.sleep(15)
        vocab_size = self.vocab.size()
        self.beam_size = args.beam_size
        self.bertClient = BertClient()
        self.decoder = DecoderLSTM(args.hidden_size, self.vocab.size())
        if use_cuda:
            self.decoder = self.decoder.cuda()

        # Prepare the output folder and files
        output_dir = os.path.join(args.logs, "outputs")
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        output_file = os.path.join(output_dir,
                                   "decoder_{}.txt".format(args.output_name))
        self.file = open(output_file, "w+")
Example #25
from data_util.data import Vocab
# the two imports below are assumed module paths mirroring the Vocab import
# above (config and Batcher are used but never imported in the original)
from data_util.batcher import Batcher
from data_util import config
import json
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
#import nltk
#from nltk.corpus import stopwords

vocab = Vocab(config.vocab_path, config.vocab_size)
batcher = Batcher(config.train_data_path, vocab, mode='train', batch_size=config.batch_size, single_pass=False)
batches = 1

def google_encoder_metric(abstract_sents, article_sents):
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
    rotation = 90

    flatten = lambda l: [item for sublist in l for item in sublist]
    article_sentences = flatten(article_sents)

    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])

        #article_sentences = article_sentences[:3]
        #abstract_sents = abstract_sents[:2]
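        # the example is truncated above; a sketch of the usual next step under
        # the same TF1/hub setup: embed both sentence lists and compare them by
        # cosine similarity (variable names below are our own)
        abstract_emb, article_emb = sess.run(
            [embed(abstract_sents), embed(article_sentences)])
        sim = np.inner(abstract_emb, article_emb) / (
            np.linalg.norm(abstract_emb, axis=1, keepdims=True) *
            np.linalg.norm(article_emb, axis=1))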
Example #26
 def __init__(self, data_path, opt, batch_size = 1):
     self.vocab = Vocab(config.vocab_path, config.vocab_size)
     self.batcher = Batcher(data_path, self.vocab, mode='eval',
                            batch_size=batch_size, single_pass=True)
     self.opt = opt
Example #27
    def test(self):
        # time.sleep(5)

        batcher = Batcher(TEST_DATA_PATH, self.vocab, mode='test',
                          batch_size=BATCH_SIZE, single_pass=True)
        batch = batcher.next_batch()
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        count = 0
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = self.getEncData(batch)
            with torch.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

            with torch.autograd.no_grad():
                pred_ids = self.beamSearch(enc_hidden, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab)
                # print(len(pred_ids[0]))
            for i in range(len(pred_ids)):
                # print('t',pred_ids[i])
                decoded_words = data.outputids2words(pred_ids[i], self.vocab, batch.art_oovs[i])
                # print(decoded_words)
                if len(decoded_words) < 2:
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                abstract = batch.original_abstracts[i]
                article = batch.original_articles[i]
                ref_sents.append(abstract)
                article_sents.append(article)
            # print(decoded_sents)
            batch = batcher.next_batch()
            count += 1
            if count == 10:
                break

        # decoded_sents and ref_sents accumulate across batches, so a single
        # avg=True call after the loop yields the mean ROUGE over everything
        # decoded (aggregate the scores)
        scores = rouge.get_scores(decoded_sents, ref_sents, avg=True)
        for key in KEYS:
            print(key, end=' ')
            for metric, value in scores[key].items():
                print(metric, value, end=' ')
            print('\n')