    def __init__(self):
        self.vocab = Vocab(args.vocab_path, args.vocab_size)
        self.batcher = Batcher(
            args.decode_data_path,
            self.vocab,
            mode='decode',
            batch_size=1,
            single_pass=True)  # support only 1 item at a time
        time.sleep(15)  # give the Batcher's background threads time to fill the batch queue
        vocab_size = self.vocab.size()
        self.beam_size = args.beam_size
        # self.bertClient = BertClient()
        self.encoder = EncoderLSTM(args.hidden_size, vocab_size)
        self.decoder = DecoderLSTM(args.hidden_size, vocab_size)
        if use_cuda:
            self.encoder = self.encoder.cuda()
            self.decoder = self.decoder.cuda()

        # Prepare the output folder and files
        output_dir = os.path.join(args.logs, "outputs")
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        output_file = os.path.join(output_dir,
                                   "decoder_{}.txt".format(args.output_name))
        self.file = open(output_file, "w+")
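For context, a minimal sketch of how a decoder initialized this way is typically driven. Only the Batcher/Vocab calls are taken from the example above; decode_one is a hypothetical stand-in for the beam-search step, and next_batch() returning None once a single-pass batcher is exhausted is an assumption carried over from the reference Batcher.

    def run_decode(self):
        # Hypothetical driver: pull batches until the single-pass batcher
        # is exhausted, decode each one, and write the result out.
        batch = self.batcher.next_batch()
        while batch is not None:
            summary = decode_one(self, batch)  # hypothetical beam-search step
            self.file.write(summary + "\n")
            batch = self.batcher.next_batch()
        self.file.close()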
Example #2
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        #         self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        #         self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        #         for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        #             if not os.path.exists(p):
        #                 os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        #         self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
        #                                batch_size=config.beam_size, single_pass=True)
        decode_data_path = "/Users/rowancassius/Desktop/pointer_summarizer-master/training_ptr_gen/decode_file.txt"
        # decode_data_path = "/Users/rowancassius/Desktop/pointer_summarizer-master/training_ptr_gen/data_file.txt"
        self.batcher = Batcher(data_path=decode_data_path,
                               vocab=self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)

        # time.sleep(15)
        time.sleep(2)

        self.model = Model(model_file_path, is_eval=True)
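Note that the decode-mode batchers in these examples pass batch_size equal to the beam size: beam search works on one article at a time, replicated beam_size times so all beams advance as a single batch. A sketch of that invariant, assuming the reference Batch class's original_articles attribute:

    batch = self.batcher.next_batch()
    # Every row of a decode batch is the same article, copied beam_size times.
    assert len(batch.original_articles) == config.beam_size
    assert len(set(batch.original_articles)) == 1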
Example #3
    def __init__(self, train_dir=None, eval_dir=None, vocab=None, vectors=None):
        self.vectors = vectors
        if vocab is None:
            self.vocab = Vocab(config.vocab_path, config.vocab_size)
        else:
            self.vocab = vocab

        print(self.vocab)
        self.batcher_train = Batcher(config.train_data_path, self.vocab, mode='train',
                                     batch_size=config.batch_size, single_pass=False)
        time.sleep(15)
        self.batcher_eval = Batcher(config.eval_data_path, self.vocab, mode='eval',
                                    batch_size=config.batch_size, single_pass=True)
        time.sleep(15)

        cur_time = int(time.time())
        if train_dir is None:
            train_dir = os.path.join(config.log_root, 'train_%d' % (cur_time))
            if not os.path.exists(train_dir):
                os.mkdir(train_dir)

        if eval_dir is None:
            eval_dir = os.path.join(config.log_root, 'eval_%s' % (cur_time))
            if not os.path.exists(eval_dir):
                os.mkdir(eval_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer_train = writer.FileWriter(train_dir)
        self.summary_writer_eval = writer.FileWriter(eval_dir)
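The summary writers created in Example #3 are typically fed TF1-style Summary protos. A minimal sketch of logging a scalar loss; the tag name is an assumption, and the same pattern applies to the tf.summary.FileWriter instances in Examples #11 and #13:

    import tensorflow as tf

    def log_scalar(summary_writer, tag, value, step):
        # Build a one-value Summary proto and flush it to the event file.
        summary = tf.compat.v1.Summary()
        summary.value.add(tag=tag, simple_value=value)
        summary_writer.add_summary(summary, step)
        summary_writer.flush()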
Example #4
    def __init__(self, is_word_level=False, is_combined=False, alpha=0.3):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
        #                        batch_size=config.batch_size, single_pass=False)
        self.dataset = DailyMailDataset("train", self.vocab)
        #time.sleep(15)

        self.is_word_level = is_word_level
        self.is_combined = is_combined
        self.alpha = alpha

        if is_word_level:
            print("Using Word Level Policy Gradient")
        elif is_combined:
            print("Using Combined Policy Gradient w/ alpha = ", alpha)
        else:
            print("Using Sentence Level Policy Gradient")

        train_dir = './train_dumps'
        if not os.path.exists(train_dir):
            #print('create dict')
            os.mkdir(train_dir)

        self.model_dir = os.path.join(
            train_dir, 'dumps_model_{:%m_%d_%H_%M}'.format(datetime.now()))
        if not os.path.exists(self.model_dir):
            #print('create folder')
            os.mkdir(self.model_dir)
Example #5
    def __init__(self, args, model_name=None):
        self.args = args
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
        self.batcher = Batcher(args.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=args.batch_size,
                               single_pass=False,
                               args=args)
        self.eval_batcher = Batcher(args.eval_data_path,
                                    self.vocab,
                                    mode='eval',
                                    batch_size=args.batch_size,
                                    single_pass=True,
                                    args=args)
        time.sleep(15)

        if model_name is None:
            self.train_dir = os.path.join(config.log_root,
                                          'train_%d' % (int(time.time())))
        else:
            self.train_dir = os.path.join(config.log_root, model_name)

        if not os.path.exists(self.train_dir):
            os.mkdir(self.train_dir)

        self.model_dir = os.path.join(self.train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
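The exists-then-mkdir pattern above (and in most examples below) is racy and fails when an intermediate directory such as config.log_root is missing; Examples #20 and #22 sidestep both issues with pathlib. A drop-in equivalent for the lines above:

    from pathlib import Path

    # Race-free replacement for the exists/mkdir pairs; also creates
    # missing parent directories.
    Path(self.model_dir).mkdir(parents=True, exist_ok=True)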
Example #6
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)
        self.val_batcher = Batcher(config.eval_data_path,
                                   self.vocab,
                                   mode='eval',
                                   batch_size=config.batch_size,
                                   single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)
Example #7
    def __init__(self, data_path, opt, batch_size=config.batch_size):

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='eval', batch_size=batch_size,
                               single_pass=True)

        self.opt = opt
        time.sleep(5)
Example #8
    def __init__(self):
        self.vocab = Vocab(VOCAB_PATH, VOCAB_SIZE)
        self.batcher = Batcher(TRAIN_DATA_PATH, self.vocab, mode='train',
                               batch_size=BATCH_SIZE, single_pass=False)
        self.start_id = self.vocab.word2id(data.START_DECODING)
        self.end_id = self.vocab.word2id(data.STOP_DECODING)
        self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        self.model = MyModel().to(DEVICE)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR)
Example #9
    def __init__(self, opt):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        self.opt = opt
        self.start_id = self.vocab.word2id(data.START_DECODING)
        self.end_id = self.vocab.word2id(data.STOP_DECODING)
        self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        time.sleep(5)
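The four special ids fetched above are the standard ingredients for building decoder sequences. A rough sketch under the convention used in pointer-generator data pipelines; abstract_ids and max_dec_steps are hypothetical stand-ins, and padding/OOV handling is elided:

    # Decoder input starts with START; the target ends with STOP. Sequences
    # are later padded with pad_id, and out-of-vocab words map to unk_id.
    dec_input = [self.start_id] + abstract_ids[:max_dec_steps - 1]
    target = abstract_ids[:max_dec_steps - 1] + [self.end_id]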
Example #10
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.train_batcher = Batcher(config.train_data_path,
                                     self.vocab,
                                     hps=config.hps,
                                     single_pass=False)
        self.val_batcher = Batcher(config.eval_data_path,
                                   self.vocab,
                                   hps=config.hps,
                                   single_pass=False)
Example #11
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, 'eval',
                               config.batch_size, single_pass=True)
        time.sleep(5)
        eval_dir = os.path.join(config.log_root, 'eval_%d' % (int(time.time())))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)
Example #12
    def __init__(self, opt):
        '''
        opt needs to contain:
            - model_file_path
            - n_best
            - max_token_seq_len
        '''
        self.opt = opt
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        print("Max article len", config.max_article_len)
        model = Model(config.vocab_size, config.vocab_size,
                      config.max_article_len)

        checkpoint = torch.load(opt["model_file_path"],
                                map_location=lambda storage, location: storage)

        # model saved as:
        # state = {
        #     'iter': iter,
        #     'transformer_state_dict': self.model.state_dict(),
        #     'optimizer': self.optimizer.state_dict(),
        #     'current_loss': running_avg_loss
        # }

        model.load_state_dict(checkpoint['transformer_state_dict'])

        print('[Info] Trained model state loaded.')

        #model.word_prob_prj = nn.LogSoftmax(dim=1)

        self.model = model.to(self.device)

        self.model.eval()

        self._decode_dir = os.path.join(
            config.log_root,
            'decode_%s' % (opt["model_file_path"].split("/")[-1]))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.batch_size,
                               single_pass=True)

        time.sleep(15)

        print('[Info] Summarizer object created.')
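The commented block in Example #12 documents the checkpoint layout the loader expects. A matching save-side sketch; the key names are taken verbatim from that comment, everything else is an assumption:

    import torch

    def save_checkpoint(model, optimizer, iteration, running_avg_loss, path):
        # Keys mirror the 'model saved as' comment in Example #12.
        state = {
            'iter': iteration,
            'transformer_state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'current_loss': running_avg_loss,
        }
        torch.save(state, path)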
Example #13
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)
Example #14
    def __init__(self, model_file_path, destination_dir):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.encode_data_path,
                               self.vocab,
                               mode='encode',
                               batch_size=config.batch_size,
                               single_pass=True)
        time.sleep(5)

        self.output = {}
        self.destination_dir = destination_dir
        self.model = Model(model_file_path, is_eval=True)
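Example #14 accumulates encodings in self.output and carries a destination_dir, which implies a persistence step after the encode loop. A hedged sketch; the file name and pickle format are assumptions:

    import os
    import pickle

    def save_encodings(self):
        # Assumption: self.output maps example ids to encoder outputs.
        path = os.path.join(self.destination_dir, 'encodings.pk')
        with open(path, 'wb') as f:
            pickle.dump(self.output, f)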
Example #15
def load_batches_decode():

    vocab = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.decode_data_path, vocab, mode='decode',
                      batch_size=config.beam_size, single_pass=True)

    batches = [batcher.next_batch() for _ in range(TEST_DATA_SIZE)]

    with open("lib/data/batches_test.vocab{}.beam{}.pk.bin".format(vocab.size(), config.beam_size), "wb") as f:
        pickle.dump(batches, f)
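The pickled batch files written above (and consumed through get_batches(config.decode_pk_path) in Example #22) can be read back with the mirror image of the dump; a small sketch:

    import pickle

    def load_batches(path):
        # Inverse of the pickle.dump in load_batches_decode; returns the
        # list of Batch objects in their original order.
        with open(path, "rb") as f:
            return pickle.load(f)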
Example #16
    def __init__(self, model_file_path):
        self._decode_dir = os.path.join(config.log_root, 'decode_%d' % (int(time.time())))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)
Example #17
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        #train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        train_dir = './train_log'
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #18
    def __init__(self, args):
        self.hparams = hp()
        self.model = Model(self.hparams)
        self.vocab = Vocab(config.vocab_path, self.hparams.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=self.hparams.batch_size,
                               single_pass=False)
        self.args = args
        self.start_id = self.vocab.word2id(data.START_DECODING)
        self.end_id = self.vocab.word2id(data.STOP_DECODING)
        self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        time.sleep(3)
Example #19
def load_batches_train():

    vocab = Vocab(config.vocab_path, config.vocab_size)
    # Training batches come from the train split, streamed endlessly
    # (mode='train', single_pass=False).
    batcher = Batcher(config.train_data_path, vocab, mode='train',
                      batch_size=config.batch_size, single_pass=False)

    TRAIN_DATA_SIZE = 287226  # examples in the CNN/DailyMail train split
    num_batches = TRAIN_DATA_SIZE // config.batch_size
    batches = [batcher.next_batch() for _ in tqdm(range(num_batches))]

    with open("lib/data/batches_train.vocab{}.batch{}.pk.bin".format(vocab.size(), config.batch_size), "wb") as f:
        pickle.dump(batches, f)
Example #20
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.decode_dir, model_name)
        self._decode_dir = os.path.splitext(self._decode_dir)[0]
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')

        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            Path(p).mkdir(parents=True, exist_ok=True)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.pad_id = self.vocab.word2id(PAD_TOKEN)
        self.start_id = self.vocab.word2id(START_DECODING)
        self.stop_id = self.vocab.word2id(STOP_DECODING)

        self.model = Model(model_file_path, is_eval=True)
Example #21
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)

        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        # print("MODE MUST BE train")
        # time.sleep(15)
        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = train_dir
Example #22
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.decode_dir, model_name)
        self._decode_dir = os.path.splitext(self._decode_dir)[0]
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')

        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            Path(p).mkdir(parents=True, exist_ok=True)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
        #                        batch_size=config.beam_size, single_pass=True)
        # time.sleep(15)
        self.get_batches(config.decode_pk_path)

        self.model = Model(model_file_path, is_eval=True)
Example #23
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.oped_data_path, self.vocab, mode='decode',
        #                        batch_size=config.beam_size, single_pass=True)
        self.batches = self.read_opeds(config.oped_data_path, self.vocab,
                                       config.beam_size)

        self.model = Model(model_file_path, is_eval=True)
Example #24
    def __init__(self, model_file_path, is_word_level, is_combined, alpha):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
        #                        batch_size=config.batch_size, single_pass=True)
        self.dataset = DailyMailDataset("val", self.vocab)
        # time.sleep(15)
        model_name = os.path.basename(model_file_path)

        self.is_word_level = is_word_level
        self.is_combined = is_combined
        self.alpha = alpha

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)

        self.model = Model(model_file_path, is_eval=True)
Example #25
    def __init__(self, use_elmo=False, finetune_glove=False):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        self.use_elmo = use_elmo
        self.finetune_glove = finetune_glove

        time.sleep(15)

        # Assumption: train_dir is not defined anywhere in this snippet, so
        # create a timestamped run directory as the other trainers do.
        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')

        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)
Example #26
    def __init__(self):
        if config.is_hierarchical:
            raise Exception("Hierarchical PGN-AMI not supported!")

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.pad_id = self.vocab.word2id(PAD_TOKEN)
        self.start_id = self.vocab.word2id(START_DECODING)
        self.stop_id = self.vocab.word2id(STOP_DECODING)

        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = train_dir
Example #27
    def __init__(self, model_file_or_model, vocab=None):
        if vocab is None:
            self.vocab = Vocab(config.vocab_path, config.vocab_size)
        else:
            assert isinstance(vocab, Vocab)
            self.vocab = vocab
        self.batcher = Batcher(config.eval_data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=config.batch_size,
                               single_pass=True)
        time.sleep(15)
        if isinstance(model_file_or_model, str):
            self.model = Model(device, model_file_or_model, is_eval=True)
        elif isinstance(model_file_or_model, Model):
            self.model = model_file_or_model
        else:
            raise ValueError("Cannot build model from type %s" %
                             type(model_file_or_model))
Example #28
def train_action(opt, logger, writer, train_num):
    try:
        opt.rl_weight = 1 - opt.mle_weight

        if opt.load_model:
            opt.load_model = "/%s/%s" % (opt.word_emb_type, opt.load_model)

        logger.info(u'------Training Setting--------')

        logger.info("Traing Type :%s" % (config.data_type))
        if opt.train_mle:
            logger.info("Training mle: %s, mle weight: %.2f" %
                        (opt.train_mle, opt.mle_weight))

        if opt.train_rl:
            logger.info("Training rl: %s, rl weight: %.2f \n" %
                        (opt.train_rl, opt.rl_weight))

        if opt.word_emb_type == 'bert':
            config.emb_dim = 768  # BERT-base hidden size
        if opt.pre_train_emb:
            logger.info('use pre_train_%s vocab_size %s \n' %
                        (opt.word_emb_type, config.vocab_size))

        else:
            logger.info('use %s vocab_size %s \n' %
                        (opt.word_emb_type, config.vocab_size))

        logger.info("intra_encoder: %s intra_decoder: %s \n" %
                    (config.intra_encoder, config.intra_decoder))
        if opt.word_emb_type in ['word2Vec', 'glove', 'FastText']:
            config.vocab_path = config.Data_path + "Embedding/%s/word.vocab" % (
                opt.word_emb_type)
            # config.vocab_size = len(open(config.vocab_path).readlines())
        # Build the vocab unconditionally so it is defined for every
        # embedding type (e.g. 'bert'), not just the pre-trained ones.
        vocab = Vocab(config.vocab_path, config.vocab_size)
        train_processor = Train(opt, vocab, logger, writer, train_num)
        train_processor.trainIters()
    except KeyError as e:
        print(e)
        tb = sys.exc_info()[2]
        logger.error(sys.exc_info())
        logger.error(tb.tb_lineno)
        logger.error(e)
    logger.info(u'------Training END--------')
Example #29
    def __init__(self, args, model_file_path, save_path):
        self.args = args
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, save_path,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
        self.batcher = Batcher(args.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=args.beam_size,
                               single_pass=True,
                               args=args)
        time.sleep(15)

        self.model = Model(self.vocab, model_file_path, is_eval=True)
Example #30
    def __init__(self, model_file_path=None):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        if not model_file_path:
            train_dir = os.path.join(config.log_root,
                                     'train_%d' % (int(time.time())))
            if not os.path.exists(train_dir):
                os.mkdir(train_dir)
        else:
            train_dir = re.sub('/model/model.*', '', model_file_path)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.create_file_writer(train_dir)
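Example #30 is the only trainer here on the TF2 writer API (tf.summary.create_file_writer), so scalars are logged inside the writer's context rather than via Summary protos. A minimal sketch; the tag and step names are assumptions:

    import tensorflow as tf

    def log_scalar_tf2(summary_writer, tag, value, step):
        # TF2 counterpart of the FileWriter.add_summary pattern used above.
        with summary_writer.as_default():
            tf.summary.scalar(tag, value, step=step)
        summary_writer.flush()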