Example 1
    def eval_predict(self, src_filename, dest_filename, args,
                     model=None, remove_existed=False):
        # reuse the training args, but decode with the eval batch size
        args_dict = vars(args)
        args_dict['batch_size'] = args_dict['eval_batch_size']
        args = Munch(args_dict)
        loader = KeyphraseDataLoader(data_source=src_filename,
                                     vocab2id=self.vocab2id,
                                     mode='inference',
                                     args=args)
        if os.path.exists(dest_filename):
            print('destination file {} already exists'.format(dest_filename))
            if remove_existed:
                os.remove(dest_filename)
        # when a model is passed in, rebuild the beam searcher around it
        if model is not None:
            model.eval()
            self.beam_searcher = TransformerBeamSearch(model=model,
                                                       beam_size=self.beam_size,
                                                       max_target_len=self.max_target_len,
                                                       id2vocab=self.id2vocab,
                                                       bos_idx=self.vocab2id[BOS_WORD],
                                                       args=self.config)

        for batch in loader:
            with torch.no_grad():
                batch_result = self.beam_searcher.beam_search(batch, delimiter=None)
                final_result = []
                assert len(batch_result) == len(batch[RAW_BATCH])
                for item_input, item_output in zip(batch[RAW_BATCH], batch_result):
                    item_input['pred_keyphrases'] = item_output
                    final_result.append(item_input)
                append_jsonlines(dest_filename, final_result)
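For context, a hedged usage sketch: `trainer` stands in for whatever object defines eval_predict, and the file paths and every args field other than eval_batch_size (the one field the method itself reads) are illustrative assumptions, not names confirmed by the snippet.

    from munch import Munch

    # Hypothetical caller: `trainer`, the paths and the extra args fields are
    # placeholders, not part of the snippet above.
    args = Munch(eval_batch_size=32, max_src_len=1500, max_target_len=8)
    trainer.eval_predict(src_filename='data/kp20k.valid.jsonl',
                         dest_filename='result/kp20k.valid.pred.jsonl',
                         args=args,
                         model=trainer.model,
                         remove_existed=True)  # drop stale output before appending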
Example 2
    def predict(self, text_list, batch_size=10, delimiter=None, tokenized=False):
        """

        :param text_list:
        :param batch_size:
        :param delimiter:
        :param tokenized:
        :return:
        """
        # eval mode disables dropout and puts decoding on the auto-regressive inference path
        self.model.eval()
        if len(text_list) < batch_size:
            batch_size = len(text_list)

        if tokenized:
            text_list = [{TOKENS: i} for i in text_list]
        else:
            text_list = [{TOKENS: token_char_tokenize(i)} for i in text_list]
        args = Munch({'batch_size': batch_size, **self.config._asdict(), **self.pred_base_config})
        loader = KeyphraseDataLoader(data_source=text_list,
                                     vocab2id=self.vocab2id,
                                     mode=INFERENCE_MODE,
                                     args=args)
        result = []
        for batch in loader:
            with torch.no_grad():
                result.extend(self.beam_searcher.beam_search(batch, delimiter=delimiter))
        return result
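A minimal sketch of calling this method, assuming `extractor` is an instance of the class that defines it; the input strings are made up:

    # Hypothetical caller: `extractor` and the texts are placeholders.
    texts = ['deep keyphrase generation with a copy mechanism',
             'beam search decoding for sequence to sequence models']
    keyphrases = extractor.predict(texts, batch_size=2)
    for text, phrases in zip(texts, keyphrases):
        print(text, '->', phrases)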
Example 3
    def __init__(self, args, model):
        # fixed seed for reproducibility; anomaly detection surfaces NaN/Inf in backward
        torch.manual_seed(0)
        torch.autograd.set_detect_anomaly(True)
        self.args = args
        self.vocab2id = load_vocab(self.args.vocab_path, self.args.vocab_size)

        self.model = model
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        if args.train_parallel:
            self.model = nn.DataParallel(self.model)
        self.loss_func = nn.NLLLoss(ignore_index=self.vocab2id[PAD_WORD])
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.args.learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   self.args.schedule_step,
                                                   self.args.schedule_gamma)
        self.logger = get_logger('train')
        self.train_loader = KeyphraseDataLoader(
            data_source=self.args.train_filename,
            vocab2id=self.vocab2id,
            mode='train',
            args=args)
        if self.args.train_from:
            self.dest_dir = os.path.dirname(self.args.train_from) + '/'
        else:
            timemark = time.strftime('%Y%m%d-%H%M%S',
                                     time.localtime(time.time()))
            self.dest_dir = os.path.join(
                self.args.dest_base_dir,
                self.args.exp_name + '-' + timemark) + '/'
            os.mkdir(self.dest_dir)

        fh = logging.FileHandler(os.path.join(self.dest_dir, args.logfile))
        fh.setLevel(logging.INFO)
        fh.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))
        self.logger.addHandler(fh)

        if not self.args.tensorboard_dir:
            tensorboard_dir = self.dest_dir + 'logs/'
        else:
            tensorboard_dir = self.args.tensorboard_dir
        self.writer = SummaryWriter(tensorboard_dir)
        self.eval_topn = (5, 10)
        self.macro_evaluator = KeyphraseEvaluator(self.eval_topn, 'macro',
                                                  args.token_field,
                                                  args.keyphrase_field)
        self.micro_evaluator = KeyphraseEvaluator(self.eval_topn, 'micro',
                                                  args.token_field,
                                                  args.keyphrase_field)
        # track the best validation F1 and how long it has gone without improving
        self.best_f1 = None
        self.best_step = 0
        self.not_update_count = 0
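Every args field in the sketch below is one that __init__ actually reads; the concrete values, the `Trainer` class name and `my_model` are placeholders, and KeyphraseDataLoader may expect further fields (e.g. batch_size) that the constructor itself never touches.

    from munch import Munch

    # Illustrative construction; values and names outside the snippet are assumptions.
    args = Munch(vocab_path='data/vocab.txt', vocab_size=50000,
                 train_parallel=False, learning_rate=1e-4,
                 schedule_step=10000, schedule_gamma=0.5,
                 train_filename='data/kp20k.train.jsonl',
                 train_from='',        # falsy -> create a fresh timestamped dest dir
                 dest_base_dir='output', exp_name='copyrnn',
                 logfile='train.log', tensorboard_dir='',
                 token_field='tokens', keyphrase_field='keyphrases')
    trainer = Trainer(args, model=my_model)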
Example 4
    def predict(self, text_list, batch_size, delimiter=None):
        self.model.eval()
        if len(text_list) < batch_size:
            batch_size = len(text_list)
        args = Munch({'batch_size': batch_size, **self.pred_base_config})
        text_list = [{TOKENS: token_char_tokenize(i)} for i in text_list]
        loader = KeyphraseDataLoader(data_source=text_list,
                                     vocab2id=self.vocab2id,
                                     args=args,
                                     mode='inference')
        result = []
        for batch in loader:
            with torch.no_grad():
                result.extend(self.beam_searcher.beam_search(batch, delimiter=delimiter))
        return result