Example 1
    def evaluate(self, step):
        predictor = CopyRnnPredictor(model_info={'model': self.model, 'config': self.args},
                                     vocab_info=self.vocab2id,
                                     beam_size=self.args.beam_size,
                                     max_target_len=self.args.max_target_len,
                                     max_src_length=self.args.max_src_len)

        def pred_callback(stage):
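            # returns a zero-argument closure so evaluate_stage can
            # trigger prediction for this stage lazily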
            if stage == 'valid':
                src_filename = self.args.valid_filename
                dest_filename = self.dest_dir + self.get_basename(self.args.valid_filename)
            elif stage == 'test':
                src_filename = self.args.test_filename
                dest_filename = self.dest_dir + self.get_basename(self.args.test_filename)
            else:
                raise ValueError('stage must be one of `valid` or `test`')
            dest_filename += '.batch_{}.pred.jsonl'.format(step)

            def predict_func():
                predictor.eval_predict(src_filename=src_filename,
                                       dest_filename=dest_filename,
                                       args=self.args,
                                       model=self.model,
                                       remove_existed=True)

            return predict_func

        valid_statistics = self.evaluate_stage(step, 'valid', pred_callback('valid'))
        test_statistics = self.evaluate_stage(step, 'test', pred_callback('test'))
        total_statistics = {**valid_statistics, **test_statistics}

        eval_filename = self.dest_dir + self.args.exp_name + '.batch_{}.eval.json'.format(step)
        write_json(eval_filename, total_statistics)
        return valid_statistics['valid_macro'][self.eval_topn[-1]]['f1']
Example 2
    def build_index(self, dest_filename):
        indexer = hnswlib.Index(space='cosine', dim=self.dim)
        paper_id_list = []
        vector_list = []
        idx2paper_id = {}
        vector_id_list = []
        start = time.time()

        for vec_idx, (paper_id, vector) in enumerate(self.read_paper_item()):
            vector_id_list.append(vec_idx)
            paper_id_list.append(paper_id)
            vector_list.append(vector)
            idx2paper_id[vec_idx] = paper_id

        duration = time.time() - start
        msg_tmpl = 'vector loading completed, time consumed {:.0f}min {:.2f}sec'
        print(msg_tmpl.format(duration // 60, duration % 60))
        num_elements = len(paper_id_list)
        indexer.init_index(max_elements=num_elements,
                           ef_construction=200,
                           M=100)
        # hnswlib only supports integer-based ids, so the mapping from
        # integer id to paper id must be saved alongside the index
        indexer.add_items(vector_list, vector_id_list)
        indexer.set_ef(500)
        indexer.save_index(dest_filename)
        write_json(dest_filename + '.map', idx2paper_id)
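
A quick sketch of the query side, assuming a read_json counterpart to the write_json helper above (load_index and knn_query are hnswlib's actual API; the rest is illustrative):

    import hnswlib

    # load the saved index and the integer-id -> paper-id mapping
    indexer = hnswlib.Index(space='cosine', dim=dim)  # dim: assumed known
    indexer.load_index(dest_filename)
    idx2paper_id = read_json(dest_filename + '.map')  # read_json: assumed helper

    # retrieve the 10 nearest papers for a query vector
    labels, distances = indexer.knn_query(query_vector, k=10)
    paper_ids = [idx2paper_id[str(idx)] for idx in labels[0]]  # JSON keys are strings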
Example 3
    def train_func(self):
        model = CopyRnnTF(self.args, self.vocab2id)
        dataloader = KeyphraseDataLoader(data_source=self.args.train_filename,
                                         vocab2id=self.vocab2id,
                                         mode='train',
                                         args=self.args)
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.args.learning_rate)

        @tf.function
        def train_step(x, x_with_oov, x_len, target):
            batch_size = x.shape[0]
            dec_len = self.args.max_target_len
            with tf.GradientTape() as tape:
                loss = 0
                probs, enc_output, prev_h, prev_c = model(
                    x, x_with_oov, x_len, tf.constant(0), target[:, :-1], None,
                    None, tf.convert_to_tensor(batch_size), dec_len)
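                # teacher forcing: target[:, :-1] is the decoder input above,
                # target[:, 1:] provides the per-step labels below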
                for batch_idx in range(batch_size):
                    dec_target = target[batch_idx, 1:]
                    target_idx = tf.one_hot(dec_target, self.total_vocab_size)
                    dec_step_loss = -tf.reduce_sum(
                        probs[batch_idx, :] * target_idx, axis=1)
                    mask = tf.cast(dec_target != self.pad_idx,
                                   dtype=tf.float32)

                    dec_step_loss *= mask
                    loss += tf.reduce_sum(dec_step_loss) / tf.reduce_sum(mask)

                loss /= batch_size
            grads = tape.gradient(loss, model.trainable_variables)
            grads = [tf.clip_by_value(grad, -0.1, 0.1) for grad in grads]
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            return loss

        step_idx = 0
        for epoch in range(self.args.epochs):
            for batch in dataloader:
                loss = train_step(batch[TOKENS], batch[TOKENS_OOV],
                                  batch[TOKENS_LENS], batch[TARGET])
                with self.writer.as_default():
                    tf.summary.scalar('loss', loss, step=step_idx)
                step_idx += 1
                if step_idx % self.args.save_model_step == 0:
                    model_basename = self.dest_base_dir + '/{}_step{}'.format(
                        self.exp_name, step_idx)
                    # beam_search_graph = model.beam_search.get_concrete_function(
                    #     x=tf.TensorSpec(shape=[None, self.args.max_src_len], dtype=tf.int64),
                    #     x_with_oov=tf.TensorSpec(shape=[None, self.args.max_src_len], dtype=tf.int64),
                    #     x_len=tf.TensorSpec(shape=[None], dtype=tf.int64),
                    #     batch_size=tf.TensorSpec(shape=[None], dtype=tf.int64)
                    # )
                    # tf.saved_model.save(model, model_basename, signatures=beam_search_graph)
                    model.save_weights(model_basename + '.ckpt',
                                       save_format='tf')
                    write_json(model_basename + '.json', vars(self.args))
                    f1 = self.evaluate(model, step_idx)
                    self.logger.info('step {}, f1 {}'.format(step_idx, f1))
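
The per-example Python loop in train_step computes a masked NLL one sequence at a time; a minimal vectorized sketch of the same quantity, assuming probs holds per-step log-probabilities of shape [batch, dec_len, vocab] as the indexing above implies:

    import tensorflow as tf

    def masked_nll(probs, target, total_vocab_size, pad_idx):
        # per-token NLL, masked on padding, averaged per sequence
        # and then over the batch -- same result as the loop above
        labels = target[:, 1:]
        one_hot = tf.one_hot(labels, total_vocab_size)        # [B, T, V]
        step_loss = -tf.reduce_sum(probs * one_hot, axis=-1)  # [B, T]
        mask = tf.cast(labels != pad_idx, tf.float32)
        per_seq = tf.reduce_sum(step_loss * mask, axis=1) / tf.reduce_sum(mask, axis=1)
        return tf.reduce_mean(per_seq)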
Example 4
    def train_func(self):
        # loss_fct = MarginRankingLoss(margin=1, reduction='mean')
        loss_fct = NLLLoss(reduction='mean')
        optimizer = AdamW(self.model.parameters(), self.args.learning_rate)
        step = 0
        # cos = nn.CosineSimilarity(dim=1)
        scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=self.args.scheduler_step,
            gamma=self.args.scheduler_gamma)
        accumulate_step = 0

        for epoch in range(1, self.args.epoch + 1):
            for batch in self.loader:
                probs = self.get_probs(batch)
                batch_size = probs.size(0)

                true_idx = torch.zeros(batch_size, dtype=torch.long)
                if torch.cuda.is_available():
                    true_idx = true_idx.cuda()
                loss = loss_fct(probs, true_idx)
                loss.backward()
                # count this backward pass; without the increment the
                # accumulation check below fires on every batch (0 % n == 0)
                accumulate_step += 1

                self.writer.add_scalar('loss', loss, step)

                stop_scheduler_step = self.args.scheduler_step * 80

                if accumulate_step % self.args.gradient_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    if self.args.scheduler_lr and step <= stop_scheduler_step:
                        scheduler.step()
                    accumulate_step = 0

                step += 1
                if step % self.args.save_model_step == 0:
                    model_basename = self.args.dest_base_dir + self.args.exp_name
                    model_basename += '_epoch_{}_step_{}'.format(epoch, step)
                    torch.save(self.model.state_dict(),
                               model_basename + '.model')
                    write_json(model_basename + '.json', vars(self.args))
                    ret = self.evaluate(model_basename, step)
                    self.writer.add_scalar('accuracy', ret, step)
                    # self.writer.add_scalar('recall', ret['recall'], step)
                    # self.writer.add_scalar('f1', ret['f1'], step)
                    msg_tmpl = 'step {} completed, accuracy {:.4f}'
                    self.logger.info(msg_tmpl.format(step, ret))
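
Note that NLLLoss expects log-probabilities, and a true_idx of all zeros means the gold candidate sits at position 0 of each row. A self-contained sketch of the loss computed above (shapes are illustrative; log_softmax stands in for get_probs, which is assumed to return log-probabilities):

    import torch
    from torch.nn import NLLLoss

    probs = torch.log_softmax(torch.randn(4, 10), dim=1)  # [batch, n_candidates]
    true_idx = torch.zeros(4, dtype=torch.long)           # gold candidate at index 0
    loss = NLLLoss(reduction='mean')(probs, true_idx)     # -mean(probs[i, 0])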
Example 5
    def evaluate_and_save_model(self, step, epoch):
        valid_f1 = self.evaluate(step)
        if self.best_f1 is None:
            self.best_f1 = valid_f1
            self.best_step = step
        elif valid_f1 >= self.best_f1:
            self.best_f1 = valid_f1
            self.not_update_count = 0
            self.best_step = step
        else:
            self.not_update_count += 1
        exp_name = self.args.exp_name
        model_basename = self.dest_dir + '{}_epoch_{}_batch_{}'.format(exp_name, epoch, step)
        torch.save(self.model.state_dict(), model_basename + '.model')
        write_json(model_basename + '.json', vars(self.args))
        score_msg_tmpl = 'best score: step {} macro f1@{} {:.4f}'
        self.logger.info(score_msg_tmpl.format(self.best_step, self.eval_topn[-1], self.best_f1))
        self.logger.info('epoch {} step {}, model saved'.format(epoch, step))
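
The not_update_count counter maintained above is the usual hook for early stopping; a hypothetical caller-side check (patience is an assumed config value, not part of the original code):

    # hypothetical: stop once validation f1 has not improved for
    # `patience` consecutive evaluations
    if self.not_update_count >= self.args.patience:
        self.logger.info('no f1 gain for {} evals, stopping'.format(self.args.patience))
        return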
Example 6
    def train_func(self):
        step = 0
        plm_lr = self.args.plm_learning_rate
        rerank_lr = self.args.rank_learning_rate
        model = load_rerank_model(self.args)
        true_score_func = get_score_func(model, 'true', inference=False)
        false_score_func = get_score_func(model, 'false', inference=False)
        if torch.cuda.is_available():
            model.cuda()
        loss_fct = MarginRankingLoss(margin=1, reduction='mean')

        if self.args.separate_learning_rate:
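            # two parameter groups: PLM weights get plm_lr, everything
            # else falls back to the optimizer-level rerank_lr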
            params = [(k, v) for k, v in model.named_parameters()
                      if v.requires_grad]
            non_bert_params = {
                'params':
                [v for k, v in params if not k.startswith('plm_model.')]
            }
            bert_params = {
                'params': [v for k, v in params if k.startswith('plm_model.')],
                'lr': plm_lr
            }
            # optimizer = torch.optim.Adam([bert_params, non_bert_params], lr=rerank_lr)
            optimizer = AdamW([non_bert_params, bert_params], lr=rerank_lr)
        else:
            optimizer = AdamW(model.parameters(), plm_lr)
        scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=self.args.scheduler_step,
            gamma=self.args.scheduler_gamma)
        accumulate_step = 0

        for epoch in range(1, self.args.epoch + 1):
            for batch in self.train_loader:
                model.train()
                true_scores = true_score_func(batch)
                false_scores = false_score_func(batch)
                # y is all ones: true_scores should rank above false_scores
                y = torch.ones(len(true_scores)).float()
                if torch.cuda.is_available():
                    y = y.cuda()

                loss = loss_fct(true_scores, false_scores, y)
                loss.backward()
                self.writer.add_scalar('loss', loss, step)
                accumulate_step += 1

                # torch.nn.utils.clip_grad_value_(model.parameters(), 0.01)
                stop_scheduler_step = self.args.scheduler_step * 8
                if accumulate_step % self.args.gradient_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    if self.args.scheduler_lr:  # and step <= stop_scheduler_step:
                        scheduler.step()
                    accumulate_step = 0

                step += 1
                if step % self.args.save_model_step == 0:
                    model_basename = self.args.dest_base_dir + self.args.exp_name
                    model_basename += '_epoch_{}_step_{}'.format(epoch, step)
                    torch.save(model.state_dict(), model_basename + '.model')
                    write_json(model_basename + '.json', vars(self.args))
                    map_top3 = self.evaluate(model, 5, model_basename)
                    self.writer.add_scalar('map@3', map_top3, step)
                    self.logger.info('step {} map@3 {:.4f}'.format(
                        step, map_top3))
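
For reference, MarginRankingLoss(margin=1) with y = 1 computes mean(max(0, -(x1 - x2) + 1)), so the loss reaches zero once every true score beats its false counterpart by at least the margin. A self-contained check:

    import torch

    true_scores = torch.tensor([2.0, 0.5])
    false_scores = torch.tensor([0.0, 1.0])
    y = torch.ones(2)
    loss = torch.nn.MarginRankingLoss(margin=1)(true_scores, false_scores, y)
    print(loss)  # elementwise losses are 0.0 and 1.5 -> mean 0.75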