Code example #1
    def on_exception(self, exception):
        if isinstance(exception, KeyboardInterrupt):
            logger.error(
                "[Error] Caught keyboard interrupt on worker. Stopping supervisor..."
            )
            state = {
                'iter': self.step,
                'encoder_state_dict': self.model.encoder.state_dict(),
                'decoder_state_dict': self.model.decoder.state_dict(),
                'reduce_state_dict': self.model.reduce_state.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'current_loss': self.running_avg_loss
            }

            model_save_path = os.path.join(self.config.model_path,
                                           'earlystop_step_%d.pkl' % self.step)

            # torch.save(state, model_save_path)

            #self.model.cpu()
            torch.save(self.model, model_save_path)
            #if self.config.use_gpu:
            #    self.model.cuda()

            logger.info('[INFO] Saving early stop model to %s',
                        model_save_path)

            if self.quit_all is True:
                sys.exit(0)  # exit the program directly
            else:
                pass
        else:
            raise exception  # re-raise any unfamiliar exception
Code example #2
    def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval):
        logger.info('   | end of valid {:3d} | time: {:5.2f}s | '.format(
            self.epoch, (time.time() - self.valid_start_time)))

        # early stop
        if not is_better_eval:
            if self.wait == self.patience:

                state = {
                    'iter': self.step,
                    'encoder_state_dict': self.model.encoder.state_dict(),
                    'decoder_state_dict': self.model.decoder.state_dict(),
                    'reduce_state_dict': self.model.reduce_state.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'current_loss': self.running_avg_loss
                }

                model_save_path = os.path.join(
                    self.config.model_path,
                    'earlystop_step_%d.pkl' % self.step)

                # torch.save(state, model_save_path)

                #self.model.cpu()
                torch.save(self.model, model_save_path)
                #if self.config.use_gpu:
                #    self.model.cuda()

                logger.info('[INFO] Saving early stop model to %s',
                            model_save_path)
                raise EarlyStopError("Early stopping raised.")
            else:
                self.wait += 1
        else:
            self.wait = 0
Code example #3
File: decode.py Project: zide05/pointer-gen-fastnlp
def set_up_data(mode, config):
    datainfo = prepare_dataInfo(mode=mode,
                                test_data_path=config.decode_data_path,
                                train_data_path=config.train_data_path,
                                vocab_size=config.vocab_size,
                                config=config)
    logger.info('-' * 10 + "set up data done!" + '-' * 10)
    return datainfo
Code example #4
File: test_CGSum.py Project: ChenxinAn-fdu/CGSum
def set_up_data(data_config):
    paths = {
        "train": os.path.join(data_config.train_path, args.train_file),
        "test": os.path.join(data_config.train_path, args.test_file)
    }

    datainfo, vocab = ScisummGraphLoader(setting=args.setting).process(
        paths, data_config, args.load_vocab)
    logger.info('-' * 10 + "set up data done!" + '-' * 10)
    return datainfo, vocab
Code example #5
File: train_CGSum.py Project: ChenxinAn-fdu/CGSum
def set_up_data():
    paths = {
        "train": os.path.join(config.train_path, "train.jsonl"),
        "dev": os.path.join(config.train_path, "val.jsonl")
    }

    datainfo, vocabs = ScisummGraphLoader(setting=args.setting).process(
        paths, config, args.load_vocab)
    logger.info('-' * 10 + "set up data done!" + '-' * 10)
    return datainfo, vocabs
Code example #6
 def get_metric(self, reset=True):
     logger.info("[INFO] Hyps and Refer number is %d, %d",
                 len(self.prediction), len(self.referece))
     if len(self.prediction) == 0 or len(self.referece) == 0:
         logger.error("During testing, no hyps or refers is selected!")
         return
     rouge = Rouge()
     scores_all = rouge.get_scores(self.prediction, self.referece, avg=True)
     if reset:
         self.prediction = []
         self.referece = []
     logger.info(scores_all)
     scores_all = remend_score(scores_all)
     return scores_all
Code example #7
def run_train(config):
    train_dir, model_dir = initial_dir('train', config)
    config.train_path = train_dir
    config.model_path = model_dir
    print_config(config, train_dir)
    datainfo = set_up_data('train', config)
    train_sampler = BucketSampler(batch_size=config.batch_size, seq_len_field_name='enc_len')
    criterion = MyLoss(config=config, padding_idx=datainfo.vocabs["train"].to_index(PAD_TOKEN))

    model = Model(vocab=datainfo.vocabs["train"], config=config)
    params = list(model.encoder.parameters()) + list(model.decoder.parameters()) + \
             list(model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    optimizer = Adagrad(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)

    train_loader = datainfo.datasets["train"]
    valid_loader = datainfo.datasets["dev"]
    summary_writer = tf.compat.v1.summary.FileWriter(train_dir)
    trainer = Trainer(model=model, train_data=train_loader, optimizer=optimizer, loss=criterion,
                      batch_size=config.batch_size, check_code_level=-1,
                      n_epochs=config.n_epochs, print_every=100, dev_data=valid_loader,
                      metrics=FastRougeMetric(pred='prediction', art_oovs='article_oovs',
                                              abstract_sentences='abstract_sentences', config=config,
                                              vocab=datainfo.vocabs["train"]),
                      metric_key="rouge-l-f", validate_every=-1, save_path=model_dir,
                      callbacks=[TrainCallback(config, summary_writer, patience=10)], use_tqdm=False,
                      device=config.visible_gpu)

    logger.info("-" * 5 + "start training" + "-" * 5)

    traininfo = trainer.train(load_best_model=True)
    logger.info('   | end of Train | time: {:5.2f}s | '.format(traininfo["seconds"]))
    logger.info('[INFO] best eval model in epoch %d and iter %d', traininfo["best_epoch"], traininfo["best_step"])
    logger.info(traininfo["best_eval"])

    bestmodel_save_path = os.path.join(config.model_path,
                                       'bestmodel.pkl')  # this is where checkpoints of best models are saved
    state = {
        'encoder_state_dict': model.encoder.state_dict(),
        'decoder_state_dict': model.decoder.state_dict(),
        'reduce_state_dict': model.reduce_state.state_dict()
    }
    torch.save(state, bestmodel_save_path)
    # Wasn't the model only passed into Trainer as an argument? Why do changes to the model inside it show up out here?
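    # (It does because Python passes object references: Trainer updates this same model object in place, so its trained weights are visible here.)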
    logger.info('[INFO] Saving eval best model to %s', bestmodel_save_path)
Code example #8
File: train_CGSum.py Project: ChenxinAn-fdu/CGSum
def run_train():
    datainfo, vocabs = set_up_data()
    train_sampler = RandomSampler()
    criterion = SummLoss(config=config, padding_idx=vocabs.to_index(PAD_TOKEN))
    model = CGSum(config, vocab=vocabs)
    model.to(device)

    initial_lr = config.lr
    logger.info(f"learning rate = {initial_lr}")
    optimizer = Adagrad(filter(lambda p: p.requires_grad, model.parameters()),
                        lr=initial_lr,
                        initial_accumulator_value=config.adagrad_init_acc)

    train_loader = datainfo.datasets["train"]
    valid_loader = datainfo.datasets["dev"]

    callbacks = [
        TrainCallback(config, patience=10),
        FitlogCallback(),
        LRDecayCallback(optimizer.param_groups, steps=args.weight_decay_step)
    ]
    trainer = Trainer(model=model,
                      train_data=train_loader,
                      optimizer=optimizer,
                      loss=criterion,
                      batch_size=config.batch_size,
                      check_code_level=-1,
                      sampler=train_sampler,
                      n_epochs=config.n_epochs,
                      print_every=100,
                      dev_data=valid_loader,
                      update_every=args.update_every,
                      metrics=FastRougeMetric(
                          pred='prediction',
                          art_oovs='article_oovs',
                          abstract_sentences='abstract_sentences',
                          config=config,
                          vocab=datainfo.vocabs["vocab"]),
                      metric_key="rouge-l-f",
                      validate_every=args.validate_every * args.update_every,
                      save_path=None,
                      callbacks=callbacks,
                      use_tqdm=True)

    logger.info("-" * 5 + "start training" + "-" * 5)
    traininfo = trainer.train(load_best_model=True)

    logger.info('   | end of Train | time: {:5.2f}s | '.format(
        traininfo["seconds"]))
    logger.info('[INFO] best eval model in epoch %d and iter %d',
                traininfo["best_epoch"], traininfo["best_step"])
Code example #9
    def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval):
        logger.info('   | end of valid {:3d} | time: {:5.2f}s | '.format(
            self.epoch, (time.time() - self.valid_start_time)))
        # save the better checkpoint
        if is_better_eval:
            logger.info("got better results on dev, save checkpoint.. ")
            model_save_path = os.path.join(
                self.config.model_path,
                f'CGSum_{self.config.setting}_{self.config.n_hop}hopNbrs.pt')
            checkpoint = {
                "state_dict": self.model.state_dict(),
                "config": self.model.config.__dict__
            }
            torch.save(checkpoint, model_save_path)

        # early stop
        if not is_better_eval:
            if self.wait == self.patience:
                raise EarlyStopError("Early stopping raised.")
            else:
                self.wait += 1
        else:
            self.wait = 0
Code example #10
    def on_backward_begin(self, loss):
        self.loss_update_every.append(loss.item())
        if isinstance(loss, tuple) and not np.isfinite(loss[0].item()):
            logger.error("train Loss is not finite. Stopping.")
            logger.info(loss[0].item())
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    logger.info(name)
                    logger.info(param.grad.data.sum())
            raise Exception("train Loss is not finite. Stopping.")

        if self.step % self.update_every == 0:
            assert len(self.loss_update_every) == self.update_every
            loss_batch = sum(self.loss_update_every)
            self.loss_update_every = []
            # report the loss
            if self.step < 10 or self.step % 1000 == 0:
                logger.info(
                    "|epoch: %d  step: %d  log_loss: %.4f |" %
                    (self.epoch, self.step / self.update_every, loss_batch))
            self.running_avg_loss = calc_running_avg_loss(
                loss_batch, self.running_avg_loss,
                self.step / self.update_every)
Code example #11
 def get_metric(self, reset=True):
     logger.info("[INFO] Hyps and Refer number is %d, %d",
                 len(self.prediction), len(self.referece))
     if len(self.prediction) == 0 or len(self.referece) == 0:
         logger.error("During testing, no hyps or refers is selected!")
         return
     if isinstance(self.referece[0], list):
         logger.info("Multi Reference summaries!")
         scores_all = pyrouge_score_all_multi(self.prediction,
                                              self.referece, self.config)
     else:
         scores_all = pyrouge_score_all(self.prediction, self.referece,
                                        self.config)
     if reset:
         self.prediction = []
         self.referece = []
     logger.info(scores_all)
     return scores_all
Code example #12
    def on_backward_begin(self, loss):
        """
        :param loss: []
        :return:
        """
        print("|epoch: %d  step: %d  loss: %.4f|" %
              (self.epoch, self.step, loss.item()))
        if not np.isfinite(loss.item()):
            logger.error("train Loss is not finite. Stopping.")
            logger.info(loss.item())
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    logger.info(name)
                    logger.info(param.grad.data.sum())
            raise Exception("train Loss is not finite. Stopping.")

        self.running_avg_loss = calc_running_avg_loss(loss.item(),
                                                      self.running_avg_loss,
                                                      self.summary_writer,
                                                      self.step)
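
Both example #10 and example #12 delegate loss smoothing to a calc_running_avg_loss helper that is not shown on this page. Below is a minimal sketch of such a helper, modeled on the exponential moving average used in pointer-generator implementations; the decay value, the loss clipping, and the TensorBoard summary call are assumptions rather than the projects' exact code.

import tensorflow as tf

def calc_running_avg_loss(loss, running_avg_loss, summary_writer, step, decay=0.99):
    # Exponential moving average of the training loss (assumed implementation).
    if running_avg_loss == 0:  # first step: start from the raw loss
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    running_avg_loss = min(running_avg_loss, 12)  # clip so early spikes do not dominate the plot
    if summary_writer is not None:
        summary = tf.compat.v1.Summary()
        summary.value.add(tag='running_avg_loss/decay=%f' % decay,
                          simple_value=running_avg_loss)
        summary_writer.add_summary(summary, step)
    return running_avg_loss

Note that example #10 calls the helper with only (loss, running_avg_loss, step), so the CGSum variant presumably drops the summary_writer argument and keeps just the in-memory average.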
Code example #13
File: main.py Project: zide05/pointer-gen-fastnlp
def set_up_data(mode):
    datainfo = prepare_dataInfo(mode, config.train_data_path, config.eval_data_path, config.decode_data_path,
                                config.vocab_path, config.vocab_size, config)
    logger.info('-' * 10 + "set up data done!" + '-' * 10)
    return datainfo
Code example #14
 def on_epoch_end(self):
     logger.info('   | end of epoch {:3d} | time: {:5.2f}s | '.format(
         self.epoch, (time.time() - self.epoch_start_time)))
Code example #15
File: decode.py Project: zide05/pointer-gen-fastnlp
    parser.add_argument('-lr_coverage', default=0.15, type=float)
    parser.add_argument('-test_data_name', required=True, type=str)
    parser.add_argument('-test_model', default='', type=str)
    args = parser.parse_args()

    args.train_data_path = os.path.join(args.dataset_path,
                                        args.train_data_path)
    # args.eval_data_path = os.path.join(args.dataset_path, args.eval_data_path)
    args.decode_data_path = os.path.join(args.dataset_path,
                                         args.decode_data_path)
    # args.vocab_path = os.path.join(args.dataset_path, args.vocab_path)

    args.log_root = os.path.join(args.root, args.log_root)

    if args.visible_gpu != -1:
        args.use_gpu = True
        torch.cuda.set_device(args.visible_gpu)
        print("using gpu: ", args.visible_gpu)
    else:
        args.use_gpu = False

    logger.info("------start mode test-------")
    if args.test_model == '':
        k_model_path_list = getting_k_model_path(args.model_file_path,
                                                 args.top_k)
        for tmp_path in k_model_path_list:
            run_test(tmp_path, args)
    else:
        run_test(args.test_model, args)
Code example #16
def prepare_dataInfo(mode,
                     vocab_size,
                     config,
                     train_data_path=None,
                     dev_data_path=None,
                     test_data_path=None):
    def sent_to_words(sents):
        result = []
        for sent in sents:
            result.extend([
                word.strip() for word in sent.split(" ")
                if len(word.strip()) != 0
            ])
        return result

    # dataloader = Cnn_dailymailLodaer()
    # For JSON input files: every JSON record must have the fields "text" and "summary", both already tokenized.
    dataloader = JsonLoader(fields={
        "text": "words",
        "summary": "abstract_sentences"
    })
    if mode == 'train':
        if train_data_path is None or dev_data_path is None:
            print("training with no train data path or dev data path! ")
        paths = {"train": train_data_path, "dev": dev_data_path}
    else:
        if test_data_path is None:
            print("testing with no test data path ! ")
        paths = {"train": train_data_path, "test": test_data_path}
    # dataInfo = dataloader.process(paths, vocab_path, vocab_size)
    print("=" * 10)
    print(paths)
    dataInfo = dataloader.load(paths)
    for key, _dataset in dataInfo.datasets.items():
        _dataset.apply(lambda ins: " ".join(ins['words']),
                       new_field_name='article')
        _dataset.apply(lambda ins: sent_to_words(ins['words']),
                       new_field_name='words')
        _dataset.apply(
            lambda ins: sent_tokenize(" ".join(ins['abstract_sentences'])),
            new_field_name='abstract_sentences')

    vocab = Vocabulary(max_size=vocab_size - 2,
                       padding=PAD_TOKEN,
                       unknown=UNKNOWN_TOKEN)
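    # max_size reserves two slots so that adding START_DECODING and STOP_DECODING below yields exactly vocab_size entries (checked by the assert).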
    vocab.from_dataset(dataInfo.datasets['train'], field_name='words')
    vocab.add(START_DECODING)
    vocab.add(STOP_DECODING)
    print(vocab.to_word(0))
    print(len(vocab))
    assert vocab_size == len(vocab), "vocab_size error!!!"
    dataInfo.set_vocab(vocab, "train")

    for key, dataset in dataInfo.datasets.items():
        data_dict = {
            "enc_len": [],
            "enc_input": [],
            "dec_input": [],
            "target": [],
            "dec_len": [],
            "article_oovs": [],
            "enc_input_extend_vocab": []
        }

        for instance in dataset:
            article = instance["article"]
            abstract_sentences = instance["abstract_sentences"]

            enc_len, enc_input, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = getting_full_info(
                article, abstract_sentences, dataInfo.vocabs['train'], config)

            data_dict["enc_len"].append(enc_len)
            data_dict["enc_input"].append(enc_input)
            data_dict["dec_input"].append(dec_input)
            data_dict["target"].append(target)
            data_dict["dec_len"].append(dec_len)
            data_dict["article_oovs"].append(article_oovs)
            data_dict["enc_input_extend_vocab"].append(enc_input_extend_vocab)

        logger.info("-----prepare_dataInfo for dataset " + key + "-----")
        logger.info(
            str(len(data_dict["enc_len"])) + " " +
            str(len(data_dict["enc_input"])) + " " +
            str(len(data_dict["dec_input"])) + " " +
            str(len(data_dict["target"])) + " " +
            str(len(data_dict["dec_len"])) + " " +
            str(len(data_dict["article_oovs"])) + " " +
            str(len(data_dict["enc_input_extend_vocab"])))
        dataset.add_field("enc_len", data_dict["enc_len"])
        dataset.add_field("enc_input", data_dict["enc_input"])
        dataset.add_field("dec_input", data_dict["dec_input"])
        dataset.add_field("target", data_dict["target"])
        dataset.add_field("dec_len", data_dict["dec_len"])
        dataset.add_field("article_oovs", data_dict["article_oovs"])
        dataset.add_field("enc_input_extend_vocab",
                          data_dict["enc_input_extend_vocab"])

        dataset.set_input("enc_len", "enc_input", "dec_input", "dec_len",
                          "article_oovs", "enc_input_extend_vocab")
        dataset.set_target("target", "article_oovs", "abstract_sentences")
    '''
    for name, dataset in dataInfo.datasets.items():
        for field_name in dataset.get_field_names():
            dataset.apply_field(convert_list_to_ndarray, field_name=field_name, new_field_name=field_name)
    '''
    return dataInfo
Code example #17
File: train_CGSum.py Project: ChenxinAn-fdu/CGSum
    config.max_graph_enc_steps = args.max_graph_enc_steps
    config.min_dec_steps = args.min_dec_steps

    # mode
    config.mode = args.mode
    config.setting = args.setting

    # save model
    if not os.path.exists(config.model_path):
        if config.model_path.__contains__("/"):
            os.makedirs(config.model_path, 0o777)
        else:
            os.mkdir(config.model_path)

    # fitlog dir
    logger.info(f"set fitlog dir to {args.fitlog_dir}")
    if not os.path.exists(args.fitlog_dir):
        os.mkdir(args.fitlog_dir)
    fitlog.set_log_dir(args.fitlog_dir)
    fitlog.add_hyper(args)

    if not os.path.exists(config.model_path):
        os.mkdir(config.model_path)

    if args.visible_gpu != -1:
        config.use_gpu = True
        torch.cuda.set_device(args.visible_gpu)
        device = torch.device(args.visible_gpu)
    else:
        config.use_gpu = False
Code example #18
    parser.add_argument('-max_grad_norm', default=2.0, type=float)

    parser.add_argument('-is_pointer_gen', dest='pointer_gen', nargs='?', const=True, default=False,
                        type=bool)
    parser.add_argument('-is_coverage', nargs='?', const=True, default=False, type=bool)
    parser.add_argument('-cov_loss_wt', default=1.0, type=float)

    parser.add_argument('-eps', default=1e-12, type=float)
    # parser.add_argument('-max_iterations', default=500000, required=True, type=int)
    parser.add_argument("-n_epochs", default=33, type=int, required=True)

    parser.add_argument('-lr_coverage', default=0.15, type=float)
    args = parser.parse_args()

    args.train_data_path = os.path.join(args.dataset_path, args.train_data_path)
    args.eval_data_path = os.path.join(args.dataset_path, args.eval_data_path)
    # args.decode_data_path = os.path.join(args.dataset_path, args.decode_data_path)
    # args.vocab_path = os.path.join(args.dataset_path, args.vocab_path)

    args.log_root = os.path.join(args.root, args.log_root)

    if args.visible_gpu != -1:
        args.use_gpu = True
        torch.cuda.set_device(args.visible_gpu)
        print("using gpu: ", args.visible_gpu)
    else:
        args.use_gpu = False

    logger.info("------start mode train------")
    run_train(args)
Code example #19
    def process(self, paths, config, load_vocab_file=True):
        """
        :param paths: dict  path for each dataset
        :param load_vocab_file: bool  build vocab (False) or load vocab (True)
        :return: DataBundle
            datasets: dict  keys correspond to the paths dict
            vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
            embeddings: optional
        """

        vocab_size = config.vocab_size

        def _merge_abstracts(abstracts):
            merged = []
            for abstract in abstracts:
                merged.extend(abstract[:self.max_concat_len] + [SEP])
            if len(abstracts) == 0:
                assert merged == []
            return merged[:-1]

        def _pad_graph_inputs(graph_inputs):
            pad_text_wd = []
            max_len = config.max_graph_enc_steps

            for graph_input in graph_inputs:
                if len(graph_input) < max_len:
                    pad_num = max_len - len(graph_input)
                    graph_input.extend([PAD_TOKEN] * pad_num)
                else:
                    graph_input = graph_input[:max_len]
                pad_text_wd.append(graph_input)

            if len(pad_text_wd) == 0:
                pad_text_wd.append([PAD_TOKEN] * max_len)

            return pad_text_wd

        def _get_nbr_input_len(input_wd):
            enc_len = [
                min(len(text), config.max_graph_enc_steps) for text in input_wd
            ]
            if len(enc_len) == 0:
                enc_len = [0]
            return enc_len

        def _pad_article(text_wd):
            token_num = len(text_wd)
            max_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_len += self.max_concat_len * self.max_concat_num
            if token_num < max_len:
                padding = [PAD_TOKEN] * (max_len - token_num)
                article = text_wd + padding
            else:
                article = text_wd[:max_len]
            return article

        def _split_list(input_list):
            return [text.split() for text in input_list]

        def sent_tokenize(abstract):
            abs_list = abstract.split(".")
            return [(abst + ".") for abst in abs_list[:-1]]

        def _article_token_mask(text_wd):
            max_enc_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_enc_len += self.max_concat_len * self.max_concat_num
            token_num = len(text_wd)
            if token_num < max_enc_len:
                mask = [1] * token_num + [0] * (max_enc_len - token_num)
            else:
                mask = [1] * max_enc_len
            return mask

        def generate_article_input(text, abstracts):
            if config.neighbor_process == "sep":
                text_wd = text.split()[:config.max_enc_steps]
                text_wd.append(SEP)
                abstracts_wd = _merge_abstracts(abstracts)
                return text_wd + abstracts_wd
            else:
                return text.split()

        def generate_graph_inputs(graph_struct):

            graph_inputs_ = [
                graph_strut_dict[pid][config.graph_input_type]
                for pid in graph_struct
            ]
            return _split_list(graph_inputs_[1:])

        def generate_graph_structs(paper_id):
            sub_graph_dict = {}
            sub_graph_set = []

            n_hop = config.n_hop
            max_neighbor_num = config.max_neighbor_num
            k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
            for sub_g in k_nbrs:
                sub_graph_set += sub_g

            for node in sub_graph_set:
                sub_graph_dict[node] = []

            for sub_g in k_nbrs:
                for centre_node in sub_g:
                    nbrs = graph_strut_dict[centre_node]['references']
                    c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                    sub_graph_dict[centre_node].extend(c_nbrs)
                    for c_nbr in c_nbrs:
                        sub_graph_dict[c_nbr].append(centre_node)
            # dicts keep insertion order (Python 3.6+), so the first key in sub_graph_dict is the source paper
            return sub_graph_dict

        def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
            sub_graph = [[] for _ in range(n_hop + 1)]
            level = 0
            visited = set()
            q = deque()
            q.append([paper_id, level])
            curr_node_num = 0
            while len(q) != 0:
                paper_first = q.popleft()
                paper_id_first, level_first = paper_first
                if level_first > n_hop:
                    return sub_graph
                sub_graph[level_first].append(paper_id_first)
                curr_node_num += 1
                if curr_node_num > max_neighbor:
                    return sub_graph
                visited.add(paper_id_first)
                for pid in graph_strut_dict[paper_id_first]["references"]:
                    if pid not in visited and pid in graph_strut_dict:
                        q.append([pid, level_first + 1])
                        visited.add(pid)

            return sub_graph

        def generate_dgl_graph(paper_id, graph_struct, nodes_num):
            g = dgl.DGLGraph()
            assert len(graph_struct) == nodes_num

            g.add_nodes(len(graph_struct))
            pid2idx = {}
            for index, key_node in enumerate(graph_struct):
                pid2idx[key_node] = index
            assert pid2idx[paper_id] == 0

            for index, key_node in enumerate(graph_struct):
                neighbor = [pid2idx[node] for node in graph_struct[key_node]]
                # add self loop
                neighbor.append(index)
                key_nodes = [index] * len(neighbor)
                g.add_edges(key_nodes, neighbor)
            return g

        train_ds = None
        dataInfo = self.load(paths)

        # pop nodes in train graph in inductive setting
        if config.mode == "test" and self.setting == "inductive":
            dataInfo.datasets.pop("train")

        graph_strut_dict = {}
        for key, ds in dataInfo.datasets.items():
            for ins in ds:
                graph_strut_dict[ins["paper_id"]] = ins

        logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

        for key, ds in dataInfo.datasets.items():
            # process summary
            ds.apply(lambda x: x['abstract'].split(),
                     new_field_name='summary_wd')
            ds.apply(lambda x: sent_tokenize(x['abstract']),
                     new_field_name='abstract_sentences')
            # generate graph

            ds.apply(lambda x: generate_graph_structs(x["paper_id"]),
                     new_field_name="graph_struct")
            ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]),
                     new_field_name='graph_inputs_wd')

            ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1,
                     new_field_name="nodes_num")
            # pad input
            ds.apply(lambda x: generate_article_input(x['introduction'], x[
                "graph_inputs_wd"]),
                     new_field_name='input_wd')
            ds.apply(lambda x: _article_token_mask(x["input_wd"]),
                     new_field_name="enc_len_mask")
            ds.apply(lambda x: sum(x["enc_len_mask"]),
                     new_field_name="enc_len")
            ds.apply(lambda x: _pad_article(x["input_wd"]),
                     new_field_name="pad_input_wd")

            ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]),
                     new_field_name="nbr_inputs_len")

            ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]),
                     new_field_name="pad_graph_inputs_wd")
            if key == "train":
                train_ds = ds

        vocab_dict = {}
        if not load_vocab_file:
            logger.info("[INFO] Build new vocab from training dataset!")
            if train_ds is None:
                raise ValueError("Lack train file to build vocabulary!")

            vocabs = Vocabulary(max_size=config.vocab_size - 2,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.from_dataset(train_ds,
                                field_name=["input_wd", "summary_wd"])
            vocabs.add_word(START_DECODING)
            vocabs.add_word(STOP_DECODING)
            vocab_dict["vocab"] = vocabs
            # save vocab
            with open(os.path.join(config.train_path, "vocab"),
                      "w",
                      encoding="utf8") as f:
                for w, idx in vocabs:
                    f.write(str(w) + "\t" + str(idx) + "\n")
            logger.info(
                "build new vocab ends.. please reRun the code with load_vocab = True"
            )
            exit(0)
        else:

            logger.info("[INFO] Load existing vocab from %s!" %
                        config.vocab_path)
            word_list = []
            cnt = 3  # pad and unk
            if config.neighbor_process == "sep":
                cnt += 1

            with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
                for line in vocab_f:
                    pieces = line.split("\t")
                    word_list.append(pieces[0])
                    cnt += 1
                    if cnt > vocab_size:
                        break

            vocabs = Vocabulary(max_size=vocab_size,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.add_word_lst(word_list)
            vocabs.add(START_DECODING)
            vocabs.add(STOP_DECODING)
            if config.neighbor_process == "sep":
                vocabs.add(SEP)
            vocabs.build_vocab()
            vocab_dict["vocab"] = vocabs

        logger.info(f"vocab size = {len(vocabs)}")
        assert len(vocabs) == config.vocab_size
        dataInfo.set_vocab(vocabs, "vocab")

        for key, dataset in dataInfo.datasets.items():
            # do not process the training set in test mode
            if config.mode == "test" and key == "train":
                continue

            data_dict = {
                "enc_input": [],
                "nbr_inputs": [],
                "graph": [],
                "dec_input": [],
                "target": [],
                "dec_len": [],
                "article_oovs": [],
                "enc_input_extend_vocab": [],
            }
            logger.info(
                f"start construct the input of the model for {key} set, please wait..."
            )
            for instance in dataset:
                graph_inputs = instance["pad_graph_inputs_wd"]
                abstract_sentences = instance["summary_wd"]
                enc_input = instance["pad_input_wd"]
                enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                    getting_full_info(enc_input, graph_inputs, abstract_sentences, dataInfo.vocabs['vocab'], config)
                graph = generate_dgl_graph(instance["paper_id"],
                                           instance["graph_struct"],
                                           instance["nodes_num"])
                data_dict["graph"].append(graph)
                data_dict["enc_input"].append(enc_input)
                data_dict["nbr_inputs"].append(nbr_inputs)
                data_dict["dec_input"].append(dec_input)
                data_dict["target"].append(target)
                data_dict["dec_len"].append(dec_len)
                data_dict["article_oovs"].append(article_oovs)
                data_dict["enc_input_extend_vocab"].append(
                    enc_input_extend_vocab)

            dataset.add_field("enc_input", data_dict["enc_input"])
            dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
            dataset.add_field("dec_input", data_dict["dec_input"])
            dataset.add_field("target", data_dict["target"])
            dataset.add_field("dec_len", data_dict["dec_len"])
            dataset.add_field("article_oovs", data_dict["article_oovs"])
            dataset.add_field("enc_input_extend_vocab",
                              data_dict["enc_input_extend_vocab"])

            dataset.add_field("graph", data_dict["graph"])
            dataset.set_ignore_type(
                'graph')  # keep the DGLGraph objects as-is; otherwise fastNLP tries to type-check/convert this field and fails
            dataset.set_input("graph")

            dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len",
                              "enc_input", "enc_len_mask", "dec_input",
                              "dec_len", "article_oovs", "nodes_num",
                              "enc_input_extend_vocab")
            dataset.set_target("target", "article_oovs", "abstract_sentences")

            dataset.delete_field('graph_inputs_wd')
            dataset.delete_field('pad_graph_inputs_wd')
            dataset.delete_field('input_wd')
            dataset.delete_field('pad_input_wd')
        logger.info("------load dataset over---------")
        return dataInfo, vocabs
Code example #20
File: test_CGSum.py Project: ChenxinAn-fdu/CGSum
                        help="path to generated abstracts")
    parser.add_argument("--setting",
                        default="inductive",
                        choices=["transductive", "inductive"])
    args = parser.parse_args()

    config = Config()
    # load checkpoint
    if args.model_name is None:
        cpts = glob.glob(os.path.join(args.model_dir, "CGSum*"))
        cpts.sort(key=os.path.getmtime)
        # choose the latest checkpoint by default
        cpt_file = cpts[-1]
    else:
        cpt_file = os.path.join(args.model_dir, args.model_name)
    logger.info(f"loading checkpoint from: {cpt_file}")

    checkpoint = torch.load(cpt_file)
    # load the config file
    config.__dict__ = checkpoint["config"]
    config.min_dec_steps = args.min_dec_steps
    config.max_dec_steps = args.max_dec_steps
    config.max_graph_enc_steps = args.max_graph_enc_steps
    # write args to config
    # paths config
    config.train_path = args.dataset_dir if args.dataset_dir is not None else os.path.join(
        "SSN", args.setting)
    config.vocab_path = os.path.join(config.train_path, args.vocab_file)
    config.model_path = args.model_dir
    config.decode_path = args.decode_dir