Example 1
    def get_dataloader(self, prefix="train", limit: int = None):
        json_path = os.path.join(self.data_dir, f"mrc-ner.{prefix}")
        dataset = MRCNERDataset(json_path=json_path,
                                tokenizer=self.tokenizer,
                                max_length=self.args.max_length,
                                possible_only=self.args.answerable_only,
                                is_chinese=self.args.is_chinese,
                                pad_to_maxlen=False,
                                negative_sampling=self.args.negative_sampling,
                                prefix=prefix,
                                data_sign=self.args.data_sign,
                                do_lower_case=self.args.do_lower_case,
                                pred_answerable=self.args.pred_answerable)

        if limit is not None:
            dataset = TruncateDataset(dataset, limit)

        if prefix == "train":
            batch_size = self.train_batch_size
            # Seeding the generator makes the shuffle order reproducible across runs;
            # with an unseeded random sampler, some orderings made the gradient explode.
            data_generator = torch.Generator()
            data_generator.manual_seed(self.args.seed)
            data_sampler = RandomSampler(dataset, generator=data_generator)
        else:
            data_sampler = SequentialSampler(dataset)
            batch_size = self.eval_batch_size

        dataloader = DataLoader(dataset=dataset,
                                sampler=data_sampler,
                                batch_size=batch_size,
                                num_workers=self.args.workers,
                                collate_fn=collate_to_max_length)

        return dataloader
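
The seeded generator is what makes the shuffle order above deterministic. A
minimal, self-contained sketch (toy dataset; everything here is illustrative,
not part of the project) showing that two samplers built with the same seed
visit indices in the same order:

import torch
from torch.utils.data import RandomSampler, TensorDataset

dataset = TensorDataset(torch.arange(10))

def sample_order(seed):
    # Fresh generator per call, mirroring get_dataloader above.
    generator = torch.Generator()
    generator.manual_seed(seed)
    return list(RandomSampler(dataset, generator=generator))

assert sample_order(42) == sample_order(42)  # same seed, same order
print(sample_order(42))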
Example 2
    def get_dataloader(self, prefix="train", limit: int = None) -> DataLoader:
        """get training dataloader"""
        """
        load_mmap_dataset
        """
        json_path = os.path.join(self.data_dir, f"mrc-ner.{prefix}")
        vocab_path = os.path.join(self.bert_dir, "vocab.txt")
        dataset = MRCNERDataset(json_path=json_path,
                                tokenizer=BertWordPieceTokenizer(vocab_file=vocab_path),
                                max_length=self.args.max_length,
                                is_chinese=self.chinese,
                                pad_to_maxlen=False
                                )

        if limit is not None:
            dataset = TruncateDataset(dataset, limit)

        dataloader = DataLoader(
            dataset=dataset,
            batch_size=self.args.batch_size,
            num_workers=self.args.workers,
            shuffle=(prefix == "train"),
            collate_fn=collate_to_max_length
        )

        return dataloader
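
TruncateDataset is defined elsewhere in the project; the sketch below shows
the behavior the examples above rely on (capping a dataset at its first
`max_num` items) and is an assumption, not the project's actual code:

from torch.utils.data import Dataset

class TruncateDataset(Dataset):
    """Expose only the first `max_num` items of a wrapped dataset (assumed behavior)."""

    def __init__(self, dataset, max_num):
        self.dataset = dataset
        self.max_num = min(max_num, len(dataset))

    def __len__(self):
        return self.max_num

    def __getitem__(self, index):
        return self.dataset[index]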
Example 3
def get_dataloader(config, data_prefix="test"):
    data_path = os.path.join(config.data_dir, f"mrc-ner.{data_prefix}")
    vocab_path = os.path.join(config.bert_dir, "vocab.txt")
    data_tokenizer = BertWordPieceTokenizer(vocab_path)

    dataset = MRCNERDataset(json_path=data_path,
                            tokenizer=data_tokenizer,
                            max_length=config.max_length,
                            is_chinese=config.is_chinese,
                            pad_to_maxlen=False)

    dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    return dataloader, data_tokenizer
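
collate_to_max_length is also project code that these snippets only import. A
hedged sketch of one plausible implementation, padding each field of the batch
to that field's longest sequence (the real function may pad labels or masks
differently):

import torch

def collate_to_max_length(batch):
    """Pad a batch of tuple-of-1D-tensor samples to per-field max length.

    Sketch under the assumption that every sample is a tuple of 1-D tensors
    in the same order; pads with zeros.
    """
    num_fields = len(batch[0])
    output = []
    for field_idx in range(num_fields):
        max_len = max(sample[field_idx].size(0) for sample in batch)
        padded = torch.zeros(len(batch), max_len,
                             dtype=batch[0][field_idx].dtype)
        for sample_idx, sample in enumerate(batch):
            field = sample[field_idx]
            padded[sample_idx, :field.size(0)] = field
        output.append(padded)
    return output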
Example 4
    bert_config = BertQueryNerConfig.from_pretrained(
        args.bert_config_dir,
        hidden_dropout_prob=args.bert_dropout,
        attention_probs_dropout_prob=args.bert_dropout,
        mrc_dropout=args.mrc_dropout)
    model = BertQueryNER.from_pretrained(args.bert_config_dir,
                                         config=bert_config).to(device)

    log = Logger(os.path.join(args.output_dir, "all.log"), level='debug')
    log.logger.info('start training')

    train_json_path = os.path.join(json_path, 'mrc-ner.train')
    dev_json_path = os.path.join(json_path, 'mrc-ner.dev')

    train_dataset = MRCNERDataset(json_path=train_json_path,
                                  tokenizer=tokenizer,
                                  is_chinese=is_chinese)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  collate_fn=collate_to_max_length,
                                  shuffle=True)
    dev_dataset = MRCNERDataset(json_path=dev_json_path,
                                tokenizer=tokenizer,
                                is_chinese=is_chinese)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=args.batch_size,
                                collate_fn=collate_to_max_length)

    train(model, train_dataloader, args, dev_dataloader)

    print(
        '----------------------------------------------------------------------'
    )
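
BertQueryNerConfig.from_pretrained follows the Hugging Face convention that
extra keyword arguments override fields of the loaded config, which is how the
dropout values above take effect. A quick illustration with the stock
BertConfig (the model name is only an example):

from transformers import BertConfig

config = BertConfig.from_pretrained("bert-base-uncased",
                                    hidden_dropout_prob=0.3,
                                    attention_probs_dropout_prob=0.3)
assert config.hidden_dropout_prob == 0.3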
Example 5
    output_dir = os.path.join(args.output_dir, "best_f1_checkpoint")

    bert_config = BertQueryNerConfig.from_pretrained(
        output_dir,
        hidden_dropout_prob=args.bert_dropout,
        attention_probs_dropout_prob=args.bert_dropout,
        mrc_dropout=args.mrc_dropout)
    model = BertQueryNER.from_pretrained(output_dir,
                                         config=bert_config).to(device)

    train_json_path = os.path.join(json_path, 'mrc-ner.train')
    dev_json_path = os.path.join(json_path, 'mrc-ner.dev')
    test_json_path = os.path.join(json_path, 'mrc-ner.test')

    dev_dataset = MRCNERDataset(json_path=dev_json_path,
                                tokenizer=tokenizer,
                                possible_only=False,
                                is_chinese=is_chinese)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=1,
                                collate_fn=collate_to_max_length)

    print(
        '----------------------------------------------------------------------'
    )

    span_recall, span_precision, span_f1 = dev(model, dev_dataloader, args)
    print('recall: {:f}, precision: {:f}, f1_score: {:f}'.format(
        span_recall, span_precision, span_f1))
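
The printed metrics are the usual set-level span scores. A minimal sketch of
the computation dev() is assumed to perform, over (start, end, label) triples
(the repository's evaluation may aggregate counts per batch instead):

def span_prf(pred_spans, gold_spans):
    """Precision/recall/F1 over sets of (start, end, label) spans."""
    pred, gold = set(pred_spans), set(gold_spans)
    tp = len(pred & gold)
    precision = tp / len(pred) if pred else 0.0
    recall = tp / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return recall, precision, f1

# One correct span, one spurious prediction, one missed gold span:
print(span_prf({(0, 2, "PER"), (5, 6, "LOC")}, {(0, 2, "PER"), (8, 9, "ORG")}))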