def get_dataloader(self, prefix="train", limit: int = None) -> DataLoader:
    """Build the dataloader for the mrc-ner.{prefix} split."""
    json_path = os.path.join(self.data_dir, f"mrc-ner.{prefix}")
    dataset = MRCNERDataset(json_path=json_path,
                            tokenizer=self.tokenizer,
                            max_length=self.args.max_length,
                            possible_only=self.args.answerable_only,
                            is_chinese=self.args.is_chinese,
                            pad_to_maxlen=False,
                            negative_sampling=self.args.negative_sampling,
                            prefix=prefix,
                            data_sign=self.args.data_sign,
                            do_lower_case=self.args.do_lower_case,
                            pred_answerable=self.args.pred_answerable)

    if limit is not None:
        dataset = TruncateDataset(dataset, limit)

    if prefix == "train":
        batch_size = self.train_batch_size
        # Seeding a dedicated generator makes the shuffling order reproducible
        # across runs; an unseeded random sampler is avoided here because the
        # gradient may explode.
        data_generator = torch.Generator()
        data_generator.manual_seed(self.args.seed)
        data_sampler = RandomSampler(dataset, generator=data_generator)
    else:
        data_sampler = SequentialSampler(dataset)
        batch_size = self.eval_batch_size

    dataloader = DataLoader(dataset=dataset,
                            sampler=data_sampler,
                            batch_size=batch_size,
                            num_workers=self.args.workers,
                            collate_fn=collate_to_max_length)
    return dataloader
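The seeded `torch.Generator` above is what pins down the shuffling order. A minimal, self-contained demonstration of that behavior (not repo code; the toy dataset and seeds are arbitrary):

import torch
from torch.utils.data import RandomSampler, TensorDataset

# Toy dataset of 10 items; RandomSampler yields a shuffled index order.
dataset = TensorDataset(torch.arange(10))

def shuffled_order(seed: int) -> list:
    generator = torch.Generator()
    generator.manual_seed(seed)
    return list(RandomSampler(dataset, generator=generator))

# The same seed reproduces the exact same epoch ordering across runs.
assert shuffled_order(42) == shuffled_order(42)
print(shuffled_order(42))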
def get_dataloader(self, prefix="train", limit: int = None) -> DataLoader:
    """Get the train/dev/test dataloader (load_mmap_dataset)."""
    json_path = os.path.join(self.data_dir, f"mrc-ner.{prefix}")
    vocab_path = os.path.join(self.bert_dir, "vocab.txt")
    dataset = MRCNERDataset(json_path=json_path,
                            tokenizer=BertWordPieceTokenizer(vocab_file=vocab_path),
                            max_length=self.args.max_length,
                            is_chinese=self.chinese,
                            pad_to_maxlen=False)

    if limit is not None:
        dataset = TruncateDataset(dataset, limit)

    dataloader = DataLoader(dataset=dataset,
                            batch_size=self.args.batch_size,
                            num_workers=self.args.workers,
                            shuffle=(prefix == "train"),
                            collate_fn=collate_to_max_length)
    return dataloader
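Both versions hand batches to `collate_to_max_length`. As an illustration only, here is a minimal dynamic-padding collate in that spirit, assuming each dataset item is a list of 1-D tensors that share one sequence length; the repo's actual implementation may differ in fields and padding values:

from typing import List
import torch

def pad_to_batch_max(batch: List[List[torch.Tensor]], pad_value: int = 0) -> List[torch.Tensor]:
    # Pad every field to the longest sequence in this batch and stack
    # along the batch dimension.
    num_fields = len(batch[0])
    max_len = max(sample[0].shape[0] for sample in batch)
    padded_fields = []
    for field_idx in range(num_fields):
        field = torch.full((len(batch), max_len), pad_value,
                           dtype=batch[0][field_idx].dtype)
        for sample_idx, sample in enumerate(batch):
            seq = sample[field_idx]
            field[sample_idx, :seq.shape[0]] = seq
        padded_fields.append(field)
    return padded_fields

# Example: two samples of lengths 3 and 2 become padded (2, 3) tensors.
batch = [[torch.tensor([1, 2, 3]), torch.tensor([0, 0, 1])],
         [torch.tensor([4, 5]), torch.tensor([1, 0])]]
input_ids, labels = pad_to_batch_max(batch)
# input_ids == tensor([[1, 2, 3], [4, 5, 0]])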
def get_dataloader(config, data_prefix="test"):
    data_path = os.path.join(config.data_dir, f"mrc-ner.{data_prefix}")
    vocab_path = os.path.join(config.bert_dir, "vocab.txt")
    data_tokenizer = BertWordPieceTokenizer(vocab_path)

    dataset = MRCNERDataset(json_path=data_path,
                            tokenizer=data_tokenizer,
                            max_length=config.max_length,
                            is_chinese=config.is_chinese,
                            pad_to_maxlen=False)

    # batch_size=1 and no shuffling keep inference deterministic and
    # make per-example decoding straightforward.
    dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)
    return dataloader, data_tokenizer
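A hedged usage sketch of this inference helper; the `Namespace` fields below simply mirror the attributes the function reads and are assumptions, not the repo's actual CLI:

from argparse import Namespace

# Hypothetical config; data_dir/bert_dir/max_length/is_chinese mirror the
# attributes get_dataloader reads. Paths are placeholders.
config = Namespace(data_dir="data/zh_msra",
                   bert_dir="chinese_bert_base",
                   max_length=128,
                   is_chinese=True)

test_dataloader, test_tokenizer = get_dataloader(config, data_prefix="test")
for batch in test_dataloader:
    # each iteration yields a single example (batch_size=1)
    ...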
bert_config = BertQueryNerConfig.from_pretrained(args.bert_config_dir,
                                                 hidden_dropout_prob=args.bert_dropout,
                                                 attention_probs_dropout_prob=args.bert_dropout,
                                                 mrc_dropout=args.mrc_dropout)
model = BertQueryNER.from_pretrained(args.bert_config_dir, config=bert_config).to(device)

log = Logger(os.path.join(args.output_dir, "all.log"), level='debug')
log.logger.info('Start training')

train_json_path = os.path.join(json_path, 'mrc-ner.train')
dev_json_path = os.path.join(json_path, 'mrc-ner.dev')

train_dataset = MRCNERDataset(json_path=train_json_path, tokenizer=tokenizer, is_chinese=is_chinese)
train_dataloader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              collate_fn=collate_to_max_length,
                              shuffle=True)

dev_dataset = MRCNERDataset(json_path=dev_json_path, tokenizer=tokenizer, is_chinese=is_chinese)
dev_dataloader = DataLoader(dev_dataset,
                            batch_size=args.batch_size,
                            collate_fn=collate_to_max_length)

train(model, train_dataloader, args, dev_dataloader)
print('----------------------------------------------------------------------')
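`train()` itself is not shown here. A minimal sketch of what such a loop could look like, assuming a hypothetical `compute_loss(model, batch)` helper and `lr`/`epochs`/`max_grad_norm` fields on `args`; the real forward and loss signatures live in the repo:

import torch
from torch.optim import AdamW

def train_sketch(model, train_dataloader, args, dev_dataloader=None):
    # Plain AdamW loop; compute_loss is a hypothetical stand-in for however
    # the repo turns a batch plus model outputs into a scalar loss.
    optimizer = AdamW(model.parameters(), lr=args.lr)
    model.train()
    for epoch in range(args.epochs):
        for batch in train_dataloader:
            optimizer.zero_grad()
            loss = compute_loss(model, batch)  # hypothetical helper
            loss.backward()
            # Gradient clipping guards against the exploding gradients
            # mentioned in the dataloader comment above.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()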
# Reload the checkpoint that achieved the best dev F1, then re-evaluate.
output_dir = os.path.join(args.output_dir, "best_f1_checkpoint")
bert_config = BertQueryNerConfig.from_pretrained(output_dir,
                                                 hidden_dropout_prob=args.bert_dropout,
                                                 attention_probs_dropout_prob=args.bert_dropout,
                                                 mrc_dropout=args.mrc_dropout)
model = BertQueryNER.from_pretrained(output_dir, config=bert_config).to(device)

train_json_path = os.path.join(json_path, 'mrc-ner.train')
dev_json_path = os.path.join(json_path, 'mrc-ner.dev')
test_json_path = os.path.join(json_path, 'mrc-ner.test')

dev_dataset = MRCNERDataset(json_path=dev_json_path,
                            tokenizer=tokenizer,
                            possible_only=False,
                            is_chinese=is_chinese)
dev_dataloader = DataLoader(dev_dataset, batch_size=1, collate_fn=collate_to_max_length)

print('----------------------------------------------------------------------')
span_recall, span_precision, span_f1 = dev(model, dev_dataloader, args)
print('recall:{:f} ,precision:{:f} ,f1_score:{:f}'.format(span_recall, span_precision, span_f1))
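`dev()` returns span-level recall, precision, and F1. For reference, a minimal sketch of how those metrics are conventionally derived from true-positive, predicted, and gold span counts (an illustration, not the repo's dev()):

def span_prf(num_true_positive: int, num_pred: int, num_gold: int):
    # Standard span-level metrics: a predicted entity counts as a true
    # positive only if it exactly matches a gold span.
    precision = num_true_positive / num_pred if num_pred else 0.0
    recall = num_true_positive / num_gold if num_gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return recall, precision, f1

# e.g. 80 exact matches out of 100 predictions against 90 gold spans:
print(span_prf(80, 100, 90))  # (0.888..., 0.8, 0.8421...)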