# Imports required by this snippet; the set_random_seed helper path mirrors
# the utils.radom_seed module used in inference.py below.
import os
import json

from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

from utils.radom_seed import set_random_seed


def __init__(self, args):
    super().__init__(args)
    print("args: ", args)
    self.args = args
    self.config_path = args.config_path
    self.mask_prob = 0.15
    self.max_len = args.max_len
    self.bert_tokenizer = BertWordPieceTokenizer(os.path.join(args.bert_path, "vocab.txt"))
    self.tokenizer = BertTokenizer.from_pretrained(os.path.join(args.bert_path, "vocab.txt"))
    self.prev_tokens = []
    self.prev_pinyins = []
    set_random_seed(12)

    # Load the pinyin mapping dictionary
    with open(os.path.join(self.config_path, 'pinyin_map.json')) as fin:
        self.pinyin_dict = json.load(fin)
    # Load the character-id -> pinyin mapping tensor
    with open(os.path.join(self.config_path, 'id2pinyin.json')) as fin:
        self.id2pinyin = json.load(fin)
    # Load the pinyin -> tensor mapping
    with open(os.path.join(self.config_path, 'pinyin2tensor.json')) as fin:
        self.pinyin2tensor = json.load(fin)
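
# For reference, a minimal sketch of how these three mappings are typically
# consumed downstream (ChineseBERT-style pinyin embeddings): each character's
# pinyin string is encoded as a fixed-length row of ids. This is illustrative,
# not part of the original class; it assumes the `pypinyin` package and that
# pinyin_map.json exposes a "char2idx" table.
from pypinyin import pinyin, Style


def sentence_to_pinyin_ids(sentence, pinyin_dict, pinyin2tensor, seq_len=8):
    # One pinyin candidate per character; non-Chinese characters fall through
    # to a sentinel label via the errors callback.
    pinyin_rows = pinyin(sentence, style=Style.TONE3, heteronym=True,
                         errors=lambda chars: [["not chinese"] for _ in chars])
    ids = []
    for row in pinyin_rows:
        pinyin_string = row[0]
        if pinyin_string == "not chinese":
            ids.append([0] * seq_len)  # all-padding row for non-Chinese chars
        elif pinyin_string in pinyin2tensor:
            ids.append(pinyin2tensor[pinyin_string])  # precomputed id row
        else:
            # Encode letter by letter and pad to the fixed length.
            encoded = [pinyin_dict["char2idx"][c] for c in pinyin_string]
            ids.append(encoded + [0] * (seq_len - len(encoded)))
    return ids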
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# file: inference.py

import os
import torch
import argparse
from torch.utils.data import DataLoader

from utils.radom_seed import set_random_seed
set_random_seed(0)

from trainer import BertLabeling
from tokenizers import BertWordPieceTokenizer
from datasets.mrc_ner_dataset import MRCNERDataset
from metrics.functional.query_span_f1 import extract_flat_spans, extract_nested_spans


def get_dataloader(config, data_prefix="test"):
    # Build a dataloader over the MRC-NER split named by data_prefix
    # (e.g. mrc-ner.test), tokenized with the BERT WordPiece vocab.
    data_path = os.path.join(config.data_dir, f"mrc-ner.{data_prefix}")
    vocab_path = os.path.join(config.bert_dir, "vocab.txt")
    data_tokenizer = BertWordPieceTokenizer(vocab_path)

    dataset = MRCNERDataset(json_path=data_path,
                            tokenizer=data_tokenizer,
                            max_length=config.max_length,
                            is_chinese=config.is_chinese,
                            pad_to_maxlen=False)
    # batch_size=1 feeds one example per step, so no padding to max length is needed.
    dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    return dataloader, data_tokenizer
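
# A usage sketch, not part of the original file: one way get_dataloader might
# be driven for a quick smoke test. The argument names mirror the attributes
# read above (data_dir, bert_dir, max_length, is_chinese); the batch layout
# printed here depends on MRCNERDataset's output and is only assumed.
def get_parser():
    parser = argparse.ArgumentParser(description="mrc-ner inference")
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--bert_dir", type=str, required=True)
    parser.add_argument("--max_length", type=int, default=128)
    parser.add_argument("--is_chinese", action="store_true")
    return parser


if __name__ == "__main__":
    args = get_parser().parse_args()
    dataloader, tokenizer = get_dataloader(args, data_prefix="test")
    for batch in dataloader:
        # With the default collate_fn, batch holds whatever MRCNERDataset
        # yields, stacked along a leading batch dimension of size 1.
        print([t.shape for t in batch if hasattr(t, "shape")])
        break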