Example #1
# Excerpt: __init__ of a pinyin-aware dataset class. At module level this
# snippet assumes os, json, BertWordPieceTokenizer (from tokenizers),
# BertTokenizer (from transformers), and set_random_seed are imported.
def __init__(self, args):
    super().__init__(args)
    print("args: ", args)
    self.args = args
    self.config_path = args.config_path
    self.mask_prob = 0.15
    self.max_len = args.max_len
    self.bert_tokenizer = BertWordPieceTokenizer(os.path.join(args.bert_path, "vocab.txt"))
    self.tokenizer = BertTokenizer.from_pretrained(os.path.join(args.bert_path, "vocab.txt"))
    self.prev_tokens = []
    self.prev_pinyins = []
    set_random_seed(12)
    # Load the pinyin mapping dictionary
    with open(os.path.join(self.config_path, 'pinyin_map.json')) as fin:
        self.pinyin_dict = json.load(fin)
    # Load the character-id -> pinyin mapping
    with open(os.path.join(self.config_path, 'id2pinyin.json')) as fin:
        self.id2pinyin = json.load(fin)
    # Load the pinyin -> tensor mapping
    with open(os.path.join(self.config_path, 'pinyin2tensor.json')) as fin:
        self.pinyin2tensor = json.load(fin)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# file: inference.py

import os
import torch
import argparse
from torch.utils.data import DataLoader
from utils.radom_seed import set_random_seed
set_random_seed(0)
from trainer import BertLabeling
from tokenizers import BertWordPieceTokenizer
from datasets.mrc_ner_dataset import MRCNERDataset
from metrics.functional.query_span_f1 import extract_flat_spans, extract_nested_spans

def get_dataloader(config, data_prefix="test"):
    """Build a batch-size-1, unshuffled DataLoader over the mrc-ner data split."""
    data_path = os.path.join(config.data_dir, f"mrc-ner.{data_prefix}")
    vocab_path = os.path.join(config.bert_dir, "vocab.txt")
    data_tokenizer = BertWordPieceTokenizer(vocab_path)

    dataset = MRCNERDataset(json_path=data_path,
                            tokenizer=data_tokenizer,
                            max_length=config.max_length,
                            is_chinese=config.is_chinese,
                            pad_to_maxlen=False)

    dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    return dataloader, data_tokenizer
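For completeness, a minimal smoke test of this dataloader; the argument names mirror the config fields get_dataloader reads, but the loop below is only an illustrative sketch, not the remainder of the original inference.py.

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="smoke-test the MRC-NER dataloader")
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--bert_dir", type=str, required=True)
    parser.add_argument("--max_length", type=int, default=128)
    parser.add_argument("--is_chinese", action="store_true")
    args = parser.parse_args()

    dataloader, tokenizer = get_dataloader(args)
    for batch in dataloader:
        # MRCNERDataset yields a sequence of tensors; printing their shapes
        # is a quick sanity check before wiring in the trained BertLabeling model.
        print([tuple(t.shape) for t in batch if torch.is_tensor(t)])
        break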