Example no. 1
0
 # NOTE(review): truncated fragment (enclosing function header and loop tail are
 # not visible). Flow: build output paths, load the label mapping, the trained
 # model and its vocabularies, then stream test rows through a Predictor while
 # writing results to a JSON file and a plain-text file.
 predict_output_json_file_name = os.path.join(
     model_path, 'pred_2020-05-14-23-33-55.json')
 predict_output_file_name = os.path.join(model_path,
                                         'pred_2020-05-14-23-33-55.txt')
 logger.warn('加载标签映射关系')  # "Loading label mapping"
 json_file_iter = read_json_file_iter(label_json_file_name)
 # Map human-readable label description -> label id, one JSON row at a time.
 label_link_dict = dict()
 for row_json in json_file_iter:
     label_link_dict[row_json['label_desc']] = row_json['label']
 logger.info(label_link_dict)
 logger.warn('开始加载模型')  # "Start loading model"
 # NOTE(review): torch.load without map_location assumes the device the model
 # was saved on is available — confirm for CPU-only inference.
 model = torch.load(model_name)
 model.eval()  # inference mode: disables dropout / batch-norm updates
 logger.info('模型加载完毕:\n{}'.format(model))
 logger.warn('获取词典')  # "Fetching vocabularies"
 char_vocab = load_serialize_obj(char_vocab_pkl_file)
 logger.info('char_vocab:{}'.format(char_vocab))
 target_vocab = load_serialize_obj(target_vocab_pkl_file)
 logger.info('target_vocab:{}'.format(target_vocab))
 logger.warn('加载测试数据')  # "Loading test data"
 json_file_iter = read_json_file_iter(test_data_json_file_name)
 predictor = Predictor(model)
 # Predictions go to two files at once: structured JSON and plain text.
 with codecs.open(
         predict_output_json_file_name, mode='w',
         encoding='utf8') as fw_json, codecs.open(predict_output_file_name,
                                                  mode='w',
                                                  encoding='utf8') as fw:
     for i, row_json in enumerate(json_file_iter):
         if i % 100 == 0:
             logger.info('predict row:{}'.format(i))  # progress every 100 rows
         sentence = row_json.get('sentence', '')  # tolerate rows without 'sentence'
Example no. 2
0
from myClue.core import logger  # noqa
from myClue.core.callback import EarlyStopCallback  # noqa
from myClue.tools.serialize import load_serialize_obj  # noqa
from myClue.tools.serialize import save_serialize_obj  # noqa
from myClue.tools.file import init_file_path  # noqa


if __name__ == "__main__":
    train_data_bundle_pkl_file = './data/weibo_NER/train_data_bundle.pkl'
    model_path = './data/weibo_NER/model_bilstm_crf_bert_embed'
    init_file_path(model_path)
    logger.add_file_handler(os.path.join(model_path, 'log_{}.txt'.format(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))))  # 日志写入文件
    char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl')
    target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')
    logger.warn('加载数据集')
    data_bundle = load_serialize_obj(train_data_bundle_pkl_file)
    logger.warn('获取词典')
    char_vocab = data_bundle.get_vocab('words')
    logger.info('char_vocab:{}'.format(char_vocab))
    target_vocab = data_bundle.get_vocab('target')
    logger.info('target_vocab:{}'.format(target_vocab))
    save_serialize_obj(char_vocab, char_vocab_pkl_file)
    save_serialize_obj(target_vocab, target_vocab_pkl_file)
    logger.info('词典序列化:{}'.format(char_vocab_pkl_file))
    logger.warn('选择预训练词向量')
    # model_dir_or_name = 'cn-wwm'
    model_dir_or_name = './data/embed/ERNIE_1.0_max-len-512-pytorch'
    bert_embed = BertEmbedding(vocab=char_vocab, model_dir_or_name=model_dir_or_name, requires_grad=False)
    logger.warn('神经网络模型')
    model = BiLSTMCRF(embed=bert_embed, num_classes=len(target_vocab), num_layers=1, hidden_size=200, dropout=0.5,
                      target_vocab=target_vocab)