# --- Prediction-script tail (reformatted from a collapsed source line) ---
# Loads a label mapping, a trained model, and serialized vocabularies, then
# streams test rows through a Predictor, writing results to two output files.
# NOTE(review): this chunk ends mid-loop — the per-row prediction body
# presumably continues past `sentence = ...`; confirm against the full file.
predict_output_json_file_name = os.path.join(model_path, 'pred_2020-05-14-23-33-55.json')
predict_output_file_name = os.path.join(model_path, 'pred_2020-05-14-23-33-55.txt')
logger.warn('加载标签映射关系')  # log: "loading label mapping"
json_file_iter = read_json_file_iter(label_json_file_name)
label_link_dict = dict()
for row_json in json_file_iter:
    # Map each human-readable label description to its label id.
    label_link_dict[row_json['label_desc']] = row_json['label']
logger.info(label_link_dict)
logger.warn('开始加载模型')  # log: "start loading model"
model = torch.load(model_name)
model.eval()  # switch to inference mode (disables dropout, etc.)
logger.info('模型加载完毕:\n{}'.format(model))
logger.warn('获取词典')  # log: "fetching vocabularies"
# Vocabularies were pickled at training time; load the same objects here so
# indexing matches the model's embedding/target layout.
char_vocab = load_serialize_obj(char_vocab_pkl_file)
logger.info('char_vocab:{}'.format(char_vocab))
target_vocab = load_serialize_obj(target_vocab_pkl_file)
logger.info('target_vocab:{}'.format(target_vocab))
logger.warn('加载测试数据')  # log: "loading test data"
json_file_iter = read_json_file_iter(test_data_json_file_name)
predictor = Predictor(model)
# Open both output files for the duration of the prediction loop.
with codecs.open(predict_output_json_file_name, mode='w', encoding='utf8') as fw_json, \
        codecs.open(predict_output_file_name, mode='w', encoding='utf8') as fw:
    for i, row_json in enumerate(json_file_iter):
        if i % 100 == 0:
            # Periodic progress logging every 100 rows.
            logger.info('predict row:{}'.format(i))
        sentence = row_json.get('sentence', '')  # empty string if key missing
from myClue.core import logger # noqa from myClue.core.callback import EarlyStopCallback # noqa from myClue.tools.serialize import load_serialize_obj # noqa from myClue.tools.serialize import save_serialize_obj # noqa from myClue.tools.file import init_file_path # noqa if __name__ == "__main__": train_data_bundle_pkl_file = './data/weibo_NER/train_data_bundle.pkl' model_path = './data/weibo_NER/model_bilstm_crf_bert_embed' init_file_path(model_path) logger.add_file_handler(os.path.join(model_path, 'log_{}.txt'.format(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())))) # 日志写入文件 char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl') target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl') logger.warn('加载数据集') data_bundle = load_serialize_obj(train_data_bundle_pkl_file) logger.warn('获取词典') char_vocab = data_bundle.get_vocab('words') logger.info('char_vocab:{}'.format(char_vocab)) target_vocab = data_bundle.get_vocab('target') logger.info('target_vocab:{}'.format(target_vocab)) save_serialize_obj(char_vocab, char_vocab_pkl_file) save_serialize_obj(target_vocab, target_vocab_pkl_file) logger.info('词典序列化:{}'.format(char_vocab_pkl_file)) logger.warn('选择预训练词向量') # model_dir_or_name = 'cn-wwm' model_dir_or_name = './data/embed/ERNIE_1.0_max-len-512-pytorch' bert_embed = BertEmbedding(vocab=char_vocab, model_dir_or_name=model_dir_or_name, requires_grad=False) logger.warn('神经网络模型') model = BiLSTMCRF(embed=bert_embed, num_classes=len(target_vocab), num_layers=1, hidden_size=200, dropout=0.5, target_vocab=target_vocab)