Example 1
import os
import time

from fastNLP import Const, CrossEntropyLoss
from fastNLP.embeddings import StaticEmbedding
from fastNLP.models import CNNText

# logger, print_data_bundle, init_file_path and save_serialize_obj are
# project-local helpers (see the sketch after this example); data_pipe and
# data_bundle are prepared earlier in the original script (not shown here).

data_bundle = data_pipe.process(data_bundle)
data_bundle.rename_field(field_name=Const.CHAR_INPUT,
                         new_field_name=Const.INPUT,
                         ignore_miss_dataset=True,
                         rename_vocab=True)
print_data_bundle(data_bundle)
model_path = './data/UCAS_NLP_TC/model_textcnn_topk'
init_file_path(model_path)
logger.add_file_handler(
    os.path.join(
        model_path, 'log_{}.txt'.format(
            time.strftime("%Y-%m-%d-%H-%M-%S",
                          time.localtime()))))  # write training logs to a file
char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl')
target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')
logger.warn('Load vocabularies')
char_vocab = data_bundle.get_vocab('words')
logger.info('char_vocab:{}'.format(char_vocab))
target_vocab = data_bundle.get_vocab('target')
logger.info('target_vocab:{}'.format(target_vocab))
save_serialize_obj(char_vocab, char_vocab_pkl_file)
save_serialize_obj(target_vocab, target_vocab_pkl_file)
logger.info('Vocabulary serialized to: {}'.format(char_vocab_pkl_file))
logger.warn('Select pretrained embeddings')
word2vec_embed = StaticEmbedding(char_vocab,
                                 model_dir_or_name='cn-char-fastnlp-100d')
logger.warn('Build the neural network model')
model = CNNText(word2vec_embed, num_classes=len(target_vocab))
logger.info(model)
logger.warn('Set training hyperparameters')
loss = CrossEntropyLoss()
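Both examples rely on a few small project-local helpers whose implementations are not shown (init_file_path, save_serialize_obj / load_serialize_obj, read_json_file_iter; print_data_bundle presumably just logs the DataBundle contents). Below is a minimal sketch of what they could look like, assuming plain pickle serialization and a JSON-lines label file; the names and signatures are taken from the calls above and below, the bodies are an assumption.

import json
import os
import pickle


def init_file_path(path):
    # assumed: make sure the model directory exists
    os.makedirs(path, exist_ok=True)


def save_serialize_obj(obj, filename):
    # assumed: pickle an object (e.g. a fastNLP Vocabulary) to disk
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)


def load_serialize_obj(filename):
    # assumed: inverse of save_serialize_obj
    with open(filename, 'rb') as f:
        return pickle.load(f)


def read_json_file_iter(filename):
    # assumed: the file is JSON lines; yield one dict per non-empty line
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)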
Example 2
import os

import torch

# logger, load_serialize_obj and read_json_file_iter are project-local helpers
# (see the sketch after Example 1); they are not part of fastNLP or PyTorch.

if __name__ == "__main__":
    """
    Example prediction output: {"id": 0, "label": "102", "label_desc": "news_entertainment"}
    """
    model_path = './data/tnews_public/model_textcnn'
    test_data_json_file_name = './data/tnews_public/test.json'
    label_json_file_name = './data/tnews_public/labels.json'
    char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl')
    target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')
    model_name = os.path.join(model_path, 'best_CNNText_f_2020-05-14-23-33-55')
    predict_output_json_file_name = os.path.join(
        model_path, 'pred_2020-05-14-23-33-55.json')
    predict_output_file_name = os.path.join(model_path,
                                            'pred_2020-05-14-23-33-55.txt')
    logger.warn('Load label mapping')
    json_file_iter = read_json_file_iter(label_json_file_name)
    label_link_dict = dict()
    for row_json in json_file_iter:
        label_link_dict[row_json['label_desc']] = row_json['label']
    logger.info(label_link_dict)
    logger.warn('Start loading the model')
    model = torch.load(model_name)
    model.eval()
    logger.info('Model loaded:\n{}'.format(model))
    logger.warn('Load vocabularies')
    char_vocab = load_serialize_obj(char_vocab_pkl_file)
    logger.info('char_vocab:{}'.format(char_vocab))
    target_vocab = load_serialize_obj(target_vocab_pkl_file)
    logger.info('target_vocab:{}'.format(target_vocab))
    logger.warn('Load test data')
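    # --- Illustrative sketch only, not part of the original script: one way the
    # loaded pieces could be combined for a single prediction, assuming the
    # target vocabulary stores label_desc strings (as label_link_dict suggests)
    # and that CNNText returns its logits under the 'pred' key. sample_text is a
    # hypothetical placeholder headline.
    sample_text = '周杰伦发布新专辑'
    word_ids = torch.LongTensor([[char_vocab.to_index(c) for c in sample_text]])
    seq_len = torch.LongTensor([len(sample_text)])
    with torch.no_grad():
        logits = model(word_ids, seq_len)['pred']
    label_desc = target_vocab.to_word(logits.argmax(dim=-1).item())
    prediction = {'label': label_link_dict.get(label_desc, ''), 'label_desc': label_desc}
    logger.info(prediction)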