def HMM_test_standard(if_train=True):
    """Train (or load) the standard HMM tagger on ResumeNER and report metrics.

    Args:
        if_train: when True, retrain even if a saved checkpoint exists.
    """
    model_is_existed = os.path.exists(ModelPathConfig.hmm_standard)
    print("upload data!")
    resume_dir = os.path.join(os.getcwd(), "data", 'ResumeNER')
    word_lists, tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir=resume_dir)
    test_word_lists, test_tag_lists, _, _ = build_corpus(
        "test", data_dir=resume_dir)

    hmm_model = HMM_standard(len(tag2id), len(word2id))
    if if_train or not model_is_existed:
        print("start to training")
        hmm_model.train(word_lists, tag_lists, word2id, tag2id)
        print("save the model")
        save_model(hmm_model, ModelPathConfig.hmm_standard)
    else:
        print("load model")
        hmm_model = load_model(ModelPathConfig.hmm_standard)

    pred_tag_lists = hmm_model.test(test_word_lists, word2id, tag2id)
    label_tag_lists = test_tag_lists

    # Entity-level evaluation.
    df = unitstopd(evaluate_entity_label(pred_tag_lists, label_tag_lists,
                                         list(tag2id.keys())))
    df.to_csv(ResultPathConfig.hmm_entity_standard)
    print(tabulate(df, headers='keys', tablefmt='psql'))

    # Token-level (single-label) evaluation.
    df = unitstopd(evaluate_single_label(pred_tag_lists, label_tag_lists,
                                         list(tag2id.keys())))
    df.to_csv(ResultPathConfig.hmm_model_standard)
    print(tabulate(df, headers='keys', tablefmt='psql'))
def main():
    """Train the Bi-LSTM+CRF model and evaluate the result."""
    # Load the data splits.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # The CRF-decorated LSTM needs <start>/<end> markers for decoding.
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra preprocessing required by the LSTM-CRF pipeline.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)

    mode = 'train'
    if mode == 'train':
        print("正在训练评估Bi-LSTM+CRF模型...")
        lstmcrf_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists),
            (test_word_lists, test_tag_lists),
            crf_word2id, crf_tag2id)
    elif mode == 'generate':
        lstmcrf_pred = bilstm_eval((test_word_lists, test_tag_lists),
                                   crf_word2id, crf_tag2id)
        print(lstmcrf_pred)
def bilstm_crf_test(if_train=False):
    """Train or load a BiLSTM-CRF tagger, then evaluate it on the test split.

    Args:
        if_train: when True, retrain even if a saved checkpoint exists.
    """
    model_is_existed = os.path.exists(ModelPathConfig.bilstm_crf)
    print("upload data!")
    word_lists, tag_lists, word2id, tag2id = build_corpus("train")
    test_word_lists, test_tag_lists, _, _ = build_corpus("test")
    labels = list(tag2id.keys())

    # Hold out one fifth of the training sentences as a dev split.
    dev_indices = random.sample(range(len(word_lists)), len(word_lists) // 5)
    train_indices = [i for i in range(len(word_lists)) if i not in dev_indices]
    dev_word_lists = [word_lists[i] for i in dev_indices]
    dev_tag_lists = [tag_lists[i] for i in dev_indices]
    train_word_lists = [word_lists[i] for i in train_indices]
    train_tag_lists = [tag_lists[i] for i in train_indices]

    test_word_lists, test_tag_lists = add_label_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    bilstm_crf_word2id, bilstm_crf_tag2id = extend_map(word2id, tag2id,
                                                       crf=True)

    if if_train or not model_is_existed:
        print('start to training')
        train_word_lists, train_tag_lists = add_label_for_lstmcrf(
            train_word_lists, train_tag_lists, test=False)
        dev_word_lists, dev_tag_lists = add_label_for_lstmcrf(
            dev_word_lists, dev_tag_lists, test=False)
        start = datetime.now()
        bilstm_model = BiLSTM_CRF_Model(len(bilstm_crf_word2id), len(tag2id),
                                        crf=True)
        bilstm_model.train(train_word_lists, train_tag_lists,
                           bilstm_crf_word2id, bilstm_crf_tag2id,
                           dev_word_lists, dev_tag_lists)
        deltatime = datetime.now() - start
        print("Training is finished, {} second".format(deltatime.seconds))
        save_model(bilstm_model, ModelPathConfig.bilstm_crf)
        print("Save the model")
    else:
        print("load model")
        bilstm_model = load_model(ModelPathConfig.bilstm_crf)

    print("test the model")
    pred_tag_lists, label_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, bilstm_crf_word2id, bilstm_crf_tag2id)

    # Entity-level evaluation.
    df = unitstopd(evaluate_entity_label(pred_tag_lists, label_tag_lists,
                                         labels))
    df.to_csv(ResultPathConfig.bilstm_crf_entity)
    print(tabulate(df, headers='keys', tablefmt='psql'))

    # Token-level evaluation.
    df = unitstopd(evaluate_single_label(pred_tag_lists, label_tag_lists,
                                         labels))
    df.to_csv(ResultPathConfig.bilstm_crf_model)
    print(tabulate(df, headers='keys', tablefmt='psql'))
def main():
    """Tag a hard-coded demo sentence with the saved BiLSTM+CRF model."""
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    word_string = "傅城州,博士,广东药科大学医药信息工程学院教师,中国计算机学会会员,中国计算机学会青年计算机科技论坛(YOCSEF)广州分论坛AC委员、学术秘书。本科和硕士(推免)毕业于华南师范大学计算机学院软件工程专业,2017年6月获得华南师范大学服务计算理论与技术理学博士学位(导师:汤庸教授)。"
    # Replace the test set with the demo sentence, one character per token.
    word_list = [ch for ch in word_string]
    test_word_lists = [word_list]

    print("加载并评估bilstm+crf模型...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning

    print(crf_tag2id)
    id2tag = {v: k for k, v in crf_tag2id.items()}
    lstmcrf_pred = bilstm_model.test(test_word_lists, crf_word2id, crf_tag2id)

    # Print one "char  tag_id  tag_name" line per character.
    for word_list, tag_list in zip(test_word_lists, lstmcrf_pred):
        for word, tag in zip(word_list, tag_list):
            print(word, " ", tag.item(), " ", id2tag[tag.item()])
    print(lstmcrf_pred)
def bert_test():
    """Train (or load) a BERT tagger and report entity/token level metrics.

    The model is retrained when no checkpoint exists at
    ``ModelPathConfig.bert``; otherwise the saved checkpoint is loaded.
    Results are written to ``ResultPathConfig.bert_entity`` and
    ``ResultPathConfig.bert_model`` and echoed as tables.
    """
    model_is_existed = os.path.exists(ModelPathConfig.bert)
    print("upload data!")
    word_lists, tag_lists, word2id, tag2id = build_corpus("train")
    test_word_lists, test_tag_lists, _, _ = build_corpus("test")
    labels = list(tag2id.keys())

    # Hold out one fifth of the training sentences as a dev split.
    dev_indices = random.sample(range(len(word_lists)), len(word_lists) // 5)
    train_indices = [i for i in range(len(word_lists)) if i not in dev_indices]
    dev_word_lists = [word_lists[ind] for ind in dev_indices]
    dev_tag_lists = [tag_lists[ind] for ind in dev_indices]
    train_word_lists = [word_lists[ind] for ind in train_indices]
    train_tag_lists = [tag_lists[ind] for ind in train_indices]

    bert_tag2id = extend_map_bert(tag2id)

    if not model_is_existed:
        print('start to training')
        start = datetime.now()
        vocab_size = len(word2id)
        out_size = len(bert_tag2id)
        bert_model = BERT_Model(vocab_size, out_size)
        bert_model.train(train_word_lists, train_tag_lists,
                         word2id, bert_tag2id, dev_word_lists, dev_tag_lists)
        deltatime = datetime.now() - start
        print("Training is finished, {} second".format(deltatime.seconds))
        try:
            print("Save the model")
            save_model(bert_model, ModelPathConfig.bert)
        # FIX: was a bare `except:` which also swallows SystemExit and
        # KeyboardInterrupt; keep the best-effort save but narrow the catch.
        except Exception:
            print("fail to save model")
    else:
        try:
            print("load model")
            bert_model = load_model(ModelPathConfig.bert)
        # FIX: bare `except:` narrowed; without a model we cannot continue.
        except Exception:
            print("fail to load model")
            sys.exit(0)

    print("test the model")
    pred_tag_lists = bert_model.test(test_word_lists, test_tag_lists,
                                     word2id, bert_tag2id)
    label_tag_lists = test_tag_lists

    # Entity-level evaluation.
    df = unitstopd(evaluate_entity_label(pred_tag_lists, label_tag_lists,
                                         labels))
    df.to_csv(ResultPathConfig.bert_entity)
    print(tabulate(df, headers='keys', tablefmt='psql'))

    # Token-level evaluation.
    df = unitstopd(evaluate_single_label(pred_tag_lists, label_tag_lists,
                                         labels))
    df.to_csv(ResultPathConfig.bert_model)
    print(tabulate(df, headers='keys', tablefmt='psql'))
def main():
    """Train a plain BiLSTM tagger, save it, and evaluate on the test split."""
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id)

    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, out_size)
    bilstm_model.train(train_word_lists, train_tag_lists,
                       dev_word_lists, dev_tag_lists, word2id, tag2id)

    model_name = "bilstm"
    save_model(bilstm_model, "./save/" + model_name + ".pkl")

    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, word2id, tag2id)

    print("cal the res...")
    metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=False)
    metrics.haha()
def main():
    """Parse the --model choice and train/evaluate the selected tagger."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        default="BILSTM",
                        help="model in [HMM,CRF,BILSTM,BILSTM-CRF,ENSEMBLE]",
                        type=str)
    params = vars(parser.parse_args())

    # Load the data splits.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    chosen = params["model"]
    if chosen == "HMM":
        print("正在训练评估HMM模型...")
        hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists),
                                  word2id, tag2id)
    elif chosen == "CRF":
        print("正在训练评估CRF模型...")
        crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists))
    elif chosen == "BILSTM":
        print("正在训练评估BI-LSTM模型...")
        # LSTM training needs PAD and UNK added to word2id/tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id,
                                                    for_crf=False)
        lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                          (dev_word_lists, dev_tag_lists),
                                          (test_word_lists, test_tag_lists),
                                          bilstm_word2id, bilstm_tag2id,
                                          crf=False)
    elif chosen == "BILSTM-CRF":
        print("正在训练评估Bi-LSTM+CRF模型...")
        # The CRF variant needs <start>/<end> markers for decoding.
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        # Additional preprocessing for the LSTM-CRF pipeline.
        train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
            train_word_lists, train_tag_lists)
        dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
            dev_word_lists, dev_tag_lists)
        test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
            test_word_lists, test_tag_lists, test=True)
        lstmcrf_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists),
            (test_word_lists, test_tag_lists),
            crf_word2id, crf_tag2id)
    elif chosen == "ENSEMBLE":
        # Model ensembling is not implemented yet.
        print()
def main():
    """Load the saved taggers, evaluate each, then ensemble the predictions."""
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)
    dev_word_lists_, dev_word_lists_raw, article_id = loadDevFile(
        "development_2.txt")

    print("加载并评估hmm模型...")
    hmm_model = load_model(HMM_MODEL_PATH)
    # BUG FIX: `hmm_pred` was referenced in Metrics/ensemble below but never
    # assigned (its computation had been commented out) -> NameError.
    hmm_pred = hmm_model.test(test_word_lists, word2id, tag2id)
    hmm_pred_dev = hmm_model.test(dev_word_lists_, word2id, tag2id)
    output_pred(hmm_pred_dev, article_id, dev_word_lists_raw)
    metrics = Metrics(test_tag_lists, hmm_pred, remove_O=REMOVE_O)
    metrics.report_scores()  # per-tag precision / recall / f1
    metrics.report_confusion_matrix()  # confusion matrix

    # Load and evaluate the CRF model.
    print("加载并评估crf模型...")
    crf_model = load_model(CRF_MODEL_PATH)
    crf_pred = crf_model.test(test_word_lists)
    metrics = Metrics(test_tag_lists, crf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    # BiLSTM model.
    print("加载并评估bilstm模型...")
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    bilstm_model = load_model(BiLSTM_MODEL_PATH)
    bilstm_model.model.bilstm.flatten_parameters()  # remove warning
    lstm_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, bilstm_word2id, bilstm_tag2id)
    metrics = Metrics(target_tag_list, lstm_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    print("加载并评估bilstm+crf模型...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)
    metrics = Metrics(target_tag_list, lstmcrf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    ensemble_evaluate([hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
                      test_tag_lists)
def main():
    """Sanity-check the saved BiLSTM+CRF model on ten training sentences."""
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    # Deliberately evaluate on the training split itself.
    test_word_lists, test_tag_lists = build_corpus("train", make_vocab=False)

    print("加载并评估bilstm+crf模型...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning

    # Only the first ten sentences are needed for the sanity check.
    test_word_lists = test_word_lists[:10]
    test_tag_lists = test_tag_lists[:10]
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)

    lstmcrf_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)
    print(target_tag_list)
    print(lstmcrf_pred)

    metrics = Metrics(target_tag_list, lstmcrf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()
def main():
    """Train and evaluate HMM, CRF, BiLSTM and BiLSTM+CRF, then ensemble."""
    print("读取数据...")
    data_folder = "./data123"
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir=data_folder)
    dev_word_lists, dev_tag_lists = build_corpus(
        "dev", make_vocab=False, data_dir=data_folder)
    test_word_lists, test_tag_lists = build_corpus(
        "test", make_vocab=False, data_dir=data_folder)

    print("正在训练评估HMM模型...")
    hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists),
                              word2id, tag2id)

    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists))

    print("正在训练评估双向LSTM模型...")
    # LSTM training needs PAD and UNK added to word2id/tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                      (dev_word_lists, dev_tag_lists),
                                      (test_word_lists, test_tag_lists),
                                      bilstm_word2id, bilstm_tag2id,
                                      crf=False)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The CRF variant needs <start>/<end> markers for decoding.
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Additional preprocessing for the LSTM-CRF pipeline.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (train_word_lists, train_tag_lists),
        (dev_word_lists, dev_tag_lists),
        (test_word_lists, test_tag_lists),
        crf_word2id, crf_tag2id)

    ensemble_evaluate([hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
                      test_tag_lists)
def main():
    """Load each saved tagger, report its metrics, then ensemble all four."""
    print("Read data...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    print("Load and evaluate the hmm model...")
    hmm_model = load_model(HMM_MODEL_PATH)
    hmm_pred = hmm_model.test(test_word_lists, word2id, tag2id)
    metrics = Metrics(test_tag_lists, hmm_pred, remove_O=REMOVE_O)
    metrics.report_scores()  # per-tag precision / recall / f1
    metrics.report_confusion_matrix()  # confusion matrix

    # Load and evaluate the CRF model.
    print("Load and evaluate the crf model...")
    crf_model = load_model(CRF_MODEL_PATH)
    crf_pred = crf_model.test(test_word_lists)
    metrics = Metrics(test_tag_lists, crf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    # BiLSTM model.
    print("Load and evaluate the bilstm model...")
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    bilstm_model = load_model(BiLSTM_MODEL_PATH)
    bilstm_model.model.bilstm.flatten_parameters()  # remove warning
    lstm_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, bilstm_word2id, bilstm_tag2id)
    metrics = Metrics(target_tag_list, lstm_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    print("Load and evaluate the bilstm+crf model...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)
    metrics = Metrics(target_tag_list, lstmcrf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    ensemble_evaluate([hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
                      test_tag_lists)
def main():
    """Train and evaluate all four taggers, then ensemble their predictions."""
    print("读取数据中...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # HMM.
    print("正在训练评估HMM模型")
    hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists),
                              word2id, tag2id)

    # CRF.
    crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists))

    # BiLSTM: training needs PAD and UNK added to word2id/tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                      (dev_word_lists, dev_tag_lists),
                                      (test_word_lists, test_tag_lists),
                                      bilstm_word2id, bilstm_tag2id,
                                      crf=False)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The CRF variant needs <start>/<end> markers for decoding.
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    print(' '.join([i[0] for i in crf_tag2id.items()]))
    # Additional preprocessing for the LSTM-CRF pipeline.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (train_word_lists, train_tag_lists),
        (dev_word_lists, dev_tag_lists),
        (test_word_lists, test_tag_lists),
        crf_word2id, crf_tag2id)

    ensemble_evaluate([hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
                      test_tag_lists)
def main():
    """Train the CRF tagger on the substituted dataset and evaluate it."""
    # NOTE(review): original line breaks inside this banner string were lost
    # in the source formatting — confirm against the original file.
    text = '''
    #### 没有使用老师提供的数据集,O标签太多(占比92.77%),模型训练效果不好
    新数据集取自 https://github.com/luopeixiang/named_entity_recognition ####'''
    print(text, '\n')

    print("读取数据...\n")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    print("训练并评估CRF模型...\n")
    crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists))
def Test_build_corpus():
    """Smoke-check build_corpus: print corpus sizes and ten sample sentences."""
    word_lists, tag_lists, word2id, tag2id = build_corpus('train')
    print("Training item number:{},tag number :{}, word number:{}"
          .format(len(word_lists), len(tag2id), len(word2id)))
    print("Data Example:")
    for idx in range(10):
        sent = ''.join(word_lists[idx])
        tag_sent = ''.join(tag_lists[idx])
        print(sent)
        print(tag_sent)
    print(tag2id)
def main():
    """Train and evaluate a CRF tagger on the Drug corpus."""
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir="./Drug")
    test_word_lists, test_tag_lists = build_corpus(
        "test", make_vocab=False, data_dir="./Drug")

    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists))
def main():
    """Train and evaluate HMM, CRF, BiLSTM and BiLSTM-CRF taggers in turn."""
    print('读取数据...')
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus('train')
    # BUG FIX: the keyword was misspelled ``maek_vocab``; build_corpus takes
    # ``make_vocab`` (as every other call site in this project uses), so the
    # original raised TypeError at runtime.
    dev_word_lists, dev_tag_lists = build_corpus('dev', make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus('test', make_vocab=False)

    print('训练HMM模型...')
    hmm_model = HMMModel(len(tag2id), len(word2id))
    hmm_model.train(train_word_lists, train_tag_lists, word2id, tag2id)
    pred_tag_lists = hmm_model.test(test_word_lists, word2id, tag2id)
    metrics = Metrics(test_tag_lists, pred_tag_lists)
    metrics.report_scores()

    print('训练CRF模型...')
    crf_model = CRFModel(max_iterations=90)
    crf_model.train(train_word_lists, train_tag_lists)
    pred_tag_lists = crf_model.test(test_word_lists)
    metrics = Metrics(test_tag_lists, pred_tag_lists)
    metrics.report_scores()

    print('训练BiLSTM模型...')
    # The LSTM variants need the extended maps (PAD/UNK entries).
    word2id, tag2id = extend_maps(word2id, tag2id)
    bilstm = BiLSTM(len(word2id), len(tag2id))
    bilstm.train(train_word_lists, train_tag_lists, dev_word_lists,
                 dev_tag_lists, word2id, tag2id, 0.8)
    bilstm.dev_test(test_word_lists, test_tag_lists, word2id, tag2id)
    bilstm.close_sess()

    print('训练BiLSTM-CRF模型...')
    bilstm_crf = BiLSTM_CRF(len(word2id), len(tag2id))
    bilstm_crf.train(train_word_lists, train_tag_lists, dev_word_lists,
                     dev_tag_lists, word2id, tag2id, 0.8)
    bilstm_crf.dev_test(test_word_lists, test_tag_lists, word2id, tag2id)
    bilstm_crf.close_sess()
def main(corpus_dir, test_file):
    """Tag *test_file* with the saved BiLSTM+CRF model and print word/tag pairs.

    Args:
        corpus_dir: directory holding the training corpus (provides the
            vocabulary maps).
        test_file: path of the file to tag.
    """
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train", data_dir=corpus_dir)
    test_word_lists, test_tag_lists = build_corpus_test(test_file)

    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning

    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)

    # One "word<TAB>tag" line per token, blank line between sentences.
    for sent_idx in range(len(lstmcrf_pred)):
        for tok_idx in range(len(lstmcrf_pred[sent_idx])):
            print(test_word_lists[sent_idx][tok_idx] + "\t" +
                  lstmcrf_pred[sent_idx][tok_idx])
        print()
help='number of epochs') args = parser.parse_args() ## maybe load the model... if os.path.exists(args.save_dir): print("Loading Model...") model, vocab, index, metadata = load_model(args.save_dir) ## count parameters n_params = sum(np.prod(p.shape) for p in model.parameters()) print("Using Model with %i Parameters" % n_params) print("Loading Dataset...") ## build training pipeline dataset = load_dataset(args.data_dir, tokenize=True) dataset = {k: v[1024:-4096] for k, v in dataset.items()} corpus = build_corpus(dataset, index, min_length=args.seq) pipeline = build_pipeline(corpus, n_seq=args.seq, n_batch=args.batch) print("Using %i Documents and %i Words" % (len(corpus), len(vocab))) else: print("Loading Dataset...") ## load data and build vocabulary dataset = load_dataset(args.data_dir, tokenize=True) dataset = {k: v[1024:-4096] for k, v in dataset.items()} vocab, index, lookup = build_vocab(dataset, args.term_freq, args.doc_freq) ## build corpus corpus = build_corpus(dataset, index, min_length=args.seq) ## build training pipeline pipeline = build_pipeline(corpus, n_seq=args.seq, n_batch=args.batch) ## save vocab
def main():
    """Train and evaluate the BiLSTM on the word/data-pointer corpus."""
    print("读取数据...")
    (train_word_lists, train_data_lists, train_wordlabel_lists,
     train_datalabel_lists, train_dataptr_lists, word2id, data2id) = \
        build_corpus("train")
    (dev_word_lists, dev_data_lists, dev_wordlabel_lists,
     dev_datalabel_lists, dev_dataptr_lists) = build_corpus(
        "dev", make_vocab=False)
    (test_word_lists, test_data_lists, test_wordlabel_lists,
     test_datalabel_lists, test_dataptr_lists) = build_corpus(
        "test", make_vocab=False)

    print("正在训练评估双向LSTM模型...")
    # LSTM training needs PAD and UNK added to the vocabulary maps.
    bilstm_word2id, bilstm_data2id = extend_maps(word2id, data2id,
                                                 for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        (train_word_lists, train_data_lists, train_wordlabel_lists,
         train_datalabel_lists, train_dataptr_lists),
        (dev_word_lists, dev_data_lists, dev_wordlabel_lists,
         dev_datalabel_lists, dev_dataptr_lists),
        (test_word_lists, test_data_lists, test_wordlabel_lists,
         test_datalabel_lists, test_dataptr_lists),
        bilstm_word2id, bilstm_data2id, crf=False)
''' @Autor: xujiahuan @Date: 2020-05-06 11:00:17 @LastEditors: xujiahuan @LastEditTime: 2020-05-17 15:26:13 ''' from sklearn.externals import joblib from data import build_corpus from utils import extend_maps train_path = 'data/train.txt' # dev_path = 'data/dev.txt' # test_path = 'data/test.txt' train_word_lists, train_tag_lists, word2id, tag2id = \ build_corpus(train_path) # dev_word_lists, dev_tag_lists = build_corpus(dev_path, make_vocab=False) # test_word_lists, test_tag_lists = build_corpus(test_path, make_vocab=False) bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False) test_word_lists = [['上', '海', '浦', '东']] test_tag_lists = [] length = len(test_word_lists[0]) temp = ['O' for i in range(length)] test_tag_lists.append(temp) def rnn_pred2(test_word_lists, test_tag_lists): # start = time.time() # vocab_size = len(bilstm_word2id) # out_size = len(bilstm_tag2id) path = "ckpts/rnn.pkl"
from utils import load_model, extend_maps, prepocess_data_for_lstmcrf
from data import build_corpus
import glob

# Checkpoint locations for the pre-trained taggers.
HMM_MODEL_PATH = './ckpts/hmm.pkl'
CRF_MODEL_PATH = './ckpts/crf.pkl'
BiLSTM_MODEL_PATH = './ckpts/bilstm.pkl'
BiLSTMCRF_MODEL_PATH = './ckpts/bilstm_crf.pkl'

print("读取数据...")
train_word_lists, train_tag_lists, word2id, tag2id = \
    build_corpus("train")

print("加载并评估bilstm+crf模型...")
crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning

# Tag every text file character by character.
for item in glob.glob("data/chusai_xuanshou/" + "*.txt"):
    with open(item, encoding="utf-8") as f:
        origin_text = "".join(f.readlines())
    test_word_list = list(origin_text)
    test_word_lists = [test_word_list]
    # Placeholder tags: only the words matter at inference time.
    test_tag_list = ["N" for _ in test_word_list]
    test_tag_lists = [test_tag_list]
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_model.testA(test_word_lists, crf_word2id,
                                      crf_tag2id)
    print(test_word_list)
def main():
    """Evaluate the saved BiLSTM on data-pointer prediction; print accuracy.

    Writes the clipped per-sentence predictions to ``test.tgt.dataptr1`` and
    prints token-level accuracy against the gold pointers.
    """
    print("读取数据...")
    (train_word_lists, train_data_lists, train_wordlabel_lists,
     train_datalabel_lists, train_dataptr_lists, word2id, data2id) = \
        build_corpus("train")
    (dev_word_lists, dev_data_lists, dev_wordlabel_lists,
     dev_datalabel_lists, dev_dataptr_lists) = build_corpus(
        "dev", make_vocab=False)
    (test_word_lists, test_data_lists, test_wordlabel_lists,
     test_datalabel_lists, test_dataptr_lists) = build_corpus(
        "test", make_vocab=False)

    print("加载并评估bilstm模型...")
    bilstm_word2id, bilstm_data2id = extend_maps(word2id, data2id,
                                                 for_crf=False)
    bilstm_model = load_model(BiLSTM_MODEL_PATH)
    bilstm_model.model.bilstm.flatten_parameters()  # remove warning
    lstm_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_data_lists, test_wordlabel_lists,
        test_datalabel_lists, test_dataptr_lists,
        bilstm_word2id, bilstm_data2id)

    allnum = 0
    correct = 0
    # FIX: open the output through a context manager so the handle is closed
    # even if an exception occurs while iterating the predictions (the
    # original used a bare open()/close() pair).
    with open('test.tgt.dataptr1', 'w') as f:
        for pred, gold in zip(lstm_pred, target_tag_list):
            # Clip the (padded) prediction to the gold sequence length.
            pred = pred.cpu().numpy().tolist()[:len(gold)]
            f.write(' '.join(str(x) for x in pred) + '\n')
            for x, y in zip(pred, gold):
                if x == y:
                    correct += 1
                allnum += 1
    # TODO: guard against an empty test set (allnum == 0).
    print(correct / allnum)
def main_rep1(x, y):
    """Dispatch NER training ('train') or checkpoint evaluation on model *y*.

    Args:
        x: 'train' trains the chosen model from scratch; any other value
           loads a saved checkpoint and evaluates it.
        y: one of 'crf', 'bilstm', 'bilstm-crf'.
    """
    if x == 'train':
        # select data according to args.process
        print("Read data...")
        train_word_lists, train_tag_lists, word2id, tag2id = \
            build_corpus("train")
        dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
        test_word_lists, test_tag_lists = build_corpus("test",
                                                       make_vocab=False)

        if y == 'crf':
            crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                                      (test_word_lists, test_tag_lists))
            ensemble_evaluate([crf_pred], test_tag_lists)
        elif y == 'bilstm':
            bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id,
                                                        for_crf=False)
            lstm_pred = bilstm_train_and_eval(
                (train_word_lists, train_tag_lists),
                (dev_word_lists, dev_tag_lists),
                (test_word_lists, test_tag_lists),
                bilstm_word2id, bilstm_tag2id, crf=False)
            ensemble_evaluate([lstm_pred], test_tag_lists)
        elif y == 'bilstm-crf':
            crf_word2id, crf_tag2id = extend_maps(word2id, tag2id,
                                                  for_crf=True)
            # more data processing
            train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
                train_word_lists, train_tag_lists)
            dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
                dev_word_lists, dev_tag_lists)
            test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
                test_word_lists, test_tag_lists, test=True)
            lstmcrf_pred = bilstm_train_and_eval(
                (train_word_lists, train_tag_lists),
                (dev_word_lists, dev_tag_lists),
                (test_word_lists, test_tag_lists),
                crf_word2id, crf_tag2id)
            ensemble_evaluate([lstmcrf_pred], test_tag_lists)
    else:
        HMM_MODEL_PATH = './ckpts/hmm.pkl'
        CRF_MODEL_PATH = './ckpts/crf.pkl'
        BiLSTM_MODEL_PATH = './ckpts/bilstm.pkl'
        BiLSTMCRF_MODEL_PATH = './ckpts/bilstm_crf.pkl'
        # Whether to remove the O mark at the time of evaluation.
        REMOVE_O = False

        # select data according to args.process
        print("Read data...")
        train_word_lists, train_tag_lists, word2id, tag2id = \
            build_corpus("train")
        dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
        test_word_lists, test_tag_lists = build_corpus("test",
                                                       make_vocab=False)

        if y == 'crf':
            crf_model = load_model_1(CRF_MODEL_PATH)
            crf_pred = crf_model.test(test_word_lists)
            metrics = Metrics(test_tag_lists, crf_pred, remove_O=REMOVE_O)
            metrics.report_scores()
            metrics.report_confusion_matrix()
        elif y == 'bilstm':
            bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id,
                                                        for_crf=False)
            bilstm_model = load_model_1(BiLSTM_MODEL_PATH)
            bilstm_model.model.bilstm.flatten_parameters()  # remove warning
            lstm_pred, target_tag_list = bilstm_model.test(
                test_word_lists, test_tag_lists, bilstm_word2id,
                bilstm_tag2id)
            metrics = Metrics(target_tag_list, lstm_pred, remove_O=REMOVE_O)
            metrics.report_scores()
            metrics.report_confusion_matrix()
        elif y == 'bilstm-crf':
            crf_word2id, crf_tag2id = extend_maps(word2id, tag2id,
                                                  for_crf=True)
            bilstm_model = load_model_1(BiLSTMCRF_MODEL_PATH)
            bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
            test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
                test_word_lists, test_tag_lists, test=True)
            lstmcrf_pred, target_tag_list = bilstm_model.test(
                test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)
            metrics = Metrics(target_tag_list, lstmcrf_pred,
                              remove_O=REMOVE_O)
            metrics.report_scores()
            metrics.report_confusion_matrix()
        # NOTE(review): in the collapsed source the exit() indentation is
        # ambiguous; placed at the end of the evaluation branch — confirm.
        exit()
def main():
    """Train the models and evaluate the results.

    Reads the training/dev corpora, loads the test sentences from a raw
    development file, then trains and evaluates a BiLSTM and a
    BiLSTM+CRF tagger and ensembles their predictions.
    """
    # Read the data.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    # The test set comes from a raw development file rather than the
    # standard corpus split; article_id tracks the source article of
    # each sentence (currently unused in this function).
    test_word_lists, test_tag_lists, article_id = loadDevFile(
        "development_2.txt")

    # Train & evaluate the BiLSTM model.
    print("正在训练评估双向LSTM模型...")
    # The plain LSTM needs PAD and UNK added to word2id/tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                      (dev_word_lists, dev_tag_lists),
                                      (test_word_lists, test_tag_lists),
                                      bilstm_word2id,
                                      bilstm_tag2id,
                                      crf=False,
                                      remove_O=True)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The CRF-decorated LSTM additionally needs <start>/<end> markers
    # (used during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra preprocessing required by the LSTM+CRF data format.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                         (dev_word_lists, dev_tag_lists),
                                         (test_word_lists, test_tag_lists),
                                         crf_word2id,
                                         crf_tag2id,
                                         remove_O=True)

    # BUG FIX: the previous call passed hmm_pred and crf_pred as well,
    # but their training blocks were commented out, so those names were
    # undefined and this line raised NameError. Ensemble only the
    # predictions that were actually produced.
    ensemble_evaluate([lstm_pred, lstmcrf_pred], test_tag_lists)
def main():
    """Load the saved BiLSTM+CRF tagger, score it on the test split, and
    dump the SYM-entity tokens of the first test sentence for inspection.
    """
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    print("加载并评估bilstm+crf模型...")
    # BiLSTM+CRF decoding requires <start>/<end> entries in the maps.
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    tagger = load_model(BiLSTMCRF_MODEL_PATH)
    tagger.model.bilstm.bilstm.flatten_parameters()  # remove warning
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred, target_tag_list = tagger.test(test_word_lists,
                                                test_tag_lists,
                                                crf_word2id, crf_tag2id)
    scores = Metrics(target_tag_list, lstmcrf_pred, remove_O=REMOVE_O)
    scores.report_scores()
    scores.report_confusion_matrix()

    # Gather the positions tagged as part of a SYM entity in the first
    # test sentence, for both the gold tags and the predictions.
    sym_tags = ['B-SYM', 'M-SYM', 'E-SYM']
    first_words = test_word_lists[0]
    selected = [idx for idx, tag in enumerate(test_tag_lists[0])
                if tag in sym_tags]
    selected_word = [first_words[idx] for idx in selected]
    selected_predict = [idx for idx, tag in enumerate(lstmcrf_pred[0])
                        if tag in sym_tags]
    selected_predict_word = [first_words[idx] for idx in selected_predict]

    # Walk the training documents the same way (results are bound but,
    # with the print commented out, intentionally discarded).
    for tag_seq, doc in zip(train_tag_lists, train_word_lists):
        doc_sym_idx = [idx for idx, tag in enumerate(tag_seq)
                       if tag in sym_tags]
        doc_sym_words = [doc[idx] for idx in doc_sym_idx]
        # print(doc_sym_words)

    print('preditct list:', lstmcrf_pred)
    print('target list:', target_tag_list)
    print(selected_word)
    print(selected_predict_word)
def main():
    """Evaluate previously trained taggers on the test split.

    Command-line flags select which saved model(s) to load; each selected
    model is run on the test set, and per-tag precision/recall/F1 plus a
    confusion matrix are printed.
    """
    import argparse

    # Flag name -> help text; every flag is an off-by-default boolean.
    flag_specs = [
        ('--hmm', 'Test HMM'),
        ('--crf', 'Test CRF'),
        ('--bilstm', 'Test BiLSTM'),
        ('--bilstm-crf', 'Test BiLSTM-CRF'),
        ('--cbow', 'Use CBOW embedding for BiLSTM-CRF'),
    ]
    parser = argparse.ArgumentParser(description='main.py')
    for flag, helptext in flag_specs:
        parser.add_argument(flag,
                            action='store_true',
                            default=False,
                            help=helptext)
    args = parser.parse_args()

    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    def report(gold_tags, pred_tags):
        # Print per-tag scores and the confusion matrix for one model.
        scores = Metrics(gold_tags, pred_tags, remove_O=REMOVE_O)
        scores.report_scores()
        scores.report_confusion_matrix()

    if args.hmm:
        print("加载并评估hmm模型...")
        hmm_tagger = load_model(HMM_MODEL_PATH)
        report(test_tag_lists,
               hmm_tagger.test(test_word_lists, word2id, tag2id))

    if args.crf:
        print("加载并评估crf模型...")
        crf_tagger = load_model(CRF_MODEL_PATH)
        report(test_tag_lists, crf_tagger.test(test_word_lists))

    if args.bilstm:
        print("加载并评估bilstm模型...")
        lstm_word2id, lstm_tag2id = extend_maps(word2id, tag2id,
                                                for_crf=False)
        lstm_tagger = load_model(BiLSTM_MODEL_PATH)
        lstm_tagger.model.bilstm.flatten_parameters()  # remove warning
        lstm_pred, lstm_gold = lstm_tagger.test(test_word_lists,
                                                test_tag_lists,
                                                lstm_word2id, lstm_tag2id)
        report(lstm_gold, lstm_pred)

    if args.bilstm_crf:
        print("加载并评估bilstm+crf模型...")
        lstmcrf_word2id, lstmcrf_tag2id = extend_maps(word2id, tag2id,
                                                      for_crf=True)
        lstmcrf_tagger = load_model(BiLSTMCRF_MODEL_PATH)
        lstmcrf_tagger.model.bilstm.bilstm.flatten_parameters()  # remove warning
        test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
            test_word_lists, test_tag_lists, test=True)
        lstmcrf_pred, lstmcrf_gold = lstmcrf_tagger.test(
            test_word_lists, test_tag_lists, lstmcrf_word2id, lstmcrf_tag2id)
        report(lstmcrf_gold, lstmcrf_pred)
'''
@Autor: xujiahuan
@Date: 2020-04-21 20:12:02
@LastEditors: xujiahuan
@LastEditTime: 2020-05-19 19:55:44
'''
from data import build_corpus
from models.crf import CRFModel
from metrics import Metrics
from utils import save_model

# Prepare the data: load the three corpus splits at import time.
# NOTE(review): these run as module-level side effects on import —
# confirm this file is only ever executed as a script.
train_path = 'data/train.txt'
dev_path = 'data/dev.txt'
test_path = 'data/test.txt'
train_word_lists, train_tag_lists = build_corpus(train_path, make_vocab=False)
dev_word_lists, dev_tag_lists = build_corpus(dev_path, make_vocab=False)
test_word_lists, test_tag_lists = build_corpus(test_path, make_vocab=False)


def crf_pred(train_word_lists, train_tag_lists, test_word_lists,
             test_tag_lists):
    """Train a CRF on the given data, checkpoint it, and tag the test set.

    Args:
        train_word_lists: training sentences (lists of tokens).
        train_tag_lists: gold tag sequences aligned with the training words.
        test_word_lists: sentences to tag after training.
        test_tag_lists: gold test tags (accepted but unused here).

    Returns:
        The model's predicted tag sequences for ``test_word_lists``.

    Side effects: saves the trained model to ./ckpts/crf.pkl and prints
    the entire test word list.
    """
    model = CRFModel()
    model.train(train_word_lists, train_tag_lists)
    save_model(model, "./ckpts/crf.pkl")
    print(test_word_lists)
    pred = model.test(test_word_lists)
    return pred


print("正在训练CRF...")
def main():
    """Train and evaluate the taggers selected by command-line flags.

    ``--hmm``/``--crf``/``--bilstm``/``--bilstm-crf`` each train and
    evaluate the corresponding model; ``--cbow`` either supplies a
    pretrained CBOW embedding to BiLSTM-CRF (when combined with
    ``--bilstm-crf``) or trains the CBOW model itself (when given alone).
    """
    import argparse
    parser = argparse.ArgumentParser(description='main.py')
    parser.add_argument('--hmm',
                        action='store_true',
                        default=False,
                        help='Train HMM')
    parser.add_argument('--crf',
                        action='store_true',
                        default=False,
                        help='Train CRF')
    parser.add_argument('--bilstm',
                        action='store_true',
                        default=False,
                        help='Train BiLSTM')
    parser.add_argument('--bilstm-crf',
                        action='store_true',
                        default=False,
                        help='Train BiLSTM-CRF')
    parser.add_argument('--cbow',
                        action='store_true',
                        default=False,
                        help='Train or use CBOW embedding for BiLSTM-CRF')
    args = parser.parse_args()
    # NOTE(review): the string below is a stray mid-function literal
    # ("train models, evaluate results") — a no-op, not a docstring.
    """训练模型,评估结果"""
    # Read the data.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)
    # Train & evaluate the HMM model.
    if args.hmm:
        print("正在训练评估HMM模型...")
        hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists), word2id,
                                  tag2id)
    # Train & evaluate the CRF model.
    if args.crf:
        print("正在训练评估CRF模型...")
        crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists))
    if args.bilstm:
        # Train & evaluate the BiLSTM model.
        print("正在训练评估双向LSTM模型...")
        # The plain LSTM needs PAD and UNK added to word2id/tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(word2id,
                                                    tag2id,
                                                    for_crf=False)
        lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                          (dev_word_lists, dev_tag_lists),
                                          (test_word_lists, test_tag_lists),
                                          bilstm_word2id,
                                          bilstm_tag2id,
                                          crf=False)
    # The CRF-decorated LSTM additionally needs <start>/<end> markers
    # (used during decoding). This preprocessing runs unconditionally
    # because the CBOW-only branch below also uses crf_word2id.
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra preprocessing required by the LSTM+CRF data format; note the
    # word/tag lists are rebound in place here.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    if args.bilstm_crf:
        print("正在训练评估Bi-LSTM+CRF模型...")
        cbow_emb = None
        if args.cbow:
            # Reuse a previously trained CBOW embedding for the encoder.
            print('Loading CBOW model')
            cbow_model = load_model('ckpts/cbow.pkl')
            cbow_emb = cbow_model.model.lookup_embedding()
            # Free the wrapper once the embedding tensor is extracted.
            del cbow_model
        lstmcrf_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists),
            (test_word_lists, test_tag_lists),
            crf_word2id,
            crf_tag2id,
            cbow_emb=cbow_emb)
    elif args.cbow:
        # --cbow without --bilstm-crf: train the CBOW embedding itself.
        print("正在训练CBOW模型...")
        cbow.CBOW_Model(len(crf_word2id)).train(train_word_lists, crf_word2id)
default=2) parser.add_argument('--emb_path', dest='embed_path', help='embedding path', default='/home/hjp/Downloads/msc/glove.6B.50d.txt') parser.add_argument('--data_path', dest='data_path', help='data set path', default='/home/hjp/Downloads/msc/') args = parser.parse_args() torch.manual_seed(args.seed) embed, vocab = data.build_embed(args.embed_path) print torch.from_numpy(embed['world']) word2idx, idx2word, trainlbl, testlbl = data.build_corpus(args.data_path) print word2idx['chief'] trainlabel = torch.FloatTensor(torch.zeros(len(trainlbl), 1)) print trainlabel for i in range(len(trainlbl)): print trainlbl[i] if trainlbl[i] == '1': trainlabel[i] = 1 else: trainlabel[i] = 0 print trainlabel testlabel = torch.FloatTensor(torch.zeros(len(testlbl), 1)) print testlabel for i in range(len(testlbl)):