def main():
    """Train and evaluate the model selected with ``--model``.

    Command-line options:
        --model: one of HMM, CRF, BILSTM, BILSTM-CRF or ENSEMBLE
            (default: BILSTM). Any other value is rejected by argparse
            before the corpus is read, instead of loading all the data and
            then silently running nothing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        default="BILSTM",
        choices=["HMM", "CRF", "BILSTM", "BILSTM-CRF", "ENSEMBLE"],
        help="model in [HMM,CRF,BILSTM,BILSTM-CRF,ENSEMBLE]",
        type=str,
    )
    params = vars(parser.parse_args())
    model_name = params["model"]

    # Load the train / dev / test splits; the vocabularies come from the
    # training split only.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    if model_name == "HMM":
        # Train and evaluate the HMM model.
        print("正在训练评估HMM模型...")
        hmm_train_eval(
            (train_word_lists, train_tag_lists),
            (test_word_lists, test_tag_lists),
            word2id,
            tag2id,
        )
    elif model_name == "CRF":
        # Train and evaluate the CRF model.
        print("正在训练评估CRF模型...")
        crf_train_eval(
            (train_word_lists, train_tag_lists),
            (test_word_lists, test_tag_lists),
        )
    elif model_name == "BILSTM":
        # Train and evaluate the Bi-LSTM model.
        print("正在训练评估BI-LSTM模型...")
        # LSTM training needs PAD and UNK added to word2id and tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists),
            (test_word_lists, test_tag_lists),
            bilstm_word2id,
            bilstm_tag2id,
            crf=False,
        )
    elif model_name == "BILSTM-CRF":
        print("正在训练评估Bi-LSTM+CRF模型...")
        # The LSTM with a CRF layer also needs <start> and <end>
        # (used when decoding).
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        # Some extra data processing is required for this model.
        train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
            train_word_lists, train_tag_lists)
        dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
            dev_word_lists, dev_tag_lists)
        test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
            test_word_lists, test_tag_lists, test=True)
        bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists),
            (test_word_lists, test_tag_lists),
            crf_word2id,
            crf_tag2id,
        )
    elif model_name == "ENSEMBLE":
        # Model ensembling: this branch currently only prints an empty line.
        print()
def main():
    """Train the HMM, CRF, Bi-LSTM and Bi-LSTM+CRF models, then ensemble them.

    The corpus is read from ``./data123``. Each model is trained and
    evaluated in turn, and the four prediction sets are finally passed to
    ``ensemble_evaluate`` together with the test tags.
    """
    corpus_dir = "./data123"

    # Load the three splits; vocabularies are built from the training split.
    print("读取数据...")
    train_words, train_tags, word2id, tag2id = build_corpus(
        "train", data_dir=corpus_dir)
    dev_words, dev_tags = build_corpus(
        "dev", make_vocab=False, data_dir=corpus_dir)
    test_words, test_tags = build_corpus(
        "test", make_vocab=False, data_dir=corpus_dir)

    train_set = (train_words, train_tags)
    dev_set = (dev_words, dev_tags)
    test_set = (test_words, test_tags)

    # HMM
    print("正在训练评估HMM模型...")
    hmm_pred = hmm_train_eval(train_set, test_set, word2id, tag2id)

    # CRF
    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval(train_set, test_set)

    # Bi-LSTM: training needs PAD and UNK added to word2id and tag2id.
    print("正在训练评估双向LSTM模型...")
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        train_set, dev_set, test_set,
        bilstm_word2id, bilstm_tag2id,
        crf=False,
    )

    # Bi-LSTM+CRF: additionally needs <start> and <end> (used when decoding)
    # and an extra preprocessing pass over every split.
    print("正在训练评估Bi-LSTM+CRF模型...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    train_words, train_tags = prepocess_data_for_lstmcrf(
        train_words, train_tags)
    dev_words, dev_tags = prepocess_data_for_lstmcrf(dev_words, dev_tags)
    test_words, test_tags = prepocess_data_for_lstmcrf(
        test_words, test_tags, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (train_words, train_tags),
        (dev_words, dev_tags),
        (test_words, test_tags),
        crf_word2id, crf_tag2id,
    )

    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, test_tags)
def main():
    """Train and evaluate HMM, CRF, Bi-LSTM and Bi-LSTM+CRF, then ensemble.

    Reads the default corpus via ``build_corpus``, runs the four models in
    order and hands their predictions to ``ensemble_evaluate``.
    """
    # Load the data.
    print("读取数据中...")
    train_words, train_tags, word2id, tag2id = build_corpus("train")
    dev_words, dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    train_set = (train_words, train_tags)
    dev_set = (dev_words, dev_tags)
    test_set = (test_words, test_tags)

    # Train and evaluate the HMM model.
    print("正在训练评估HMM模型")
    hmm_pred = hmm_train_eval(train_set, test_set, word2id, tag2id)

    # Train and evaluate the CRF model.
    crf_pred = crf_train_eval(train_set, test_set)

    # Train and evaluate the Bi-LSTM model.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        train_set, dev_set, test_set,
        bilstm_word2id, bilstm_tag2id,
        crf=False,
    )

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The LSTM with a CRF layer also needs <start> and <end>
    # (used when decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Show the tag names of the extended tag map on a single line.
    tag_names = [tag for tag, _ in crf_tag2id.items()]
    print(' '.join(tag_names))

    # Extra data processing required by the Bi-LSTM+CRF model.
    train_words, train_tags = prepocess_data_for_lstmcrf(
        train_words, train_tags)
    dev_words, dev_tags = prepocess_data_for_lstmcrf(dev_words, dev_tags)
    test_words, test_tags = prepocess_data_for_lstmcrf(
        test_words, test_tags, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (train_words, train_tags),
        (dev_words, dev_tags),
        (test_words, test_tags),
        crf_word2id, crf_tag2id,
    )

    ensemble_evaluate(
        [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred], test_tags)
def main():
    """Print a note about the dataset, then train and evaluate the CRF model.

    All three corpus splits are loaded, but only the train and test splits
    are passed to ``crf_train_eval``.
    """
    # Banner explaining which dataset is used (runtime text, kept verbatim).
    text = ''' #### 没有使用老师提供的数据集,O标签太多(占比92.77%),模型训练效果不好 新数据集取自 https://github.com/luopeixiang/named_entity_recognition ####'''
    print(text, '\n')

    # Load the data. The vocabularies and the dev split are loaded but not
    # used by the CRF run below.
    print("读取数据...\n")
    train_words, train_tags, _word2id, _tag2id = build_corpus("train")
    _dev_words, _dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    # Train and evaluate the CRF model.
    print("训练并评估CRF模型...\n")
    train_set = (train_words, train_tags)
    test_set = (test_words, test_tags)
    crf_pred = crf_train_eval(train_set, test_set)
def main():
    """Train and evaluate the CRF model on the corpus stored in ``./Drug``.

    Only the train and test splits are loaded. Loading the dev split and the
    HMM train/eval steps (including evaluating a saved ``./ckpts/hmm.pkl``
    with ``remove_O = True``) are disabled in this variant.
    """
    corpus_dir = "./Drug"

    # Load the data (the dev split is intentionally not loaded here).
    print("读取数据...")
    train_words, train_tags, word2id, tag2id = build_corpus(
        "train", data_dir=corpus_dir)
    test_words, test_tags = build_corpus(
        "test", make_vocab=False, data_dir=corpus_dir)

    # Train and evaluate the CRF model.
    print("正在训练评估CRF模型...")
    train_set = (train_words, train_tags)
    test_set = (test_words, test_tags)
    crf_pred = crf_train_eval(train_set, test_set)
def main(args):
    """Train and evaluate all four models, saving under ``ckpts/<args.name>``.

    Args:
        args: object exposing a ``name`` attribute; it selects the output
            sub-directory below ``ckpts``.

    The output directory is created if missing, ``models/config.py`` is
    copied into it, and the directory is passed to every train/eval helper.
    The four prediction sets are finally given to ``ensemble_evaluate``.
    """
    out_dir = os.path.join('ckpts', args.name)
    # NOTE(review): the chmod is assumed to belong to the "directory was just
    # created" branch and the config copy to run every time - confirm against
    # the original layout.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
        os.chmod(out_dir, 0o775)
    shutil.copy2('models/config.py', out_dir)

    # Load the data.
    print("读取数据...")
    train_words, train_tags, word2id, tag2id = build_corpus(
        "train", fix_length=-1)
    dev_words, dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    train_set = (train_words, train_tags)
    dev_set = (dev_words, dev_tags)
    test_set = (test_words, test_tags)

    # Train and evaluate the HMM model.
    print("正在训练评估HMM模型...")
    hmm_pred = hmm_train_eval(train_set, test_set, word2id, tag2id, out_dir)

    # Train and evaluate the CRF model.
    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval(train_set, test_set, out_dir)

    # Train and evaluate the Bi-LSTM model; training needs PAD and UNK added
    # to word2id and tag2id.
    print("正在训练评估双向LSTM模型...")
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        train_set, dev_set, test_set,
        bilstm_word2id, bilstm_tag2id,
        out_dir,
        crf=False,
    )

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The LSTM with a CRF layer also needs <start> and <end>
    # (used when decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra data processing required by the Bi-LSTM+CRF model.
    train_words, train_tags = prepocess_data_for_lstmcrf(
        train_words, train_tags)
    dev_words, dev_tags = prepocess_data_for_lstmcrf(dev_words, dev_tags)
    test_words, test_tags = prepocess_data_for_lstmcrf(
        test_words, test_tags, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (train_words, train_tags),
        (dev_words, dev_tags),
        (test_words, test_tags),
        crf_word2id, crf_tag2id,
        out_dir,
    )

    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, test_tags)
def main():
    """Train and evaluate the models enabled by the local switches below.

    Each enabled model appends its predictions to a shared list; when the
    ensemble switch is on, that list is passed to ``ensemble_evaluate``.
    As written, only the Bi-LSTM-CRF model is enabled.
    """
    # Model selection switches.
    run_hmm = False
    run_crf = False
    run_bilstm = False
    run_bilstm_crf = True
    run_ensemble = False

    predictions = []

    # Data
    print("Reading data:")
    corpus_dir = "./datasets/FA_NER_Data_IOB"
    train_words, train_tags, word2id, tag2id = build_corpus(
        "train", data_dir=corpus_dir)
    dev_words, dev_tags = build_corpus(
        "dev", make_vocab=False, data_dir=corpus_dir)
    test_words, test_tags = build_corpus(
        "test", make_vocab=False, data_dir=corpus_dir)

    print("len(train_word_lists):", len(train_words))
    print("len(word2id=vocab):", len(word2id))

    if run_hmm:
        # Training and evaluating the HMM model.
        print("Training and Evaluating HMM model:")
        hmm_pred = hmm_train_eval(
            (train_words, train_tags),
            (test_words, test_tags),
            word2id, tag2id,
        )
        predictions.append(hmm_pred)

    if run_crf:
        # Training and evaluating the CRF model.
        print("Training and evaluating CRF model:")
        crf_pred = crf_train_eval(
            (train_words, train_tags),
            (test_words, test_tags),
        )
        predictions.append(crf_pred)

    if run_bilstm:
        # Training and evaluating the Bi-LSTM model.
        print("Training and evaluating Bi-LSTM model:")
        # 'PAD' and 'UNK' must be added to word2id and tag2id for the LSTM.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        lstm_pred = bilstm_train_and_eval(
            (train_words, train_tags),
            (dev_words, dev_tags),
            (test_words, test_tags),
            bilstm_word2id, bilstm_tag2id,
            crf=False,
        )
        predictions.append(lstm_pred)

    if run_bilstm_crf:
        # Training and evaluating the Bi-LSTM+CRF model.
        print("Training and evaluating Bi-LSTM-CRF model:")
        # <start> and <end> must be added for the LSTM with CRF
        # (they are used while decoding).
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        # Extra data processing for this model.
        train_words, train_tags = prepocess_data_for_lstmcrf(
            train_words, train_tags)
        dev_words, dev_tags = prepocess_data_for_lstmcrf(dev_words, dev_tags)
        test_words, test_tags = prepocess_data_for_lstmcrf(
            test_words, test_tags, test=True)
        lstmcrf_pred = bilstm_train_and_eval(
            (train_words, train_tags),
            (dev_words, dev_tags),
            (test_words, test_tags),
            crf_word2id, crf_tag2id,
            remove_O=False,
            reload_model=True,
        )
        predictions.append(lstmcrf_pred)

    if run_ensemble:
        ensemble_evaluate(predictions, test_tags)
def main():
    """Train and evaluate the models requested through command-line flags.

    Flags (all boolean): ``--hmm``, ``--crf``, ``--bilstm``, ``--bilstm-crf``
    and ``--cbow``. With ``--bilstm-crf``, ``--cbow`` loads a saved CBOW
    model and passes its embedding to the Bi-LSTM-CRF run; ``--cbow``
    without ``--bilstm-crf`` trains the CBOW model instead.
    """
    import argparse

    parser = argparse.ArgumentParser(description='main.py')
    flag_help = (
        ('--hmm', 'Train HMM'),
        ('--crf', 'Train CRF'),
        ('--bilstm', 'Train BiLSTM'),
        ('--bilstm-crf', 'Train BiLSTM-CRF'),
        ('--cbow', 'Train or use CBOW embedding for BiLSTM-CRF'),
    )
    for flag, help_text in flag_help:
        parser.add_argument(
            flag, action='store_true', default=False, help=help_text)
    args = parser.parse_args()

    # Load the data.
    print("读取数据...")
    train_words, train_tags, word2id, tag2id = build_corpus("train")
    dev_words, dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    # Train and evaluate the HMM model.
    if args.hmm:
        print("正在训练评估HMM模型...")
        hmm_pred = hmm_train_eval(
            (train_words, train_tags),
            (test_words, test_tags),
            word2id, tag2id,
        )

    # Train and evaluate the CRF model.
    if args.crf:
        print("正在训练评估CRF模型...")
        crf_pred = crf_train_eval(
            (train_words, train_tags),
            (test_words, test_tags),
        )

    # Train and evaluate the Bi-LSTM model.
    if args.bilstm:
        print("正在训练评估双向LSTM模型...")
        # LSTM training needs PAD and UNK added to word2id and tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        lstm_pred = bilstm_train_and_eval(
            (train_words, train_tags),
            (dev_words, dev_tags),
            (test_words, test_tags),
            bilstm_word2id, bilstm_tag2id,
            crf=False,
        )

    # NOTE(review): this section is assumed to run unconditionally because
    # both branches below read its results - confirm against the original
    # layout.
    # The LSTM with a CRF layer also needs <start> and <end>
    # (used when decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra data processing required by the Bi-LSTM+CRF model.
    train_words, train_tags = prepocess_data_for_lstmcrf(
        train_words, train_tags)
    dev_words, dev_tags = prepocess_data_for_lstmcrf(dev_words, dev_tags)
    test_words, test_tags = prepocess_data_for_lstmcrf(
        test_words, test_tags, test=True)

    if args.bilstm_crf:
        print("正在训练评估Bi-LSTM+CRF模型...")
        cbow_emb = None
        if args.cbow:
            # Reuse the embedding of a previously saved CBOW model.
            print('Loading CBOW model')
            cbow_model = load_model('ckpts/cbow.pkl')
            cbow_emb = cbow_model.model.lookup_embedding()
            del cbow_model
        lstmcrf_pred = bilstm_train_and_eval(
            (train_words, train_tags),
            (dev_words, dev_tags),
            (test_words, test_tags),
            crf_word2id, crf_tag2id,
            cbow_emb=cbow_emb,
        )
    elif args.cbow:
        # Only --cbow was given: train the CBOW model.
        print("正在训练CBOW模型...")
        cbow_trainer = cbow.CBOW_Model(len(crf_word2id))
        cbow_trainer.train(train_words, crf_word2id)
def main_rep1(x, y):
    """Train model ``y`` or evaluate its saved checkpoint, then exit.

    Args:
        x: ``'train'`` trains and evaluates the selected model; any other
            value loads the model's checkpoint from ``./ckpts`` and reports
            metrics on the test split.
        y: model selector: ``'crf'``, ``'bilstm'`` or ``'bilstm-crf'``.
            Any other value runs no model.
    """
    # Both modes start by loading the same three corpus splits.
    print("Read data...")
    train_words, train_tags, word2id, tag2id = build_corpus("train")
    dev_words, dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    if x == 'train':
        if y == 'crf':
            crf_pred = crf_train_eval(
                (train_words, train_tags),
                (test_words, test_tags),
            )
            ensemble_evaluate([crf_pred], test_tags)
        elif y == 'bilstm':
            bilstm_word2id, bilstm_tag2id = extend_maps(
                word2id, tag2id, for_crf=False)
            lstm_pred = bilstm_train_and_eval(
                (train_words, train_tags),
                (dev_words, dev_tags),
                (test_words, test_tags),
                bilstm_word2id, bilstm_tag2id,
                crf=False,
            )
            ensemble_evaluate([lstm_pred], test_tags)
        elif y == 'bilstm-crf':
            crf_word2id, crf_tag2id = extend_maps(
                word2id, tag2id, for_crf=True)
            # More data processing is needed for the Bi-LSTM+CRF model.
            train_words, train_tags = prepocess_data_for_lstmcrf(
                train_words, train_tags)
            dev_words, dev_tags = prepocess_data_for_lstmcrf(
                dev_words, dev_tags)
            test_words, test_tags = prepocess_data_for_lstmcrf(
                test_words, test_tags, test=True)
            lstmcrf_pred = bilstm_train_and_eval(
                (train_words, train_tags),
                (dev_words, dev_tags),
                (test_words, test_tags),
                crf_word2id, crf_tag2id,
            )
            ensemble_evaluate([lstmcrf_pred], test_tags)
    else:
        # Checkpoint locations (the HMM path is defined but not used here).
        hmm_model_path = './ckpts/hmm.pkl'
        crf_model_path = './ckpts/crf.pkl'
        bilstm_model_path = './ckpts/bilstm.pkl'
        bilstmcrf_model_path = './ckpts/bilstm_crf.pkl'

        # Whether to remove the O mark at the time of evaluation.
        remove_o = False

        def report(gold_tags, predicted_tags):
            # Print the score table and the confusion matrix for one model.
            metrics = Metrics(gold_tags, predicted_tags, remove_O=remove_o)
            metrics.report_scores()
            metrics.report_confusion_matrix()

        if y == 'crf':
            crf_model = load_model_1(crf_model_path)
            crf_pred = crf_model.test(test_words)
            report(test_tags, crf_pred)
        elif y == 'bilstm':
            bilstm_word2id, bilstm_tag2id = extend_maps(
                word2id, tag2id, for_crf=False)
            bilstm_model = load_model_1(bilstm_model_path)
            bilstm_model.model.bilstm.flatten_parameters()  # remove warning
            lstm_pred, target_tag_list = bilstm_model.test(
                test_words, test_tags, bilstm_word2id, bilstm_tag2id)
            report(target_tag_list, lstm_pred)
        elif y == 'bilstm-crf':
            crf_word2id, crf_tag2id = extend_maps(
                word2id, tag2id, for_crf=True)
            bilstm_model = load_model_1(bilstmcrf_model_path)
            # remove warning
            bilstm_model.model.bilstm.bilstm.flatten_parameters()
            test_words, test_tags = prepocess_data_for_lstmcrf(
                test_words, test_tags, test=True)
            lstmcrf_pred, target_tag_list = bilstm_model.test(
                test_words, test_tags, crf_word2id, crf_tag2id)
            report(target_tag_list, lstmcrf_pred)

    # NOTE(review): exit() is assumed to sit at function level, so both the
    # training and the evaluation paths end the process - confirm against
    # the original layout.
    exit()