def main():
    """Train and evaluate the tagger selected by the ``--model`` option.

    The three corpus splits (train/dev/test) are loaded first, then the
    branch matching the requested model is run. ``--model`` defaults to
    ``BILSTM``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        default="BILSTM",
        # Reject unknown names up front instead of loading the corpus and
        # then silently matching none of the branches below.
        choices=["HMM", "CRF", "BILSTM", "BILSTM-CRF", "ENSEMBLE"],
        help="model in [HMM,CRF,BILSTM,BILSTM-CRF,ENSEMBLE]",
        type=str)
    params = vars(parser.parse_args())
    model = params["model"]

    # Load the data.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # The return values of the train/eval helpers are not used by this
    # entry point, so they are not bound to local names.
    if model == "HMM":
        # Train and evaluate the HMM model.
        print("正在训练评估HMM模型...")
        hmm_train_eval((train_word_lists, train_tag_lists),
                       (test_word_lists, test_tag_lists),
                       word2id, tag2id)
    elif model == "CRF":
        # Train and evaluate the CRF model.
        print("正在训练评估CRF模型...")
        crf_train_eval((train_word_lists, train_tag_lists),
                       (test_word_lists, test_tag_lists))
    elif model == "BILSTM":
        # Train and evaluate the Bi-LSTM model.
        print("正在训练评估BI-LSTM模型...")
        # The LSTM model needs PAD and UNK added to word2id and tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        bilstm_train_and_eval((train_word_lists, train_tag_lists),
                              (dev_word_lists, dev_tag_lists),
                              (test_word_lists, test_tag_lists),
                              bilstm_word2id, bilstm_tag2id,
                              crf=False)
    elif model == "BILSTM-CRF":
        print("正在训练评估Bi-LSTM+CRF模型...")
        # With a CRF layer, <start> and <end> must be added as well
        # (they are needed during decoding).
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        # Some extra data processing is also required.
        train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
            train_word_lists, train_tag_lists)
        dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
            dev_word_lists, dev_tag_lists)
        test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
            test_word_lists, test_tag_lists, test=True)
        bilstm_train_and_eval((train_word_lists, train_tag_lists),
                              (dev_word_lists, dev_tag_lists),
                              (test_word_lists, test_tag_lists),
                              crf_word2id, crf_tag2id)
    elif model == "ENSEMBLE":
        # Model ensembling: this branch only prints an empty line.
        print()
def main():
    """Train and evaluate HMM, CRF, BiLSTM and BiLSTM+CRF, then ensemble them.

    Corpus splits are read from ``./data123``. Each model is trained and
    evaluated in turn, and the four prediction results are finally handed
    to ``ensemble_evaluate`` together with the test tags.
    """
    # Load the data.
    print("读取数据...")
    data_folder = "./data123"
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir=data_folder)
    dev_word_lists, dev_tag_lists = build_corpus(
        "dev", make_vocab=False, data_dir=data_folder)
    test_word_lists, test_tag_lists = build_corpus(
        "test", make_vocab=False, data_dir=data_folder)

    # Train and evaluate the HMM model.
    print("正在训练评估HMM模型...")
    hmm_pred = hmm_train_eval(
        (train_word_lists, train_tag_lists),
        (test_word_lists, test_tag_lists),
        word2id,
        tag2id,
    )

    # Train and evaluate the CRF model.
    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval(
        (train_word_lists, train_tag_lists),
        (test_word_lists, test_tag_lists),
    )

    # Train and evaluate the Bi-LSTM model.
    print("正在训练评估双向LSTM模型...")
    # The LSTM model needs PAD and UNK added to word2id and tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        (train_word_lists, train_tag_lists),
        (dev_word_lists, dev_tag_lists),
        (test_word_lists, test_tag_lists),
        bilstm_word2id,
        bilstm_tag2id,
        crf=False,
    )

    print("正在训练评估Bi-LSTM+CRF模型...")
    # With a CRF layer, <start> and <end> must be added as well
    # (they are needed during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Some extra data processing is also required; from here on only the
    # processed splits are used.
    crf_train_words, crf_train_tags = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    crf_dev_words, crf_dev_tags = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    crf_test_words, crf_test_tags = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (crf_train_words, crf_train_tags),
        (crf_dev_words, crf_dev_tags),
        (crf_test_words, crf_test_tags),
        crf_word2id,
        crf_tag2id,
    )

    # The ensemble is scored against the processed test tags.
    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, crf_test_tags)
def main():
    """Train and evaluate HMM, CRF, BiLSTM and BiLSTM+CRF, then ensemble them.

    Corpus splits are loaded with ``build_corpus``. Each model is trained
    and evaluated in turn; before the BiLSTM+CRF run the names in
    ``crf_tag2id`` are printed on one line. The four prediction results are
    finally passed to ``ensemble_evaluate`` together with the test tags.
    """
    # Load the data.
    print("读取数据中...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # Train and evaluate the HMM model.
    print("正在训练评估HMM模型")
    hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists),
                              word2id, tag2id)

    # Train and evaluate the CRF model.
    crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists))

    # Train and evaluate the BiLSTM model.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                      (dev_word_lists, dev_tag_lists),
                                      (test_word_lists, test_tag_lists),
                                      bilstm_word2id, bilstm_tag2id,
                                      crf=False)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # With a CRF layer, <start> and <end> must be added as well
    # (they are needed during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Iterating the mapping yields its keys directly; there is no need to
    # build (key, value) pairs just to throw the values away.
    print(' '.join(crf_tag2id))

    # Some extra data processing is also required.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                         (dev_word_lists, dev_tag_lists),
                                         (test_word_lists, test_tag_lists),
                                         crf_word2id, crf_tag2id)

    ensemble_evaluate([hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
                      test_tag_lists)
def main(args):
    """Train and evaluate all models, writing into ``ckpts/<args.name>``.

    Args:
        args: object with a ``name`` attribute; it names the sub-directory
            of ``ckpts`` that is passed to every train/eval helper.

    The run directory is created when missing, ``models/config.py`` is
    copied into it, and HMM, CRF, BiLSTM and BiLSTM+CRF are then trained
    and evaluated in turn before their predictions are ensembled.
    """
    run_dir = os.path.join('ckpts', args.name)
    # NOTE(review): the flattened source does not show where the body of
    # this `if` ends; this layout assumes chmod belongs to the creation
    # branch and the config copy always runs -- confirm.
    if not os.path.isdir(run_dir):
        os.makedirs(run_dir)
        os.chmod(run_dir, 0o775)
    shutil.copy2('models/config.py', run_dir)

    # Load the data.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", fix_length=-1)
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # Train and evaluate the HMM model.
    print("正在训练评估HMM模型...")
    hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists),
                              word2id, tag2id, run_dir)

    # Train and evaluate the CRF model.
    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                              (test_word_lists, test_tag_lists),
                              run_dir)

    # Train and evaluate the Bi-LSTM model.
    print("正在训练评估双向LSTM模型...")
    # The LSTM model needs PAD and UNK added to word2id and tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                      (dev_word_lists, dev_tag_lists),
                                      (test_word_lists, test_tag_lists),
                                      bilstm_word2id, bilstm_tag2id,
                                      run_dir, crf=False)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # With a CRF layer, <start> and <end> must be added as well
    # (they are needed during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Some extra data processing is also required; from here on only the
    # processed splits are used.
    crf_train_words, crf_train_tags = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    crf_dev_words, crf_dev_tags = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    crf_test_words, crf_test_tags = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval((crf_train_words, crf_train_tags),
                                         (crf_dev_words, crf_dev_tags),
                                         (crf_test_words, crf_test_tags),
                                         crf_word2id, crf_tag2id, run_dir)

    # The ensemble is scored against the processed test tags.
    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, crf_test_tags)
def main(*,
         do_hmm_in_main: bool = False,
         do_crf_in_main: bool = False,
         do_bilstm_in_main: bool = False,
         do_bilstmcrf_in_main: bool = True,
         do_ensemble_in_main: bool = False,
         ner_data_dir: str = "./datasets/FA_NER_Data_IOB") -> None:
    """Train and evaluate the selected models.

    Every parameter is keyword-only and defaults to the value that used to
    be hard-coded in the body, so calling ``main()`` behaves as before.

    Args:
        do_hmm_in_main: train and evaluate the HMM model.
        do_crf_in_main: train and evaluate the CRF model.
        do_bilstm_in_main: train and evaluate the Bi-LSTM model.
        do_bilstmcrf_in_main: train and evaluate the Bi-LSTM-CRF model.
        do_ensemble_in_main: pass the results of the models that were run
            to ``ensemble_evaluate``.
        ner_data_dir: directory handed to ``build_corpus`` as ``data_dir``.
    """
    # Results of the models that actually ran, in execution order.
    ensemble_model_list = []

    # Data
    print("Reading data:")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir=ner_data_dir)
    dev_word_lists, dev_tag_lists = build_corpus(
        "dev", make_vocab=False, data_dir=ner_data_dir)
    test_word_lists, test_tag_lists = build_corpus(
        "test", make_vocab=False, data_dir=ner_data_dir)

    print("len(train_word_lists):", len(train_word_lists))
    print("len(word2id=vocab):", len(word2id))

    if do_hmm_in_main:
        # Training and evaluating the HMM model
        print("Training and Evaluating HMM model:")
        hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists),
                                  word2id, tag2id)
        ensemble_model_list.append(hmm_pred)

    if do_crf_in_main:
        # Training and evaluating the CRF model
        print("Training and evaluating CRF model:")
        crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists))
        ensemble_model_list.append(crf_pred)

    if do_bilstm_in_main:
        # Training and evaluating the Bi-LSTM model
        print("Training and evaluating Bi-LSTM model:")
        # 'PAD' and 'UNK' must be added to word2id and tag2id when training
        # the LSTM model.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                          (dev_word_lists, dev_tag_lists),
                                          (test_word_lists, test_tag_lists),
                                          bilstm_word2id, bilstm_tag2id,
                                          crf=False)
        ensemble_model_list.append(lstm_pred)

    if do_bilstmcrf_in_main:
        # Training and evaluating the Bi-LSTM+CRF model
        print("Training and evaluating Bi-LSTM-CRF model:")
        # <start> and <end> must be added when the LSTM model is combined
        # with a CRF (they are used during decoding).
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        # Extra data processing for the CRF variant.
        train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
            train_word_lists, train_tag_lists)
        dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
            dev_word_lists, dev_tag_lists)
        test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
            test_word_lists, test_tag_lists, test=True)
        lstmcrf_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists),
            (test_word_lists, test_tag_lists),
            crf_word2id, crf_tag2id,
            remove_O=False, reload_model=True)
        ensemble_model_list.append(lstmcrf_pred)

    if do_ensemble_in_main:
        ensemble_evaluate(ensemble_model_list, test_tag_lists)
def main():
    """Train and evaluate the models selected on the command line.

    Flags ``--hmm``, ``--crf``, ``--bilstm`` and ``--bilstm-crf`` each
    enable one model. ``--cbow`` together with ``--bilstm-crf`` loads the
    CBOW embedding from ``ckpts/cbow.pkl`` and passes it to the
    BiLSTM-CRF run; ``--cbow`` without ``--bilstm-crf`` trains the CBOW
    model instead.
    """
    import argparse

    parser = argparse.ArgumentParser(description='main.py')
    # `store_true` flags already default to False.
    parser.add_argument('--hmm', action='store_true', help='Train HMM')
    parser.add_argument('--crf', action='store_true', help='Train CRF')
    parser.add_argument('--bilstm', action='store_true', help='Train BiLSTM')
    parser.add_argument('--bilstm-crf', action='store_true',
                        help='Train BiLSTM-CRF')
    parser.add_argument('--cbow', action='store_true',
                        help='Train or use CBOW embedding for BiLSTM-CRF')
    args = parser.parse_args()

    # Load the data.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # The return values of the train/eval helpers are not used by this
    # entry point, so they are not bound to local names.

    # Train and evaluate the HMM model.
    if args.hmm:
        print("正在训练评估HMM模型...")
        hmm_train_eval((train_word_lists, train_tag_lists),
                       (test_word_lists, test_tag_lists),
                       word2id, tag2id)

    # Train and evaluate the CRF model.
    if args.crf:
        print("正在训练评估CRF模型...")
        crf_train_eval((train_word_lists, train_tag_lists),
                       (test_word_lists, test_tag_lists))

    if args.bilstm:
        # Train and evaluate the Bi-LSTM model.
        print("正在训练评估双向LSTM模型...")
        # The LSTM model needs PAD and UNK added to word2id and tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        bilstm_train_and_eval((train_word_lists, train_tag_lists),
                              (dev_word_lists, dev_tag_lists),
                              (test_word_lists, test_tag_lists),
                              bilstm_word2id, bilstm_tag2id,
                              crf=False)

    # With a CRF layer, <start> and <end> must be added as well (they are
    # needed during decoding). This runs unconditionally because both
    # branches below read crf_word2id and the processed training words.
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Some extra data processing is also required.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)

    if args.bilstm_crf:
        print("正在训练评估Bi-LSTM+CRF模型...")
        cbow_emb = None
        if args.cbow:
            print('Loading CBOW model')
            cbow_model = load_model('ckpts/cbow.pkl')
            cbow_emb = cbow_model.model.lookup_embedding()
            # Only the embedding is needed from here on; drop the
            # reference to the loaded model.
            del cbow_model
        bilstm_train_and_eval((train_word_lists, train_tag_lists),
                              (dev_word_lists, dev_tag_lists),
                              (test_word_lists, test_tag_lists),
                              crf_word2id, crf_tag2id,
                              cbow_emb=cbow_emb)
    elif args.cbow:
        print("正在训练CBOW模型...")
        cbow.CBOW_Model(len(crf_word2id)).train(train_word_lists, crf_word2id)