コード例 #1
0
ファイル: main.py プロジェクト: OpenNLPhub/ChineseNER
def HMM_test_standard(if_train=True):
    """Train or load the standard HMM tagger, then evaluate it on ResumeNER.

    Args:
        if_train: If true, always train a new model and save it. If false,
            load the model stored at ``ModelPathConfig.hmm_standard`` when
            that path exists, and train only when it does not.

    The outputs of ``evaluate_entity_label`` and ``evaluate_single_label``
    are converted with ``unitstopd``, written to the CSV paths configured in
    ``ResultPathConfig`` and printed as psql-style tables.
    """
    has_saved_model = os.path.exists(ModelPathConfig.hmm_standard)

    print("upload data!")
    train_words, train_tags, word2id, tag2id = build_corpus(
        "train", data_dir=os.path.join(os.getcwd(), "data", 'ResumeNER'))
    eval_words, eval_tags, _, _ = build_corpus(
        "test", data_dir=os.path.join(os.getcwd(), "data", 'ResumeNER'))

    # The fresh instance is only kept when training; loading replaces it.
    tagger = HMM_standard(len(tag2id), len(word2id))

    if not if_train and has_saved_model:
        print("load model")
        tagger = load_model(ModelPathConfig.hmm_standard)
    else:
        print("start to training")
        tagger.train(train_words, train_tags, word2id, tag2id)
        print("save the model")
        save_model(tagger, ModelPathConfig.hmm_standard)

    predicted = tagger.test(eval_words, word2id, tag2id)
    gold = eval_tags

    # Report produced by evaluate_entity_label
    entity_units = evaluate_entity_label(predicted, gold,
                                         list(tag2id.keys()))
    entity_table = unitstopd(entity_units)
    entity_table.to_csv(ResultPathConfig.hmm_entity_standard)
    print(tabulate(entity_table, headers='keys', tablefmt='psql'))

    # Report produced by evaluate_single_label
    label_units = evaluate_single_label(predicted, gold,
                                        list(tag2id.keys()))
    label_table = unitstopd(label_units)
    label_table.to_csv(ResultPathConfig.hmm_model_standard)
    print(tabulate(label_table, headers='keys', tablefmt='psql'))
コード例 #2
0
def main(mode='train'):
    """Train and evaluate, or only run, the Bi-LSTM+CRF model.

    Args:
        mode: ``'train'`` (the default, matching the previously hard-coded
            behaviour) calls ``bilstm_train_and_eval``; ``'generate'``
            calls ``bilstm_eval`` on the test split and prints its result.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'generate'``.
    """
    # Fail before any data is loaded instead of silently doing nothing.
    if mode not in ('train', 'generate'):
        raise ValueError(
            "mode must be 'train' or 'generate', got {!r}".format(mode))

    # Load the data splits
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # An LSTM with a CRF layer also needs <start> and <end> in the maps
    # (they are used during decoding)
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra data preparation required by the Bi-LSTM+CRF model
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)

    if mode == 'train':
        # Only the Bi-LSTM+CRF model is trained here; the HMM, CRF and
        # plain Bi-LSTM runs are not part of this entry point.
        print("正在训练评估Bi-LSTM+CRF模型...")

        lstmcrf_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists), (test_word_lists, test_tag_lists),
            crf_word2id, crf_tag2id)
    else:  # mode == 'generate'
        lstmcrf_pred = bilstm_eval((test_word_lists, test_tag_lists),
                                   crf_word2id, crf_tag2id)
        print(lstmcrf_pred)
コード例 #3
0
ファイル: main.py プロジェクト: OpenNLPhub/ChineseNER
def bilstm_crf_test(if_train=False):
    """Train or load the BiLSTM-CRF tagger and evaluate it on the test split.

    A random fifth of the training sentences is held out as the dev set.

    Args:
        if_train: If true, always train and save a new model. If false, the
            model at ``ModelPathConfig.bilstm_crf`` is loaded when it exists
            and a new one is trained otherwise.

    The outputs of ``evaluate_entity_label`` and ``evaluate_single_label``
    are written to the CSV paths in ``ResultPathConfig`` and printed.
    """
    model_is_existed = os.path.exists(ModelPathConfig.bilstm_crf)

    print("upload data!")
    word_lists, tag_lists, word2id, tag2id = build_corpus("train")
    test_word_lists, test_tag_lists, _, _ = build_corpus("test")
    labels = list(tag2id.keys())

    dev_indices = random.sample(range(len(word_lists)), len(word_lists) // 5)
    # Set membership keeps the split linear in the corpus size; testing
    # against the list made it quadratic.
    dev_index_set = set(dev_indices)
    train_indices = [
        i for i in range(len(word_lists)) if i not in dev_index_set
    ]

    dev_word_lists = [word_lists[ind] for ind in dev_indices]
    dev_tag_lists = [tag_lists[ind] for ind in dev_indices]
    train_word_lists = [word_lists[ind] for ind in train_indices]
    train_tag_lists = [tag_lists[ind] for ind in train_indices]
    test_word_lists, test_tag_lists = add_label_for_lstmcrf(test_word_lists,
                                                            test_tag_lists,
                                                            test=True)
    bilstm_crf_word2id, bilstm_crf_tag2id = extend_map(word2id,
                                                       tag2id,
                                                       crf=True)

    if if_train or not model_is_existed:
        print('start to training')
        train_word_lists, train_tag_lists = add_label_for_lstmcrf(
            train_word_lists, train_tag_lists, test=False)
        dev_word_lists, dev_tag_lists = add_label_for_lstmcrf(dev_word_lists,
                                                              dev_tag_lists,
                                                              test=False)

        start = datetime.now()
        vocab_size = len(bilstm_crf_word2id)
        # NOTE(review): the size comes from tag2id while training receives
        # bilstm_crf_tag2id - confirm that extend_map updates tag2id in
        # place, otherwise the two sizes differ.
        out_size = len(tag2id)

        bilstm_model = BiLSTM_CRF_Model(vocab_size, out_size, crf=True)
        bilstm_model.train(train_word_lists, train_tag_lists,
                           bilstm_crf_word2id, bilstm_crf_tag2id,
                           dev_word_lists, dev_tag_lists)
        deltatime = datetime.now() - start

        # total_seconds() includes whole days; the .seconds attribute alone
        # wraps around after 24 hours.
        print("Training is finished, {} second".format(
            int(deltatime.total_seconds())))
        save_model(bilstm_model, ModelPathConfig.bilstm_crf)
        print("Save the model")
    else:
        print("load model")
        bilstm_model = load_model(ModelPathConfig.bilstm_crf)

    print("test the model")
    pred_tag_lists, label_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, bilstm_crf_word2id, bilstm_crf_tag2id)

    units = evaluate_entity_label(pred_tag_lists, label_tag_lists, labels)
    df = unitstopd(units)
    df.to_csv(ResultPathConfig.bilstm_crf_entity)
    print(tabulate(df, headers='keys', tablefmt='psql'))

    units = evaluate_single_label(pred_tag_lists, label_tag_lists, labels)
    df = unitstopd(units)
    df.to_csv(ResultPathConfig.bilstm_crf_model)
    print(tabulate(df, headers='keys', tablefmt='psql'))
コード例 #4
0
def main():
    """Tag one hard-coded sentence with the saved Bi-LSTM+CRF model.

    The sentence is split into single characters, passed to the model loaded
    from ``BiLSTMCRF_MODEL_PATH``, and every character is printed together
    with its predicted tag id and the tag name looked up from that id.
    """
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    word_string = "傅城州,博士,广东药科大学医药信息工程学院教师,中国计算机学会会员,中国计算机学会青年计算机科技论坛(YOCSEF)广州分论坛AC委员、学术秘书。本科和硕士(推免)毕业于华南师范大学计算机学院软件工程专业,2017年6月获得华南师范大学服务计算理论与技术理学博士学位(导师:汤庸教授)。"
    # The loaded test split is replaced by this single sentence, one
    # character per token.
    test_word_lists = [list(word_string)]

    print("加载并评估bilstm+crf模型...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
    # prepocess_data_for_lstmcrf is deliberately not applied to the input.
    print(crf_tag2id)
    id2tag = {tag_id: name for name, tag_id in crf_tag2id.items()}
    lstmcrf_pred = bilstm_model.test(test_word_lists, crf_word2id, crf_tag2id)
    for word_list, tag_list in zip(test_word_lists, lstmcrf_pred):
        for word, tag in zip(word_list, tag_list):
            tag_id = tag.item()
            print(word, " ", tag_id, " ", id2tag[tag_id])

    print(lstmcrf_pred)
コード例 #5
0
def bert_test():
    """Train or load the BERT tagger and evaluate it on the test split.

    A model is trained only when nothing exists at ``ModelPathConfig.bert``;
    otherwise the stored model is loaded. A random fifth of the training
    sentences is held out as the dev set.

    A failure to save the freshly trained model is reported and evaluation
    continues with the in-memory model. A failure to load a stored model is
    reported and terminates the process with exit status 1.
    """
    model_is_exitsed = os.path.exists(ModelPathConfig.bert)

    print("upload data!")

    word_lists, tag_lists, word2id, tag2id = build_corpus("train")
    test_word_lists, test_tag_lists, _, _ = build_corpus("test")

    labels = list(tag2id.keys())

    dev_indices = random.sample(range(len(word_lists)), len(word_lists) // 5)
    # Set membership keeps the split linear in the corpus size; testing
    # against the list made it quadratic.
    dev_index_set = set(dev_indices)
    train_indices = [
        i for i in range(len(word_lists)) if i not in dev_index_set
    ]

    dev_word_lists = [word_lists[ind] for ind in dev_indices]
    dev_tag_lists = [tag_lists[ind] for ind in dev_indices]
    train_word_lists = [word_lists[ind] for ind in train_indices]
    train_tag_lists = [tag_lists[ind] for ind in train_indices]

    bert_tag2id = extend_map_bert(tag2id)

    if not model_is_exitsed:
        print('start to training')
        start = datetime.now()
        vocab_size = len(word2id)
        out_size = len(bert_tag2id)
        bert_model = BERT_Model(vocab_size, out_size)
        bert_model.train(train_word_lists, train_tag_lists,
                         word2id, bert_tag2id, dev_word_lists, dev_tag_lists)

        deltatime = datetime.now() - start
        # total_seconds() includes whole days; the .seconds attribute alone
        # wraps around after 24 hours.
        print("Training is finished, {} second".format(
            int(deltatime.total_seconds())))
        try:
            print("Save the model")
            save_model(bert_model, ModelPathConfig.bert)
        except Exception as err:
            # Best effort: keep evaluating the in-memory model, but do not
            # hide why saving failed. KeyboardInterrupt and SystemExit are
            # no longer swallowed.
            print("fail to save model")
            print(err)

    else:
        try:
            print("load model")
            bert_model = load_model(ModelPathConfig.bert)
        except Exception as err:
            print("fail to load model")
            print(err)
            # A non-zero status tells callers and shells that the run failed.
            sys.exit(1)

    print("test the model")
    pred_tag_lists = bert_model.test(test_word_lists, test_tag_lists, word2id,
                                     bert_tag2id)

    label_tag_lists = test_tag_lists

    units = evaluate_entity_label(pred_tag_lists, label_tag_lists, labels)
    df = unitstopd(units)
    df.to_csv(ResultPathConfig.bert_entity)
    print(tabulate(df, headers='keys', tablefmt='psql'))

    units = evaluate_single_label(pred_tag_lists, label_tag_lists, labels)
    df = unitstopd(units)
    df.to_csv(ResultPathConfig.bert_model)
    print(tabulate(df, headers='keys', tablefmt='psql'))
コード例 #6
0
def main():
    """Train a BiLSTM tagger, save it to ./save/bilstm.pkl and score it.

    The trained model is run on the test split and the result is handed to
    ``Metrics`` with ``remove_O=False``.
    """
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # NOTE(review): the returned maps are never used below; the model is
    # sized and trained with word2id/tag2id. Confirm whether extend_maps
    # updates its arguments in place.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id)

    # Start time of training; nothing reads it at the moment.
    start = time.time()
    bilstm_model = BILSTM_Model(len(word2id), len(tag2id))
    bilstm_model.train(train_word_lists, train_tag_lists,
                       dev_word_lists, dev_tag_lists,
                       word2id, tag2id)
    model_name = "bilstm"
    model_path = "./save/" + model_name + ".pkl"
    save_model(bilstm_model, model_path)

    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, word2id, tag2id)
    print("cal the res...")
    metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=False)
    metrics.haha()
コード例 #7
0
def main():
    """Train and evaluate the model selected with ``--model``.

    The command line accepts ``--model`` with one of HMM, CRF, BILSTM (the
    default), BILSTM-CRF or ENSEMBLE. Any other value is rejected by
    argparse instead of loading the data and then silently doing nothing.
    ENSEMBLE currently only prints an empty line.
    """
    model_choices = ("HMM", "CRF", "BILSTM", "BILSTM-CRF", "ENSEMBLE")

    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        default="BILSTM",
                        choices=model_choices,
                        help="model in [HMM,CRF,BILSTM,BILSTM-CRF,ENSEMBLE]",
                        type=str)

    params = vars(parser.parse_args())

    # Load the data splits
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    if params["model"] == "HMM":
        # Train and evaluate the HMM model
        print("正在训练评估HMM模型...")
        hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists), word2id,
                                  tag2id)
    elif params["model"] == "CRF":
        # Train and evaluate the CRF model
        print("正在训练评估CRF模型...")
        crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists))
    elif params["model"] == "BILSTM":
        # Train and evaluate the Bi-LSTM model
        print("正在训练评估BI-LSTM模型...")
        # Training the LSTM requires PAD and UNK in word2id and tag2id
        bilstm_word2id, bilstm_tag2id = extend_maps(word2id,
                                                    tag2id,
                                                    for_crf=False)
        lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                          (dev_word_lists, dev_tag_lists),
                                          (test_word_lists, test_tag_lists),
                                          bilstm_word2id,
                                          bilstm_tag2id,
                                          crf=False)
    elif params["model"] == "BILSTM-CRF":
        print("正在训练评估Bi-LSTM+CRF模型...")
        # An LSTM with a CRF layer also needs <start> and <end> in the maps
        # (they are used during decoding)
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        # Extra data preparation required by the Bi-LSTM+CRF model
        train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
            train_word_lists, train_tag_lists)
        dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
            dev_word_lists, dev_tag_lists)
        test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
            test_word_lists, test_tag_lists, test=True)
        lstmcrf_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists), (test_word_lists, test_tag_lists),
            crf_word2id, crf_tag2id)
    elif params["model"] == "ENSEMBLE":
        # Model ensembling is not implemented yet
        print()
コード例 #8
0
def main():
    """Evaluate the saved HMM, CRF, BiLSTM and BiLSTM+CRF models.

    Each model is loaded from its configured path, run on the test split
    and reported through ``Metrics``. The HMM is additionally run on the
    sentences loaded from ``development_2.txt`` and those predictions are
    passed to ``output_pred``. Finally the four test predictions are handed
    to ``ensemble_evaluate``.
    """
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)
    dev_word_lists_, dev_word_lists_raw, article_id = loadDevFile("development_2.txt")

    print("加载并评估hmm模型...")
    hmm_model = load_model(HMM_MODEL_PATH)
    # hmm_pred feeds both the HMM metrics and the ensemble below; with this
    # call commented out the function failed with NameError.
    hmm_pred = hmm_model.test(test_word_lists,
                              word2id,
                              tag2id)
    hmm_pred_dev = hmm_model.test(dev_word_lists_,
                                  word2id,
                                  tag2id)
    output_pred(hmm_pred_dev, article_id, dev_word_lists_raw)
    metrics = Metrics(test_tag_lists, hmm_pred, remove_O=REMOVE_O)
    metrics.report_scores()  # print precision, recall and f1 for each tag
    metrics.report_confusion_matrix()  # print the confusion matrix

    # Load and evaluate the CRF model
    print("加载并评估crf模型...")
    crf_model = load_model(CRF_MODEL_PATH)
    crf_pred = crf_model.test(test_word_lists)
    metrics = Metrics(test_tag_lists, crf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    # BiLSTM model
    print("加载并评估bilstm模型...")
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    bilstm_model = load_model(BiLSTM_MODEL_PATH)
    bilstm_model.model.bilstm.flatten_parameters()  # remove warning
    lstm_pred, target_tag_list = bilstm_model.test(test_word_lists, test_tag_lists,
                                                   bilstm_word2id, bilstm_tag2id)
    metrics = Metrics(target_tag_list, lstm_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    # BiLSTM+CRF model
    print("加载并评估bilstm+crf模型...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True
    )
    lstmcrf_pred, target_tag_list = bilstm_model.test(test_word_lists, test_tag_lists,
                                                      crf_word2id, crf_tag2id)
    metrics = Metrics(target_tag_list, lstmcrf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    ensemble_evaluate(
        [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
        test_tag_lists
    )
コード例 #9
0
def main():
    """Evaluate the saved Bi-LSTM+CRF model on ten training sentences.

    The gold and predicted tag lists are printed, followed by the score
    report and confusion matrix from ``Metrics``.
    """
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    # The evaluation data is the training split, loaded a second time
    # without building the vocabulary.
    test_word_lists, test_tag_lists = build_corpus("train", make_vocab=False)

    # The HMM, CRF and plain BiLSTM evaluations are disabled in this script.

    print("加载并评估bilstm+crf模型...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning

    # Only the first ten sentences are evaluated.
    sample_words = test_word_lists[:10]
    sample_tags = test_tag_lists[:10]
    sample_words, sample_tags = prepocess_data_for_lstmcrf(
        sample_words, sample_tags, test=True)
    lstmcrf_pred, target_tag_list = bilstm_model.test(
        sample_words, sample_tags, crf_word2id, crf_tag2id)

    print(target_tag_list)
    print(lstmcrf_pred)

    metrics = Metrics(target_tag_list, lstmcrf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()
コード例 #10
0
def main():
    """Train and evaluate HMM, CRF, Bi-LSTM and Bi-LSTM+CRF, then ensemble.

    All splits are read from ``./data123``. The four prediction lists are
    passed to ``ensemble_evaluate`` together with the test tags as returned
    by ``prepocess_data_for_lstmcrf``.
    """
    # Load the data splits
    print("读取数据...")
    data_folder = "./data123"
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir=data_folder)
    dev_word_lists, dev_tag_lists = build_corpus(
        "dev", make_vocab=False, data_dir=data_folder)
    test_word_lists, test_tag_lists = build_corpus(
        "test", make_vocab=False, data_dir=data_folder)

    train_data = (train_word_lists, train_tag_lists)
    dev_data = (dev_word_lists, dev_tag_lists)
    test_data = (test_word_lists, test_tag_lists)

    # Train and evaluate the HMM model
    print("正在训练评估HMM模型...")
    hmm_pred = hmm_train_eval(train_data, test_data, word2id, tag2id)

    # Train and evaluate the CRF model
    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval(train_data, test_data)

    # Train and evaluate the Bi-LSTM model
    print("正在训练评估双向LSTM模型...")
    # Training the LSTM requires PAD and UNK in word2id and tag2id
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(train_data,
                                      dev_data,
                                      test_data,
                                      bilstm_word2id,
                                      bilstm_tag2id,
                                      crf=False)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # An LSTM with a CRF layer also needs <start> and <end> in the maps
    # (they are used during decoding)
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra data preparation required by the Bi-LSTM+CRF model
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (train_word_lists, train_tag_lists),
        (dev_word_lists, dev_tag_lists),
        (test_word_lists, test_tag_lists),
        crf_word2id,
        crf_tag2id)

    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, test_tag_lists)
コード例 #11
0
def main():
    """Evaluate the saved HMM, CRF, BiLSTM and BiLSTM+CRF models.

    Every model is loaded from its configured path, run on the test split
    and reported through ``Metrics``; the four prediction lists are then
    passed to ``ensemble_evaluate``.
    """

    def report(gold_tags, predicted_tags):
        # Print the per-tag scores followed by the confusion matrix.
        metrics = Metrics(gold_tags, predicted_tags, remove_O=REMOVE_O)
        metrics.report_scores()
        metrics.report_confusion_matrix()

    print("Read data...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # HMM model
    print("Load and evaluate the hmm model...")
    hmm_model = load_model(HMM_MODEL_PATH)
    hmm_pred = hmm_model.test(test_word_lists, word2id, tag2id)
    report(test_tag_lists, hmm_pred)

    # CRF model
    print("Load and evaluate the crf model...")
    crf_model = load_model(CRF_MODEL_PATH)
    crf_pred = crf_model.test(test_word_lists)
    report(test_tag_lists, crf_pred)

    # BiLSTM model
    print("Load and evaluate the bilstm model...")
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    bilstm_model = load_model(BiLSTM_MODEL_PATH)
    bilstm_model.model.bilstm.flatten_parameters()  # remove warning
    lstm_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, bilstm_word2id, bilstm_tag2id)
    report(target_tag_list, lstm_pred)

    # BiLSTM+CRF model
    print("Load and evaluate the bilstm+crf model...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)
    report(target_tag_list, lstmcrf_pred)

    ensemble_evaluate([hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
                      test_tag_lists)
コード例 #12
0
ファイル: main.py プロジェクト: zxmwd2/chinese-sequence-ner
def main():
    """Train and evaluate HMM, CRF, Bi-LSTM and Bi-LSTM+CRF, then ensemble.

    The four prediction lists are passed to ``ensemble_evaluate`` together
    with the test tags as returned by ``prepocess_data_for_lstmcrf``.
    """
    # Load the data splits
    print("读取数据中...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    train_data = (train_word_lists, train_tag_lists)
    dev_data = (dev_word_lists, dev_tag_lists)
    test_data = (test_word_lists, test_tag_lists)

    # Train and evaluate the HMM model
    print("正在训练评估HMM模型")
    hmm_pred = hmm_train_eval(train_data, test_data, word2id, tag2id)

    # Train and evaluate the CRF model
    crf_pred = crf_train_eval(train_data, test_data)

    # Train and evaluate the Bi-LSTM model
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(train_data,
                                      dev_data,
                                      test_data,
                                      bilstm_word2id,
                                      bilstm_tag2id,
                                      crf=False)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # An LSTM with a CRF layer also needs <start> and <end> in the maps
    # (they are used during decoding)
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Show every tag name of the extended tag map on one line
    print(' '.join([tag for tag, _ in crf_tag2id.items()]))
    # Extra data preparation required by the Bi-LSTM+CRF model
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (train_word_lists, train_tag_lists),
        (dev_word_lists, dev_tag_lists),
        (test_word_lists, test_tag_lists),
        crf_word2id,
        crf_tag2id)

    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, test_tag_lists)
コード例 #13
0
ファイル: main.py プロジェクト: meihuaershiqi/NLP
def main():
    """Print a note about the data set, then train and evaluate the CRF model."""

    # The note (in Chinese) says the teacher-provided data set was not used
    # because O tags made up 92.77% of it and training went poorly, and
    # names the repository the replacement data set was taken from.
    notice = '''
        ####
        没有使用老师提供的数据集,O标签太多(占比92.77%),模型训练效果不好
        新数据集取自
        https://github.com/luopeixiang/named_entity_recognition
        ####'''
    print(notice, '\n')

    # Load the data splits
    print("读取数据...\n")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # Train and evaluate the CRF model
    print("训练并评估CRF模型...\n")
    train_data = (train_word_lists, train_tag_lists)
    test_data = (test_word_lists, test_tag_lists)
    crf_pred = crf_train_eval(train_data, test_data)
コード例 #14
0
ファイル: dataTest.py プロジェクト: OpenNLPhub/ChineseNER
def Test_build_corpus():
    """Print summary counts and up to ten samples of the training corpus.

    For each sample the joined words and the joined tags are printed on
    separate lines; the tag-to-id map is printed last. Corpora with fewer
    than ten items are printed in full instead of raising IndexError.
    """
    word_lists, tag_lists, word2id, tag2id = build_corpus('train')
    print("Training item number:{},tag number :{}, word number:{}"\
        .format(len(word_lists),len(tag2id),len(word2id)))
    print("Data Example:")
    # Slicing caps the preview at ten items without assuming that many exist.
    for word_list, tag_list in zip(word_lists[:10], tag_lists[:10]):
        print(''.join(word_list))
        print(''.join(tag_list))
    print(tag2id)
コード例 #15
0
def main():
    """Train and evaluate the CRF model on the corpus under ``./Drug``."""
    corpus_dir = "./Drug"

    # Load the train and test splits (the dev split is not loaded here)
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir=corpus_dir)
    test_word_lists, test_tag_lists = build_corpus(
        "test", make_vocab=False, data_dir=corpus_dir)

    # HMM training and evaluation are disabled in this script.

    # Train and evaluate the CRF model
    print("正在训练评估CRF模型...")
    train_data = (train_word_lists, train_tag_lists)
    test_data = (test_word_lists, test_tag_lists)
    crf_pred = crf_train_eval(train_data, test_data)
コード例 #16
0
def main():
    """Train HMM, CRF, BiLSTM and BiLSTM-CRF models one after another.

    The HMM and CRF predictions on the test split are scored with
    ``Metrics``; the two neural models are trained, run through their own
    ``dev_test`` method on the test split, and then closed.
    """

    def report(predicted_tags):
        # Score predictions against the gold test tags.
        Metrics(test_tag_lists, predicted_tags).report_scores()

    print('读取数据...')
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus('train')
    # NOTE(review): the keyword is spelled "maek_vocab" - confirm that this
    # matches the parameter name in build_corpus.
    dev_word_lists, dev_tag_lists = build_corpus('dev', maek_vocab=False)
    test_word_lists, test_tag_lists = build_corpus('test', maek_vocab=False)

    print('训练HMM模型...')
    hmm_model = HMMModel(len(tag2id), len(word2id))
    hmm_model.train(train_word_lists, train_tag_lists, word2id, tag2id)
    report(hmm_model.test(test_word_lists, word2id, tag2id))

    print('训练CRF模型...')
    crf_model = CRFModel(max_iterations=90)
    crf_model.train(train_word_lists, train_tag_lists)
    report(crf_model.test(test_word_lists))

    print('训练BiLSTM模型...')
    # From here on both neural models use the maps returned by extend_maps.
    word2id, tag2id = extend_maps(word2id, tag2id)
    bilstm = BiLSTM(len(word2id), len(tag2id))
    bilstm.train(train_word_lists, train_tag_lists,
                 dev_word_lists, dev_tag_lists,
                 word2id, tag2id, 0.8)
    bilstm.dev_test(test_word_lists, test_tag_lists, word2id, tag2id)
    bilstm.close_sess()

    print('训练BiLSTM-CRF模型...')
    bilstm_crf = BiLSTM_CRF(len(word2id), len(tag2id))
    bilstm_crf.train(train_word_lists, train_tag_lists,
                     dev_word_lists, dev_tag_lists,
                     word2id, tag2id, 0.8)
    bilstm_crf.dev_test(test_word_lists, test_tag_lists, word2id, tag2id)
    bilstm_crf.close_sess()
コード例 #17
0
def main(corpus_dir, test_file):
    """Tag the sentences of ``test_file`` with the saved Bi-LSTM+CRF model.

    Args:
        corpus_dir: Directory passed to ``build_corpus`` as ``data_dir`` for
            the training split, from which the word and tag maps come.
        test_file: File passed to ``build_corpus_test``.

    For every predicted tag the matching word and the tag are printed on one
    line, separated by a tab; an empty line follows each sentence.
    """
    # Load the training split (for the maps) and the sentences to tag
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir=corpus_dir)
    test_word_lists, test_tag_lists = build_corpus_test(test_file)

    # Load the Bi-LSTM+CRF model and run it
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)

    for sent_idx, sentence_tags in enumerate(lstmcrf_pred):
        for tok_idx, tag in enumerate(sentence_tags):
            print(test_word_lists[sent_idx][tok_idx] + "\t" + tag)
        print()
コード例 #18
0
                        help='number of epochs')
    args = parser.parse_args()

    ## maybe load the model...
    if os.path.exists(args.save_dir):
        print("Loading Model...")
        model, vocab, index, metadata = load_model(args.save_dir)
        ## count parameters
        n_params = sum(np.prod(p.shape) for p in model.parameters())
        print("Using Model with %i Parameters" % n_params)

        print("Loading Dataset...")
        ## build training pipeline
        dataset = load_dataset(args.data_dir, tokenize=True)
        dataset = {k: v[1024:-4096] for k, v in dataset.items()}
        corpus = build_corpus(dataset, index, min_length=args.seq)
        pipeline = build_pipeline(corpus, n_seq=args.seq, n_batch=args.batch)
        print("Using %i Documents and %i Words" % (len(corpus), len(vocab)))

    else:
        print("Loading Dataset...")
        ## load data and build vocabulary
        dataset = load_dataset(args.data_dir, tokenize=True)
        dataset = {k: v[1024:-4096] for k, v in dataset.items()}
        vocab, index, lookup = build_vocab(dataset, args.term_freq,
                                           args.doc_freq)
        ## build corpus
        corpus = build_corpus(dataset, index, min_length=args.seq)
        ## build training pipeline
        pipeline = build_pipeline(corpus, n_seq=args.seq, n_batch=args.batch)
        ## save vocab
コード例 #19
0
def main():
    """Train and evaluate the BiLSTM model on the train/dev/test splits.

    Only the BiLSTM stage (``crf=False``) runs in this variant; the HMM and
    CRF stages are not executed.
    """
    print("读取数据...")
    (train_words, train_data, train_word_labels, train_data_labels,
     train_data_ptrs, word2id, data2id) = build_corpus("train")
    (dev_words, dev_data, dev_word_labels, dev_data_labels,
     dev_data_ptrs) = build_corpus("dev", make_vocab=False)
    (test_words, test_data, test_word_labels, test_data_labels,
     test_data_ptrs) = build_corpus("test", make_vocab=False)

    # Bundle each split in the order bilstm_train_and_eval receives it.
    train_split = (train_words, train_data, train_word_labels,
                   train_data_labels, train_data_ptrs)
    dev_split = (dev_words, dev_data, dev_word_labels,
                 dev_data_labels, dev_data_ptrs)
    test_split = (test_words, test_data, test_word_labels,
                  test_data_labels, test_data_ptrs)

    # Train and evaluate the BiLSTM model.
    print("正在训练评估双向LSTM模型...")
    # LSTM training needs PAD and UNK added to the two lookup maps.
    bilstm_word2id, bilstm_data2id = extend_maps(
        word2id, data2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        train_split,
        dev_split,
        test_split,
        bilstm_word2id,
        bilstm_data2id,
        crf=False)
コード例 #20
0
'''
@Autor: xujiahuan
@Date: 2020-05-06 11:00:17
@LastEditors: xujiahuan
@LastEditTime: 2020-05-17 15:26:13
'''
from sklearn.externals import joblib
from data import build_corpus
from utils import extend_maps

# Only the training split is loaded here; the dev/test paths are not used.
train_path = 'data/train.txt'
train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(train_path)
bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)

# One hard-coded sentence to tag, paired with a row of 'O' tags of the same
# length.
test_word_lists = [['上', '海', '浦', '东']]
length = len(test_word_lists[0])
temp = ['O'] * length
test_tag_lists = [temp]


def rnn_pred2(test_word_lists, test_tag_lists):
    # NOTE(review): this excerpt stops right after the checkpoint path is
    # set, so how the two arguments are used is not visible here.
    # start = time.time()
    # vocab_size = len(bilstm_word2id)
    # out_size = len(bilstm_tag2id)
    # Relative checkpoint path; presumably the saved RNN model -- confirm.
    path = "ckpts/rnn.pkl"
コード例 #21
0
from utils import load_model, extend_maps, prepocess_data_for_lstmcrf
from data import build_corpus
import glob

# Checkpoint locations of the saved models.
HMM_MODEL_PATH = './ckpts/hmm.pkl'
CRF_MODEL_PATH = './ckpts/crf.pkl'
BiLSTM_MODEL_PATH = './ckpts/bilstm.pkl'
BiLSTMCRF_MODEL_PATH = './ckpts/bilstm_crf.pkl'

# The training split is read only for its vocabulary maps.
print("读取数据...")
train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")

# Load the saved BiLSTM+CRF model together with its extended lookup maps.
print("加载并评估bilstm+crf模型...")
crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning

# Run the model over every .txt file under data/chusai_xuanshou/, treating
# each file as a single sequence of characters.
for item in glob.glob("data/chusai_xuanshou/" + "*.txt"):
    with open(item, encoding="utf-8") as f:
        origin_text = f.read()
    test_word_list = list(origin_text)
    # One "N" tag per character, so both lists have the same length.
    test_tag_list = ["N"] * len(test_word_list)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        [test_word_list], [test_tag_list], test=True)
    lstmcrf_pred = bilstm_model.testA(test_word_lists,
                                      crf_word2id, crf_tag2id)
    print(test_word_list)
コード例 #22
0
def main():
    """Evaluate the saved BiLSTM model on the test split.

    Writes one line of space-separated predictions per test sequence to
    ``test.tgt.dataptr1`` and prints the fraction of compared positions whose
    prediction equals the gold value (``0.0`` when nothing was compared).
    """
    print("读取数据...")
    (train_word_lists, train_data_lists, train_wordlabel_lists,
     train_datalabel_lists, train_dataptr_lists, word2id,
     data2id) = build_corpus("train")
    (dev_word_lists, dev_data_lists, dev_wordlabel_lists,
     dev_datalabel_lists, dev_dataptr_lists) = build_corpus(
         "dev", make_vocab=False)
    (test_word_lists, test_data_lists, test_wordlabel_lists,
     test_datalabel_lists, test_dataptr_lists) = build_corpus(
         "test", make_vocab=False)

    # BiLSTM model
    print("加载并评估bilstm模型...")
    bilstm_word2id, bilstm_data2id = extend_maps(word2id,
                                                 data2id,
                                                 for_crf=False)
    bilstm_model = load_model(BiLSTM_MODEL_PATH)
    bilstm_model.model.bilstm.flatten_parameters()  # remove warning
    lstm_pred, target_tag_list = bilstm_model.test(
        test_word_lists, test_data_lists, test_wordlabel_lists,
        test_datalabel_lists, test_dataptr_lists, bilstm_word2id,
        bilstm_data2id)

    allnum = 0
    correct = 0
    # The context manager closes the output file even when converting or
    # writing a prediction raises part-way through.
    with open('test.tgt.dataptr1', 'w') as f:
        for pred, gold in zip(lstm_pred, target_tag_list):
            # Keep at most len(gold) predicted values for this sequence.
            pred = pred.cpu().numpy().tolist()[:len(gold)]
            f.write(' '.join(str(x) for x in pred) + '\n')
            for x, y in zip(pred, gold):
                if x == y:
                    correct += 1
                allnum += 1
    # TODO
    # Guard the ratio: with no compared positions the division would raise
    # ZeroDivisionError.
    print((correct / allnum) if allnum else 0.0)
コード例 #23
0
def main_rep1(x, y):
    """Train-and-evaluate, or load-and-evaluate, one sequence-labelling model.

    Args:
        x: ``'train'`` trains the selected model and evaluates it; any other
            value loads the selected model from ``./ckpts`` and reports its
            metrics on the test split.
        y: Model selector: ``'crf'``, ``'bilstm'`` or ``'bilstm-crf'``. Any
            other value runs no model.

    Raises:
        SystemExit: Always, once the selected work has finished.
    """
    # Function-scope import keeps this block independent of file-level imports.
    import sys

    # Both modes read the same three splits, so load them once up front.
    print("Read data...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    if x == 'train':
        if y == 'crf':
            crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                                      (test_word_lists, test_tag_lists))
            ensemble_evaluate([crf_pred], test_tag_lists)

        elif y == 'bilstm':
            bilstm_word2id, bilstm_tag2id = extend_maps(word2id,
                                                        tag2id,
                                                        for_crf=False)
            lstm_pred = bilstm_train_and_eval(
                (train_word_lists, train_tag_lists),
                (dev_word_lists, dev_tag_lists),
                (test_word_lists, test_tag_lists),
                bilstm_word2id,
                bilstm_tag2id,
                crf=False)
            ensemble_evaluate([lstm_pred], test_tag_lists)

        elif y == 'bilstm-crf':
            crf_word2id, crf_tag2id = extend_maps(word2id,
                                                  tag2id,
                                                  for_crf=True)
            # more data processing
            train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
                train_word_lists, train_tag_lists)
            dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
                dev_word_lists, dev_tag_lists)
            test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
                test_word_lists, test_tag_lists, test=True)
            lstmcrf_pred = bilstm_train_and_eval(
                (train_word_lists, train_tag_lists),
                (dev_word_lists, dev_tag_lists),
                (test_word_lists, test_tag_lists), crf_word2id, crf_tag2id)
            ensemble_evaluate([lstmcrf_pred], test_tag_lists)

    else:
        CRF_MODEL_PATH = './ckpts/crf.pkl'
        BiLSTM_MODEL_PATH = './ckpts/bilstm.pkl'
        BiLSTMCRF_MODEL_PATH = './ckpts/bilstm_crf.pkl'

        REMOVE_O = False  # Whether to remove the O mark at the time of evaluation

        # (gold tags, predicted tags) of the selected model; stays None when
        # y names no known model, in which case nothing is reported.
        scored = None

        if y == 'crf':
            crf_model = load_model_1(CRF_MODEL_PATH)
            crf_pred = crf_model.test(test_word_lists)
            scored = (test_tag_lists, crf_pred)

        elif y == 'bilstm':
            bilstm_word2id, bilstm_tag2id = extend_maps(word2id,
                                                        tag2id,
                                                        for_crf=False)
            bilstm_model = load_model_1(BiLSTM_MODEL_PATH)
            bilstm_model.model.bilstm.flatten_parameters()  # remove warning
            lstm_pred, target_tag_list = bilstm_model.test(
                test_word_lists, test_tag_lists, bilstm_word2id, bilstm_tag2id)
            scored = (target_tag_list, lstm_pred)

        elif y == 'bilstm-crf':
            crf_word2id, crf_tag2id = extend_maps(word2id,
                                                  tag2id,
                                                  for_crf=True)
            bilstm_model = load_model_1(BiLSTMCRF_MODEL_PATH)
            bilstm_model.model.bilstm.bilstm.flatten_parameters(
            )  # remove warning
            test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
                test_word_lists, test_tag_lists, test=True)
            lstmcrf_pred, target_tag_list = bilstm_model.test(
                test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)
            scored = (target_tag_list, lstmcrf_pred)

        if scored is not None:
            gold_tags, pred_tags = scored
            metrics = Metrics(gold_tags, pred_tags, remove_O=REMOVE_O)
            metrics.report_scores()
            metrics.report_confusion_matrix()

    # sys.exit() instead of the bare exit(): the latter is injected by the
    # site module for interactive use and is not guaranteed to exist.
    sys.exit()
コード例 #24
0
def main():
    """Train the BiLSTM and BiLSTM+CRF models and evaluate their ensemble.

    Training and dev data come from ``build_corpus``; the test data is read
    from ``development_2.txt`` via ``loadDevFile``. The HMM and CRF stages
    are not run in this variant, so the final ensemble is built from the two
    LSTM-based predictions only.
    """
    # Read the data.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    # The test split is taken from the development file instead of
    # build_corpus("test").
    test_word_lists, test_tag_lists, article_id = loadDevFile(
        "development_2.txt")

    # Train and evaluate the BiLSTM model.
    print("正在训练评估双向LSTM模型...")
    # LSTM training needs PAD and UNK added to word2id and tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                      (dev_word_lists, dev_tag_lists),
                                      (test_word_lists, test_tag_lists),
                                      bilstm_word2id,
                                      bilstm_tag2id,
                                      crf=False,
                                      remove_O=True)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The LSTM with a CRF layer additionally needs <start> and <end>
    # (used during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Some extra data processing is also required.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)
    lstmcrf_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                         (dev_word_lists, dev_tag_lists),
                                         (test_word_lists, test_tag_lists),
                                         crf_word2id,
                                         crf_tag2id,
                                         remove_O=True)

    # Only predictions that were actually computed go into the ensemble:
    # hmm_pred and crf_pred are never assigned here, so naming them would
    # raise NameError after both trainings have already finished.
    ensemble_evaluate([lstm_pred, lstmcrf_pred], test_tag_lists)
コード例 #25
0
def main():
    """Evaluate the saved BiLSTM+CRF model and show its SYM-tagged tokens.

    Reports scores and the confusion matrix for the test split, then prints
    the full prediction and target lists followed by the words of the first
    test sequence whose gold tag, respectively predicted tag, is one of
    ``B-SYM``, ``M-SYM`` or ``E-SYM``.
    """
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    print("加载并评估bilstm+crf模型...")
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
    bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)

    lstmcrf_pred, target_tag_list = bilstm_model.test(test_word_lists,
                                                      test_tag_lists,
                                                      crf_word2id, crf_tag2id)
    metrics = Metrics(target_tag_list, lstmcrf_pred, remove_O=REMOVE_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()

    sym_tags = ('B-SYM', 'M-SYM', 'E-SYM')
    first_words = test_word_lists[0]

    # Words of the first test sequence at positions whose gold tag is a SYM
    # tag.
    selected_word = [
        first_words[i] for i, tag in enumerate(test_tag_lists[0])
        if tag in sym_tags
    ]
    # Same selection, driven by the predicted tags of the first sequence.
    selected_predict_word = [
        first_words[i] for i, tag in enumerate(lstmcrf_pred[0])
        if tag in sym_tags
    ]

    print('preditct list:', lstmcrf_pred)
    print('target list:', target_tag_list)
    print(selected_word)
    print(selected_predict_word)
コード例 #26
0
def main():
    """Load the saved models chosen on the command line and report metrics.

    Each of ``--hmm``, ``--crf``, ``--bilstm`` and ``--bilstm-crf`` enables
    the evaluation of the corresponding checkpoint on the test split.
    ``--cbow`` is parsed as well but not consulted in this function.
    """
    import argparse

    parser = argparse.ArgumentParser(description='main.py')
    switches = (
        ('--hmm', 'Test HMM'),
        ('--crf', 'Test CRF'),
        ('--bilstm', 'Test BiLSTM'),
        ('--bilstm-crf', 'Test BiLSTM-CRF'),
        ('--cbow', 'Use CBOW embedding for BiLSTM-CRF'),
    )
    for flag, help_text in switches:
        parser.add_argument(flag,
                            action='store_true',
                            default=False,
                            help=help_text)
    args = parser.parse_args()

    def report(gold_tags, pred_tags):
        """Print the per-tag scores and then the confusion matrix."""
        metrics = Metrics(gold_tags, pred_tags, remove_O=REMOVE_O)
        metrics.report_scores()  # per-tag precision, recall and F1
        metrics.report_confusion_matrix()  # confusion matrix

    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # HMM model
    if args.hmm:
        print("加载并评估hmm模型...")
        hmm_model = load_model(HMM_MODEL_PATH)
        report(test_tag_lists,
               hmm_model.test(test_word_lists, word2id, tag2id))

    # CRF model
    if args.crf:
        print("加载并评估crf模型...")
        crf_model = load_model(CRF_MODEL_PATH)
        report(test_tag_lists, crf_model.test(test_word_lists))

    # BiLSTM model
    if args.bilstm:
        print("加载并评估bilstm模型...")
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        bilstm_model = load_model(BiLSTM_MODEL_PATH)
        bilstm_model.model.bilstm.flatten_parameters()  # remove warning
        lstm_pred, target_tag_list = bilstm_model.test(
            test_word_lists, test_tag_lists, bilstm_word2id, bilstm_tag2id)
        report(target_tag_list, lstm_pred)

    # BiLSTM+CRF model
    if args.bilstm_crf:
        print("加载并评估bilstm+crf模型...")
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
        bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
        test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
            test_word_lists, test_tag_lists, test=True)
        lstmcrf_pred, target_tag_list = bilstm_model.test(
            test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)
        report(target_tag_list, lstmcrf_pred)
コード例 #27
0
'''
@Autor: xujiahuan
@Date: 2020-04-21 20:12:02
@LastEditors: xujiahuan
@LastEditTime: 2020-05-19 19:55:44
'''
from data import build_corpus
from models.crf import CRFModel
from metrics import Metrics
from utils import save_model

# Build the datasets: every split is read with make_vocab=False.
train_path = 'data/train.txt'
dev_path = 'data/dev.txt'
test_path = 'data/test.txt'
((train_word_lists, train_tag_lists),
 (dev_word_lists, dev_tag_lists),
 (test_word_lists, test_tag_lists)) = [
     build_corpus(split_path, make_vocab=False)
     for split_path in (train_path, dev_path, test_path)
 ]


def crf_pred(train_word_lists, train_tag_lists, test_word_lists,
             test_tag_lists):
    """Train a CRF model, save it, and tag the test words.

    The trained model is saved to ``./ckpts/crf.pkl`` and the test words are
    printed before tagging. ``test_tag_lists`` is accepted but not used.

    Returns:
        Whatever ``CRFModel.test`` returns for ``test_word_lists``.
    """
    crf = CRFModel()
    crf.train(train_word_lists, train_tag_lists)
    save_model(crf, "./ckpts/crf.pkl")
    print(test_word_lists)
    return crf.test(test_word_lists)


# Progress message; the text reads "Training CRF...".
print("正在训练CRF...")
コード例 #28
0
def main():
    """Train the models selected on the command line and evaluate them.

    ``--hmm``, ``--crf``, ``--bilstm`` and ``--bilstm-crf`` each enable one
    train-and-evaluate stage. ``--cbow`` combined with ``--bilstm-crf`` loads
    the embedding from ``ckpts/cbow.pkl`` and passes it to that stage;
    ``--cbow`` without ``--bilstm-crf`` trains the CBOW model instead.
    """
    import argparse

    parser = argparse.ArgumentParser(description='main.py')
    switches = (
        ('--hmm', 'Train HMM'),
        ('--crf', 'Train CRF'),
        ('--bilstm', 'Train BiLSTM'),
        ('--bilstm-crf', 'Train BiLSTM-CRF'),
        ('--cbow', 'Train or use CBOW embedding for BiLSTM-CRF'),
    )
    for flag, help_text in switches:
        parser.add_argument(flag,
                            action='store_true',
                            default=False,
                            help=help_text)
    args = parser.parse_args()

    # Read the data.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = \
        build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    # Train and evaluate the HMM model.
    if args.hmm:
        print("正在训练评估HMM模型...")
        hmm_pred = hmm_train_eval(
            (train_word_lists, train_tag_lists),
            (test_word_lists, test_tag_lists),
            word2id,
            tag2id)

    # Train and evaluate the CRF model.
    if args.crf:
        print("正在训练评估CRF模型...")
        crf_pred = crf_train_eval(
            (train_word_lists, train_tag_lists),
            (test_word_lists, test_tag_lists))

    # Train and evaluate the BiLSTM model.
    if args.bilstm:
        print("正在训练评估双向LSTM模型...")
        # LSTM training needs PAD and UNK added to word2id and tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        lstm_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists),
            (test_word_lists, test_tag_lists),
            bilstm_word2id,
            bilstm_tag2id,
            crf=False)

    # The LSTM with a CRF layer additionally needs <start> and <end>
    # (used during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Some extra data processing is also required; it runs regardless of
    # which flags were given.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)

    if args.bilstm_crf:
        print("正在训练评估Bi-LSTM+CRF模型...")
        if args.cbow:
            print('Loading CBOW model')
            cbow_model = load_model('ckpts/cbow.pkl')
            cbow_emb = cbow_model.model.lookup_embedding()
            # Only the embedding is needed from here on.
            del cbow_model
        else:
            cbow_emb = None

        lstmcrf_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists),
            (test_word_lists, test_tag_lists),
            crf_word2id,
            crf_tag2id,
            cbow_emb=cbow_emb)
        return

    # --cbow on its own trains the CBOW model.
    if args.cbow:
        print("正在训练CBOW模型...")
        cbow.CBOW_Model(len(crf_word2id)).train(train_word_lists, crf_word2id)
コード例 #29
0
ファイル: main.py プロジェクト: hjpwhu/Python_170703
                    default=2)
parser.add_argument('--emb_path',
                    dest='embed_path',
                    help='embedding path',
                    default='/home/hjp/Downloads/msc/glove.6B.50d.txt')
parser.add_argument('--data_path',
                    dest='data_path',
                    help='data set path',
                    default='/home/hjp/Downloads/msc/')
args = parser.parse_args()

torch.manual_seed(args.seed)

embed, vocab = data.build_embed(args.embed_path)
print torch.from_numpy(embed['world'])
word2idx, idx2word, trainlbl, testlbl = data.build_corpus(args.data_path)
print word2idx['chief']

trainlabel = torch.FloatTensor(torch.zeros(len(trainlbl), 1))
print trainlabel
for i in range(len(trainlbl)):
    print trainlbl[i]
    if trainlbl[i] == '1':
        trainlabel[i] = 1
    else:
        trainlabel[i] = 0
print trainlabel

testlabel = torch.FloatTensor(torch.zeros(len(testlbl), 1))
print testlabel
for i in range(len(testlbl)):