Example #1
0
def main():
    """Train and evaluate the NER model selected with ``--model``.

    Loads the train/dev/test corpora through ``build_corpus`` and then runs
    the train-and-evaluate routine matching the requested model name.

    ``--model`` is validated by argparse (``choices``), so an unsupported
    name is rejected with a usage error up front instead of loading all the
    data and then silently doing nothing.
    """
    supported_models = ("HMM", "CRF", "BILSTM", "BILSTM-CRF", "ENSEMBLE")

    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        default="BILSTM",
                        choices=supported_models,
                        help="model in [HMM,CRF,BILSTM,BILSTM-CRF,ENSEMBLE]",
                        type=str)

    params = vars(parser.parse_args())

    # Load the data; only the training split also returns the vocabularies.
    print("读取数据...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    if params["model"] == "HMM":
        # Train and evaluate the HMM model.
        print("正在训练评估HMM模型...")
        hmm_pred = hmm_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists), word2id,
                                  tag2id)
    elif params["model"] == "CRF":
        # Train and evaluate the CRF model.
        print("正在训练评估CRF模型...")
        crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                                  (test_word_lists, test_tag_lists))
    elif params["model"] == "BILSTM":
        # Train and evaluate the BiLSTM model.
        print("正在训练评估BI-LSTM模型...")
        # The LSTM model needs PAD and UNK added to word2id and tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(word2id,
                                                    tag2id,
                                                    for_crf=False)
        lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                          (dev_word_lists, dev_tag_lists),
                                          (test_word_lists, test_tag_lists),
                                          bilstm_word2id,
                                          bilstm_tag2id,
                                          crf=False)
    elif params["model"] == "BILSTM-CRF":
        print("正在训练评估Bi-LSTM+CRF模型...")
        # The LSTM with a CRF layer additionally needs <start> and <end>
        # in the maps (they are used during decoding).
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        # Extra data preprocessing required by the BiLSTM+CRF model.
        train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
            train_word_lists, train_tag_lists)
        dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
            dev_word_lists, dev_tag_lists)
        test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
            test_word_lists, test_tag_lists, test=True)
        lstmcrf_pred = bilstm_train_and_eval(
            (train_word_lists, train_tag_lists),
            (dev_word_lists, dev_tag_lists), (test_word_lists, test_tag_lists),
            crf_word2id, crf_tag2id)
    elif params["model"] == "ENSEMBLE":
        # Model ensembling.
        # NOTE(review): this branch only prints a blank line; ensembling
        # appears to be unimplemented here -- confirm.
        print()
Example #2
0
def main():
    """Train and evaluate all four taggers, then evaluate their ensemble.

    Runs, in order, the HMM, CRF, BiLSTM and BiLSTM+CRF train-and-evaluate
    routines on the corpus stored under ``./data123`` and finally passes the
    four predictions to ``ensemble_evaluate``.
    """
    # Load the data; only the training split also returns the vocabularies.
    print("读取数据...")
    corpus_dir = "./data123"
    train_words, train_tags, word2id, tag2id = build_corpus(
        "train", data_dir=corpus_dir)
    dev_words, dev_tags = build_corpus(
        "dev", make_vocab=False, data_dir=corpus_dir)
    test_words, test_tags = build_corpus(
        "test", make_vocab=False, data_dir=corpus_dir)

    # (words, tags) pairs shared by the first three models.
    train_set = (train_words, train_tags)
    dev_set = (dev_words, dev_tags)
    test_set = (test_words, test_tags)

    # Train and evaluate the HMM model.
    print("正在训练评估HMM模型...")
    hmm_pred = hmm_train_eval(train_set, test_set, word2id, tag2id)

    # Train and evaluate the CRF model.
    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval(train_set, test_set)

    # Train and evaluate the BiLSTM model.
    print("正在训练评估双向LSTM模型...")
    # The LSTM model needs PAD and UNK added to word2id and tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        train_set,
        dev_set,
        test_set,
        bilstm_word2id,
        bilstm_tag2id,
        crf=False,
    )

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The LSTM with a CRF layer additionally needs <start> and <end>
    # in the maps (they are used during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra data preprocessing required by the BiLSTM+CRF model.
    crf_train_words, crf_train_tags = prepocess_data_for_lstmcrf(
        train_words, train_tags)
    crf_dev_words, crf_dev_tags = prepocess_data_for_lstmcrf(
        dev_words, dev_tags)
    crf_test_words, crf_test_tags = prepocess_data_for_lstmcrf(
        test_words, test_tags, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (crf_train_words, crf_train_tags),
        (crf_dev_words, crf_dev_tags),
        (crf_test_words, crf_test_tags),
        crf_word2id,
        crf_tag2id,
    )

    # The ensemble is scored against the test tags returned by the
    # preprocessing step above.
    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, crf_test_tags)
Example #3
0
def main():
    """Train and evaluate the models, then evaluate their ensemble.

    Runs the HMM, CRF, BiLSTM and BiLSTM+CRF train-and-evaluate routines in
    that order and finally hands the four predictions to
    ``ensemble_evaluate``.
    """
    # Load the data; only the training split also returns the vocabularies.
    print("读取数据中...")
    train_words, train_tags, word2id, tag2id = build_corpus("train")
    dev_words, dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    # (words, tags) pairs shared by the first three models.
    train_set = (train_words, train_tags)
    dev_set = (dev_words, dev_tags)
    test_set = (test_words, test_tags)

    # Train and evaluate the HMM model.
    print("正在训练评估HMM模型")
    hmm_pred = hmm_train_eval(train_set, test_set, word2id, tag2id)

    # Train and evaluate the CRF model.
    crf_pred = crf_train_eval(train_set, test_set)

    # Train and evaluate the BiLSTM model.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        train_set,
        dev_set,
        test_set,
        bilstm_word2id,
        bilstm_tag2id,
        crf=False,
    )

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The LSTM with a CRF layer additionally needs <start> and <end>
    # in the maps (they are used during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Show the tag names (the keys of the extended tag map).
    tag_names = [tag for tag, _ in crf_tag2id.items()]
    print(' '.join(tag_names))
    # Extra data preprocessing required by the BiLSTM+CRF model.
    crf_train_words, crf_train_tags = prepocess_data_for_lstmcrf(
        train_words, train_tags)
    crf_dev_words, crf_dev_tags = prepocess_data_for_lstmcrf(
        dev_words, dev_tags)
    crf_test_words, crf_test_tags = prepocess_data_for_lstmcrf(
        test_words, test_tags, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (crf_train_words, crf_train_tags),
        (crf_dev_words, crf_dev_tags),
        (crf_test_words, crf_test_tags),
        crf_word2id,
        crf_tag2id,
    )

    # The ensemble is scored against the test tags returned by the
    # preprocessing step above.
    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, crf_test_tags)
Example #4
0
def main():
    """Print a dataset notice, load the corpora and train/evaluate the CRF."""
    # Notice shown to the user (kept verbatim, it is runtime output). In
    # English: the dataset provided by the teacher was not used because the
    # O tag dominates it (92.77%) and training results were poor; the new
    # dataset comes from the GitHub repository named in the notice.
    notice_lines = (
        "",
        "        ####",
        "        没有使用老师提供的数据集,O标签太多(占比92.77%),模型训练效果不好",
        "        新数据集取自",
        "        https://github.com/luopeixiang/named_entity_recognition",
        "        ####",
    )
    text = "\n".join(notice_lines)
    print(text, '\n')

    # Load the data; only the training split also returns the vocabularies.
    print("读取数据...\n")
    train_words, train_tags, word2id, tag2id = build_corpus("train")
    dev_words, dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    # Train and evaluate the CRF model.
    print("训练并评估CRF模型...\n")
    train_set = (train_words, train_tags)
    test_set = (test_words, test_tags)
    crf_pred = crf_train_eval(train_set, test_set)
Example #5
0
def main():
    """Train and evaluate the CRF model on the corpus under ``./Drug``.

    Only the train and test splits are loaded; the dev split and the HMM
    steps are disabled (left commented out below).
    """
    # Load the data; only the training split also returns the vocabularies.
    print("读取数据...")
    drug_dir = "./Drug"
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus(
        "train", data_dir=drug_dir)
    # dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False, data_dir="./Drug")
    test_word_lists, test_tag_lists = build_corpus(
        "test", make_vocab=False, data_dir=drug_dir)

    # # Train and evaluate the HMM model
    # print("正在训练评估HMM模型...")
    # hmm_pred = hmm_train_eval(
    #     (train_word_lists, train_tag_lists),
    #     (test_word_lists, test_tag_lists),
    #     word2id,
    #     tag2id,
    #     remove_O = True
    # )

    # # Evaluate a saved HMM model
    # print("正在评估HMM模型...")
    # hmm_model = load_model("./ckpts/hmm.pkl")
    # hmm_pred = hmm_eval(
    #     hmm_model,
    #     (test_word_lists, test_tag_lists),
    #     word2id,
    #     tag2id,
    #     remove_O = True
    # )

    # Train and evaluate the CRF model.
    print("正在训练评估CRF模型...")
    train_data = (train_word_lists, train_tag_lists)
    test_data = (test_word_lists, test_tag_lists)
    crf_pred = crf_train_eval(train_data, test_data)
Example #6
0
def main(args):
    """Train and evaluate all four taggers, then evaluate their ensemble.

    Args:
        args: Object with a ``name`` attribute; it names the run directory
            ``ckpts/<name>``, which is passed to every training routine.
    """
    run_dir = os.path.join('ckpts', args.name)

    # Create the run directory on first use (and only then set its mode),
    # then snapshot the model configuration next to the outputs.
    if not os.path.isdir(run_dir):
        os.makedirs(run_dir)
        os.chmod(run_dir, 0o775)
    shutil.copy2('models/config.py', run_dir)

    # Load the data; only the training split also returns the vocabularies.
    print("读取数据...")
    train_words, train_tags, word2id, tag2id = build_corpus(
        "train", fix_length=-1)
    dev_words, dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    # (words, tags) pairs shared by the first three models.
    train_set = (train_words, train_tags)
    dev_set = (dev_words, dev_tags)
    test_set = (test_words, test_tags)

    # Train and evaluate the HMM model.
    print("正在训练评估HMM模型...")
    hmm_pred = hmm_train_eval(train_set, test_set, word2id, tag2id, run_dir)

    # Train and evaluate the CRF model.
    print("正在训练评估CRF模型...")
    crf_pred = crf_train_eval(train_set, test_set, run_dir)

    # Train and evaluate the BiLSTM model.
    print("正在训练评估双向LSTM模型...")
    # The LSTM model needs PAD and UNK added to word2id and tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
    lstm_pred = bilstm_train_and_eval(
        train_set, dev_set, test_set,
        bilstm_word2id, bilstm_tag2id,
        run_dir,
        crf=False)

    print("正在训练评估Bi-LSTM+CRF模型...")
    # The LSTM with a CRF layer additionally needs <start> and <end>
    # in the maps (they are used during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra data preprocessing required by the BiLSTM+CRF model.
    crf_train_words, crf_train_tags = prepocess_data_for_lstmcrf(
        train_words, train_tags)
    crf_dev_words, crf_dev_tags = prepocess_data_for_lstmcrf(
        dev_words, dev_tags)
    crf_test_words, crf_test_tags = prepocess_data_for_lstmcrf(
        test_words, test_tags, test=True)
    lstmcrf_pred = bilstm_train_and_eval(
        (crf_train_words, crf_train_tags),
        (crf_dev_words, crf_dev_tags),
        (crf_test_words, crf_test_tags),
        crf_word2id, crf_tag2id,
        run_dir)

    # The ensemble is scored against the test tags returned by the
    # preprocessing step above.
    all_predictions = [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred]
    ensemble_evaluate(all_predictions, crf_test_tags)
Example #7
0
def main():
    """Train and evaluate the models enabled by the switches below.

    Each enabled model appends its prediction to a list that is handed to
    ``ensemble_evaluate`` when the ensemble switch is on.
    """
    # Model selection switches.
    run_hmm = False
    run_crf = False
    run_bilstm = False
    run_bilstm_crf = True
    run_ensemble = False
    predictions = []

    # Load the data; only the training split also returns the vocabularies.
    print("Reading data:")
    corpus_dir = "./datasets/FA_NER_Data_IOB"
    train_words, train_tags, word2id, tag2id = build_corpus(
        "train", data_dir=corpus_dir)
    dev_words, dev_tags = build_corpus(
        "dev", make_vocab=False, data_dir=corpus_dir)
    test_words, test_tags = build_corpus(
        "test", make_vocab=False, data_dir=corpus_dir)
    print("len(train_word_lists):", len(train_words))
    print("len(word2id=vocab):", len(word2id))

    if run_hmm:
        # Train and evaluate the HMM model.
        print("Training and Evaluating HMM model:")
        hmm_pred = hmm_train_eval(
            (train_words, train_tags), (test_words, test_tags),
            word2id, tag2id)
        predictions.append(hmm_pred)

    if run_crf:
        # Train and evaluate the CRF model.
        print("Training and evaluating CRF model:")
        crf_pred = crf_train_eval(
            (train_words, train_tags), (test_words, test_tags))
        predictions.append(crf_pred)

    if run_bilstm:
        # Train and evaluate the BiLSTM model.
        print("Training and evaluating Bi-LSTM model:")
        # The LSTM model needs 'PAD' and 'UNK' in word2id and tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        lstm_pred = bilstm_train_and_eval(
            (train_words, train_tags),
            (dev_words, dev_tags),
            (test_words, test_tags),
            bilstm_word2id,
            bilstm_tag2id,
            crf=False)
        predictions.append(lstm_pred)

    if run_bilstm_crf:
        # Train and evaluate the BiLSTM+CRF model.
        print("Training and evaluating Bi-LSTM-CRF model:")
        # The LSTM with a CRF layer additionally needs <start> and <end>
        # in the maps (they are used during decoding).
        crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
        # Extra data preprocessing; the split variables are rebound on
        # purpose so the ensemble step below sees the processed test tags.
        train_words, train_tags = prepocess_data_for_lstmcrf(
            train_words, train_tags)
        dev_words, dev_tags = prepocess_data_for_lstmcrf(dev_words, dev_tags)
        test_words, test_tags = prepocess_data_for_lstmcrf(
            test_words, test_tags, test=True)
        lstmcrf_pred = bilstm_train_and_eval(
            (train_words, train_tags),
            (dev_words, dev_tags),
            (test_words, test_tags),
            crf_word2id,
            crf_tag2id,
            remove_O=False,
            reload_model=True)
        predictions.append(lstmcrf_pred)

    if run_ensemble:
        ensemble_evaluate(predictions, test_tags)
def main():
    """Train and evaluate the models selected by command-line flags.

    Flags: ``--hmm``, ``--crf``, ``--bilstm``, ``--bilstm-crf`` and
    ``--cbow``. With ``--bilstm-crf`` the BiLSTM+CRF model is trained (using
    the saved CBOW embedding when ``--cbow`` is also given); with ``--cbow``
    alone the CBOW model itself is trained.
    """
    import argparse

    parser = argparse.ArgumentParser(description='main.py')
    # All options are plain on/off switches, registered in this order.
    switches = (
        ('--hmm', 'Train HMM'),
        ('--crf', 'Train CRF'),
        ('--bilstm', 'Train BiLSTM'),
        ('--bilstm-crf', 'Train BiLSTM-CRF'),
        ('--cbow', 'Train or use CBOW embedding for BiLSTM-CRF'),
    )
    for flag, description in switches:
        parser.add_argument(flag,
                            action='store_true',
                            default=False,
                            help=description)
    args = parser.parse_args()

    # Load the data; only the training split also returns the vocabularies.
    print("读取数据...")
    train_words, train_tags, word2id, tag2id = build_corpus("train")
    dev_words, dev_tags = build_corpus("dev", make_vocab=False)
    test_words, test_tags = build_corpus("test", make_vocab=False)

    # Train and evaluate the HMM model.
    if args.hmm:
        print("正在训练评估HMM模型...")
        hmm_pred = hmm_train_eval(
            (train_words, train_tags), (test_words, test_tags),
            word2id, tag2id)

    # Train and evaluate the CRF model.
    if args.crf:
        print("正在训练评估CRF模型...")
        crf_pred = crf_train_eval(
            (train_words, train_tags), (test_words, test_tags))

    # Train and evaluate the BiLSTM model.
    if args.bilstm:
        print("正在训练评估双向LSTM模型...")
        # The LSTM model needs PAD and UNK added to word2id and tag2id.
        bilstm_word2id, bilstm_tag2id = extend_maps(
            word2id, tag2id, for_crf=False)
        lstm_pred = bilstm_train_and_eval(
            (train_words, train_tags),
            (dev_words, dev_tags),
            (test_words, test_tags),
            bilstm_word2id,
            bilstm_tag2id,
            crf=False)

    # The LSTM with a CRF layer additionally needs <start> and <end>
    # in the maps (they are used during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Extra data preprocessing; done unconditionally because both branches
    # below consume its output.
    crf_train_words, crf_train_tags = prepocess_data_for_lstmcrf(
        train_words, train_tags)
    crf_dev_words, crf_dev_tags = prepocess_data_for_lstmcrf(
        dev_words, dev_tags)
    crf_test_words, crf_test_tags = prepocess_data_for_lstmcrf(
        test_words, test_tags, test=True)

    if args.bilstm_crf:
        print("正在训练评估Bi-LSTM+CRF模型...")
        cbow_emb = None
        if args.cbow:
            # Reuse the embedding of a previously saved CBOW model; the
            # model object itself is dropped once the embedding is taken.
            print('Loading CBOW model')
            cbow_model = load_model('ckpts/cbow.pkl')
            cbow_emb = cbow_model.model.lookup_embedding()
            del cbow_model

        lstmcrf_pred = bilstm_train_and_eval(
            (crf_train_words, crf_train_tags),
            (crf_dev_words, crf_dev_tags),
            (crf_test_words, crf_test_tags),
            crf_word2id,
            crf_tag2id,
            cbow_emb=cbow_emb)

    elif args.cbow:
        # --cbow without --bilstm-crf: train the CBOW model itself.
        print("正在训练CBOW模型...")
        cbow_trainer = cbow.CBOW_Model(len(crf_word2id))
        cbow_trainer.train(crf_train_words, crf_word2id)
def main_rep1(x, y):
    """Train or evaluate a single NER model, then terminate the process.

    Args:
        x: Mode selector. ``'train'`` trains the model chosen by ``y`` and
            hands its prediction to ``ensemble_evaluate``; any other value
            loads a saved checkpoint and reports its metrics instead.
        y: Model selector, one of ``'crf'``, ``'bilstm'`` or
            ``'bilstm-crf'``. Any other value only loads the data before
            the process exits.

    Raises:
        SystemExit: Always, once the selected work is done.
    """
    # Function-scope import so this block does not rely on a file-level one.
    import sys

    # Both modes start from the same three corpus splits, so load them once.
    print("Read data...")
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
    test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

    if x == 'train':
        if y == 'crf':
            crf_pred = crf_train_eval((train_word_lists, train_tag_lists),
                                      (test_word_lists, test_tag_lists))
            ensemble_evaluate([crf_pred], test_tag_lists)

        elif y == 'bilstm':
            bilstm_word2id, bilstm_tag2id = extend_maps(word2id,
                                                        tag2id,
                                                        for_crf=False)
            lstm_pred = bilstm_train_and_eval(
                (train_word_lists, train_tag_lists),
                (dev_word_lists, dev_tag_lists),
                (test_word_lists, test_tag_lists),
                bilstm_word2id,
                bilstm_tag2id,
                crf=False)
            ensemble_evaluate([lstm_pred], test_tag_lists)

        elif y == 'bilstm-crf':
            crf_word2id, crf_tag2id = extend_maps(word2id,
                                                  tag2id,
                                                  for_crf=True)
            # Extra data preprocessing required by the BiLSTM+CRF model.
            train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
                train_word_lists, train_tag_lists)
            dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
                dev_word_lists, dev_tag_lists)
            test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
                test_word_lists, test_tag_lists, test=True)
            lstmcrf_pred = bilstm_train_and_eval(
                (train_word_lists, train_tag_lists),
                (dev_word_lists, dev_tag_lists),
                (test_word_lists, test_tag_lists), crf_word2id, crf_tag2id)
            ensemble_evaluate([lstmcrf_pred], test_tag_lists)

    else:
        # Checkpoints read by the evaluation-only mode.
        crf_model_path = './ckpts/crf.pkl'
        bilstm_model_path = './ckpts/bilstm.pkl'
        bilstm_crf_model_path = './ckpts/bilstm_crf.pkl'

        # Whether to remove the O mark at the time of evaluation.
        remove_o = False

        def report(target_tags, predicted_tags):
            """Print the scores and the confusion matrix for one model."""
            metrics = Metrics(target_tags, predicted_tags, remove_O=remove_o)
            metrics.report_scores()
            metrics.report_confusion_matrix()

        if y == 'crf':
            crf_model = load_model_1(crf_model_path)
            crf_pred = crf_model.test(test_word_lists)
            report(test_tag_lists, crf_pred)

        elif y == 'bilstm':
            bilstm_word2id, bilstm_tag2id = extend_maps(word2id,
                                                        tag2id,
                                                        for_crf=False)
            bilstm_model = load_model_1(bilstm_model_path)
            bilstm_model.model.bilstm.flatten_parameters()  # remove warning
            lstm_pred, target_tag_list = bilstm_model.test(
                test_word_lists, test_tag_lists, bilstm_word2id, bilstm_tag2id)
            report(target_tag_list, lstm_pred)

        elif y == 'bilstm-crf':
            crf_word2id, crf_tag2id = extend_maps(word2id,
                                                  tag2id,
                                                  for_crf=True)
            bilstm_model = load_model_1(bilstm_crf_model_path)
            # remove warning
            bilstm_model.model.bilstm.bilstm.flatten_parameters()
            test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
                test_word_lists, test_tag_lists, test=True)
            lstmcrf_pred, target_tag_list = bilstm_model.test(
                test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)
            report(target_tag_list, lstmcrf_pred)

    # sys.exit() instead of the bare exit() helper: the latter is injected
    # by the site module for interactive use and is missing under
    # ``python -S``.
    sys.exit()