Exemple #1
0
def train_ner(x_train, y_train, x_valid, y_valid, x_test, y_test,
              sequence_length, epoch, batch_size, bert_model_path,
              model_save_path):
    """Train a BERT-BiLSTM-CRF sequence-labeling model (symptom-internal NER).

    Builds a BERT embedding, fits a BiLSTM-CRF on the training split while
    monitoring both the validation and test splits via callbacks, saves the
    trained model, evaluates it on the test split, and returns it.
    """
    # BERT embedding configured for the sequence-labeling task.
    embedding = BERTEmbedding(bert_model_path,
                              task=kashgari.LABELING,
                              sequence_length=sequence_length)
    model = BiLSTM_CRF_Model(embedding)

    # Evaluate on validation and test data every epoch (step=1).
    monitors = [
        EvalCallBack(kash_model=model, valid_x=x_valid, valid_y=y_valid,
                     step=1),
        EvalCallBack(kash_model=model, valid_x=x_test, valid_y=y_test,
                     step=1),
    ]

    model.fit(x_train,
              y_train,
              x_validate=x_valid,
              y_validate=y_valid,
              epochs=epoch,
              batch_size=batch_size,
              callbacks=monitors)

    model.save(model_save_path)
    model.evaluate(x_test, y_test)
    return model
Exemple #2
0
def main():
    """Load the ChineseDailyNerCorpus test split, build a BERT BiLSTM-CRF
    model, save it, then evaluate it and print its predictions.

    NOTE(review): the training calls are commented out, so the model saved
    and evaluated here is untrained; re-enable the fit() block to train.
    """
    # train_x, train_y = ChineseDailyNerCorpus.load_data("train")
    # valid_x, valid_y = ChineseDailyNerCorpus.load_data("validate")
    # FIX: removed a stray `ChineseDailyNerCorpus.__zip_file__name` statement.
    # It was a no-op attribute access that most likely raises AttributeError
    # at runtime (double-underscore names are mangled inside the class body).
    test_x, test_y = ChineseDailyNerCorpus.load_data("test")

    # print(f"train data count: {len(train_x)}")
    # print(f"validate data count: {len(valid_x)}")
    print(f"test data count: {len(test_x)}")

    bert_embed = BERTEmbedding("models/chinese_L-12_H-768_A-12",
                               task=kashgari.LABELING,
                               sequence_length=100)
    model = BiLSTM_CRF_Model(bert_embed)
    # model.fit(
    #     train_x,
    #     train_y,
    #     x_validate=valid_x,
    #     y_validate=valid_y,
    #     epochs=1,
    #     batch_size=512,
    # )
    model.save("models/ner.h5")
    model.evaluate(test_x, test_y)
    predictions = model.predict_classes(test_x)
    print(predictions)
Exemple #3
0
def train_BERT_BiLSTM_CRF(
        train_test_devide=0.9,
        epoch=20,
        path='/home/peitian_zhang/data/corpus/labeled_train.txt'):
    """Train a BERT-embedding BiLSTM-CRF on a labeled corpus file.

    The first `train_test_devide` fraction of the corpus is used for both
    training and validation; the remainder serves as a held-out test set.
    Returns the trained model.
    """
    train_x, train_y = getTrain(path)
    split = int(len(train_x) * train_test_devide) + 1
    x = train_x[:split]
    y = train_y[:split]

    bert = BERTEmbedding(
        model_folder='/home/peitian_zhang/data/chinese_L-12_H-768_A-12',
        sequence_length=400,
        task=kashgari.LABELING)
    model = BiLSTM_CRF_Model(bert)

    # NOTE: validates on the training slice itself (no separate dev split).
    model.fit(x, y, x, y, epochs=epoch, batch_size=64)

    print('---------evaluate on train---------\n{}'.format(
        model.evaluate(train_x, train_y)))
    print('---------evaluate on test----------\n{}'.format(
        model.evaluate(train_x[split:], train_y[split:])))
    try:
        model.save('/home/peitian_zhang/models/bert_epoch_{}'.format(epoch))
        print('Success in saving!')
    except Exception as exc:
        # FIX: was a bare `except: pass`, which also swallowed
        # KeyboardInterrupt/SystemExit and silently hid save failures.
        print('Saving failed: {}'.format(exc))
    return model
Exemple #4
0
def train_it2(train_path, checkpoint_filepath, model_path, start, span):
    """Train a BiLSTM-CRF tagger on BIO data with a held-out validation slice.

    Examples whose pseudo-random draw falls in [start, start + span) become
    validation data; everything else is used for training.  The RNG seed is
    fixed so the split is reproducible across runs.

    NOTE(review): relies on a module-level `bert_embed` defined elsewhere.
    """
    data_generator = BIODataGenerator(train_path, 100000000)
    Xs, ys = data_generator.forfit().__next__()

    train_x, train_y = [], []
    valid_x, valid_y = [], []
    rng = np.random.RandomState(0)  # fixed seed -> deterministic split
    k = 0
    for x, y in zip(Xs, ys):
        rnum = rng.rand()
        k += 1
        if rnum < start or rnum >= start + span:
            train_x.append(x)
            train_y.append(y)
        else:
            valid_x.append(x)
            valid_y.append(y)
    print('====' * 8)
    print('total = ', k)
    print('start , span = ', (start, span))
    print('len train = ', len(train_x))

    # FIX: os.mkdir fails when intermediate directories are missing or when
    # the directory already exists (exists-check race); makedirs with
    # exist_ok=True handles both cases.
    os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

    model = BiLSTM_CRF_Model(bert_embed, sequence_length=128)
    eval_callback = Evaluator(model, checkpoint_filepath, valid_x, valid_y)
    early_stop = keras.callbacks.EarlyStopping(patience=10)
    reduce_lr_callback = keras.callbacks.ReduceLROnPlateau(factor=0.1,
                                                           patience=5)
    model.fit(train_x,
              train_y,
              valid_x,
              valid_y,
              batch_size=64,
              epochs=20,
              callbacks=[early_stop, eval_callback, reduce_lr_callback])
    model.save(model_path)
Exemple #5
0
def train_it(train_path, checkpoint_filepath, model_path, start, span):
    """Train a BiLSTM-CRF tagger from a tf.data dataset with a held-out
    validation slice.

    Examples whose pseudo-random draw falls in [start, start + span) become
    validation data; the rest are training data.  The RNG seed is fixed so
    the split is reproducible.

    NOTE(review): relies on module-level `bert_embed` and `build_dataset`
    defined elsewhere.
    """
    dataset = build_dataset(train_path)
    train_x, train_y = [], []
    valid_x, valid_y = [], []
    rng = np.random.RandomState(0)  # fixed seed -> deterministic split
    k = 0
    for x, y in dataset.as_numpy_iterator():
        # The dataset yields bytes; decode tokens and labels to str.
        x = [str(i, 'utf-8') for i in x]
        y = [str(i, 'utf-8') for i in y]
        rnum = rng.rand()
        k += 1
        if rnum < start or rnum >= start + span:
            train_x.append(x)
            train_y.append(y)
        else:
            valid_x.append(x)
            valid_y.append(y)
    print('====' * 8)
    print('total = ', k)
    print('start , span = ', (start, span))
    print('len train = ', len(train_x))

    # FIX: os.mkdir fails when intermediate directories are missing or when
    # the directory already exists (exists-check race); makedirs with
    # exist_ok=True handles both cases.
    os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

    model = BiLSTM_CRF_Model(bert_embed, sequence_length=100)
    evaluator = Evaluator(model, checkpoint_filepath, valid_x, valid_y)
    model.fit(train_x,
              train_y,
              valid_x,
              valid_y,
              batch_size=64,
              epochs=20,
              callbacks=[evaluator])
    model.save(model_path)
Exemple #6
0
def train_BiLSTM_CRF(train_test_devide=0.9,
                     epoch=100,
                     path='/home/peitian_zhang/data/corpus/labeled_train.txt'):
    """Train a plain BiLSTM-CRF (no pretrained embedding) on a labeled corpus.

    The first `train_test_devide` fraction of the corpus is used for both
    training and validation; the remainder serves as a held-out test set.
    Returns the trained model.
    """
    train_x, train_y = getTrain(path)
    model = BiLSTM_CRF_Model()

    split = int(len(train_x) * train_test_devide) + 1
    x = train_x[:split]
    y = train_y[:split]

    # NOTE: validates on the training slice itself (no separate dev split).
    model.fit(x, y, x, y, epochs=epoch, batch_size=64)
    print('---------evaluate on train---------\n{}'.format(
        model.evaluate(train_x, train_y)))
    print('---------evaluate on test----------\n{}'.format(
        model.evaluate(train_x[split:], train_y[split:])))
    try:
        model.save('/home/peitian_zhang/models/bert_epoch_{}'.format(epoch))
        print('Success in saving!')
    except Exception as exc:
        # FIX: was a bare `except: pass` that silently hid save failures.
        print('Saving failed: {}'.format(exc))
    return model
    # FIX: removed a large block of unreachable code that followed this
    # return (a TensorBoard/EvalCallBack training run plus a pandas
    # prediction loop ending in a truncated, syntactically invalid
    # `df_out = df_out.append(` statement).
Exemple #8
0
# End-to-end NER training on the Chinese Daily corpus.
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Load the three splits from local CoNLL-style files.
train_x, train_y = ChineseDailyNerCorpus.load_data('./data/train.txt')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('./data/dev.txt')
test_x, test_y = ChineseDailyNerCorpus.load_data('./data/test.txt')

# Chinese BERT embedding for sequence labeling, 100-token sequences.
bert_embed = BERTEmbedding('./chinese_L-12_H-768_A-12',
                           task=kashgari.LABELING,
                           sequence_length=100)

# Alternatives: `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model`,
# or `BiGRU_CRF_Model`.
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x, train_y,
          x_validate=valid_x, y_validate=valid_y,
          epochs=20, batch_size=512)

model.save('saved_ner_model')
Exemple #9
0
            else:
                x.append(rows[0])
                y.append(rows[1])
    return data_x, data_y


# Load the tagged splits and report their sizes.
train_x, train_y = get_sequenct_tagging_data(train_path)
dev_x, dev_y = get_sequenct_tagging_data(dev_path)
test_x, test_y = get_sequenct_tagging_data(test_path)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(dev_x)}")
print(f"test data count: {len(test_x)}")

# BERT embedding configured for sequence labeling.
bert_embed = BERTEmbedding(bert_path,
                           task=kashgari.LABELING,
                           sequence_length=100)

# Build the model and train it.
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x, train_y,
          x_validate=dev_x, y_validate=dev_y,
          epochs=20, batch_size=512)

model.save(model_path)

# Model evaluation on the held-out test split.
model.evaluate(test_x, test_y)
Exemple #10
0
# Tokenize parallel data/label lines into word and tag sequences.
words, labels = [], []

count = 0
for data, label in zip(datafile, labelfile):
    count += 1
    words.append(data.strip().split(' '))
    labels.append(label.strip().split(' '))

# 50/50 train/test split with a fixed seed for reproducibility.
train_x, test_x, train_y, test_y = train_test_split(
    words, labels, test_size=0.5, random_state=50)

# Frozen (non-trainable) English BERT embedding, 20-token sequences.
bert_embed = BERTEmbedding('uncased_L-12_H-768_A-12',
                           trainable=False,
                           task=kashgari.LABELING,
                           sequence_length=20)
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x, train_y,
          x_validate=test_x, y_validate=test_y,
          epochs=35, batch_size=256)

model.save('model_bilstm_crf_35_256_64')

model.evaluate(x_data=test_x, y_data=test_y, batch_size=64, debug_info=True)
# Load the built-in Chinese Daily NER corpus splits.
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('validate')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

import kashgari
from kashgari.embeddings import BERTEmbedding

# Whole-word-masking Chinese BERT, 100-token sequences.
bert_embed = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                           task=kashgari.LABELING,
                           sequence_length=100)

from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Alternatives: `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model`,
# or `BiGRU_CRF_Model`.
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x, train_y,
          x_validate=valid_x, y_validate=valid_y,
          epochs=20, batch_size=512)

model.save('models/org_loc_per_ner.h5')

model.evaluate(test_x, test_y)
Exemple #12
0
# model.evaluate(test_x, test_y)

import matplotlib.pyplot as plt

# Plot training vs. validation accuracy from the training history.
history = logs.history
plt.plot(history['accuracy'])
plt.plot(history['val_accuracy'])
# FIX: the legend previously read ['val_loss'], which mislabeled the two
# plotted accuracy curves; label them correctly instead.
plt.legend(['accuracy', 'val_accuracy'])
plt.title('accuracy')

model.evaluate(test_x, test_y)

# Evaluate the model and print precision / recall / F1.
# model.evaluate(test_x, test_y)

# Save the model under the `recall85_rbt3` directory.
model.save('recall85_rbt3')

# Load a previously saved model:
# loaded_model = kashgari.utils.load_model('ner_model')

# # Predict with the loaded model:
# loaded.predict(test_x[0])

# import pandas as pd
# import kashgari
# import re
# import glob

# # Load the model from this directory:
# loaded_model = kashgari.utils.load_model('drive/Mydrive/final_model')
Exemple #13
0
# Train a BiLSTM-CRF on pre-pickled data using a named BERT checkpoint.
import pickle
import kashgari
from kashgari.embeddings import BertEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
import tensorflow as tf

# data.pickle holds [x_train, x_validation, y_train, y_validation].
with open('data.pickle', 'rb') as f:
    data_dic = pickle.load(f)

x_train, x_validation = data_dic[0], data_dic[1]
y_train, y_validation = data_dic[2], data_dic[3]

embedding = BertEmbedding('bert-base-chinese', sequence_length=128)
model = BiLSTM_CRF_Model(embedding)

model.fit(x_train=x_train,
          y_train=y_train,
          x_validate=x_validation,
          y_validate=y_validation,
          epochs=5,
          batch_size=32)
model.save('Model')
model.evaluate(x_data=x_validation, y_data=y_validation)
Exemple #14
0
# A named-entity-recognition task built on the Bi-LSTM model family:
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model

# Load the built-in dataset; any corpus with the same format works here.
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

# No pretrained embedding: the model learns token embeddings from scratch.
model = BiLSTM_CRF_Model()
model.fit(train_x, train_y, valid_x, valid_y, epochs=1)

model.save("BiLSTM_CRF_Model")
# -*- coding: utf-8 -*-
'''
Train a Chinese NER model whose tag set includes the TIME entity type.
'''
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

print(kashgari.__version__)

# CoNLL-format train / dev / test splits.
train_x, train_y = DataReader().read_conll_format_file(
    'data/data_all/time.train')
valid_x, valid_y = DataReader().read_conll_format_file(
    'data/data_all/time.dev')
test_x, test_y = DataReader().read_conll_format_file('data/data_all/time.test')

# Whole-word-masking Chinese BERT with the default sequence length.
bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING)

model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=64, epochs=5)

model.save('models/time_ner.h5')

model.evaluate(test_x, test_y)
Exemple #16
0
# NER over a combined CoNLL-format corpus, shuffled before training.
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari import utils

# Use the plain (non-CuDNN) RNN cell implementation.
kashgari.config.use_cudnn_cell = False

# Read the three CoNLL-format splits.
train_x, train_y = DataReader().read_conll_format_file('data/data_all/example.train')
valid_x, valid_y = DataReader().read_conll_format_file('data/data_all/example.dev')
test_x, test_y = DataReader().read_conll_format_file('data/data_all/example.test')

# Shuffle features and labels together so pairs stay aligned.
train_x, train_y = utils.unison_shuffled_copies(train_x, train_y)
valid_x, valid_y = utils.unison_shuffled_copies(valid_x, valid_y)
test_x, test_y = utils.unison_shuffled_copies(test_x, test_y)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=100)

model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y,
          valid_x, valid_y,
          batch_size=512, epochs=20)

model.save('models/all_ner.h5')

model.evaluate(test_x, test_y)
Exemple #17
0
# Train with early-stop / checkpoint callbacks (defined elsewhere in file).
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x, train_y,
          x_validate=valid_x, y_validate=valid_y,
          callbacks=[stop_callback, save_callback],
          batch_size=250, epochs=25)


# Evaluate the model; this prints a detailed validation report.
model.evaluate(test_x, test_y)

# Save the model under the `5_29_1` directory.
model.save('5_29_1')

# Reload the saved model.
loaded_model = kashgari.utils.load_model('5_29_1')

# Predict with the reloaded model.
loaded_model.predict(test_x[:100])




# print(len(all_sen), len(aspectTerm_t), len(all_ans))
# # 存成txt檔
# with open("/content/drive/My Drive/tibame/ans.txt","w") as f:
#         f.write(str(all_ans))
# with open("/content/drive/My Drive/tibame/sen.txt","w") as f:
# -*- coding: utf-8 -*-
# time: 2019-08-09 16:47
# place: Zhichunlu Beijing

# Train a TIME-entity NER model from local CoNLL-format files.
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

train_x, train_y = DataReader().read_conll_format_file('./data/time.train')
valid_x, valid_y = DataReader().read_conll_format_file('./data/time.dev')
test_x, test_y = DataReader().read_conll_format_file('./data/time.test')

# Whole-word-masking Chinese BERT, 128-token sequences.
bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)

model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y,
          valid_x, valid_y,
          batch_size=16, epochs=10)

model.save('time_ner.h5')

model.evaluate(test_x, test_y)
# Load the pickled test split (train/valid are loaded earlier in the file).
with open("data_test.pkl", "rb") as f:
    x_test, y_test = pickle.load(f)

# Normalize every sequence to a plain list.
x_train = [list(seq) for seq in x_train]
y_train = [list(seq) for seq in y_train]
x_valid = [list(seq) for seq in x_valid]
y_valid = [list(seq) for seq in y_valid]
x_test = [list(seq) for seq in x_test]
y_test = [list(seq) for seq in y_test]
# Skip testing for now: fold the test split into the training data.
x_train, y_train = x_train + x_test, y_train + y_test

model_dir = 'bert_tagger'
log_dir = os.path.join(model_dir, 'logs')
weights_path = os.path.join(log_dir, 'weights.h5')
BERT_PATH = '/mnt/DATA/data/embeddings/uncased_L-12_H-768_A-12'
EARLY_STOP = 10

bert_embed = BERTEmbedding(BERT_PATH, task=kashgari.LABELING)
model = BiLSTM_CRF_Model(bert_embed)
training_callbacks = [
    TensorBoard(log_dir=log_dir, write_graph=False),
    ModelCheckpoint(weights_path, save_weights_only=True),
    ReduceLROnPlateau(),
]
model.fit(x_train,
          y_train,
          x_valid,
          y_valid,
          epochs=10,
          batch_size=64,
          callbacks=training_callbacks)
print('Saving the model...')
model.save(model_dir)

from kashgari.utils import load_model