Example #1
def train_ner(x_train, y_train, x_valid, y_valid, x_test, y_test,
              sequence_length, epoch, batch_size, bert_model_path,
              model_save_path):
    """
    Train a BERT-BiLSTM-CRF model to extract symptom-internal features.
    """
    bert_embedding = BERTEmbedding(bert_model_path,
                                   task=kashgari.LABELING,
                                   sequence_length=sequence_length)

    model = BiLSTM_CRF_Model(bert_embedding)

    eval_callback_val = EvalCallBack(kash_model=model,
                                     valid_x=x_valid,
                                     valid_y=y_valid,
                                     step=1)

    eval_callback_test = EvalCallBack(kash_model=model,
                                      valid_x=x_test,
                                      valid_y=y_test,
                                      step=1)

    model.fit(x_train,
              y_train,
              x_validate=x_valid,
              y_validate=y_valid,
              epochs=epoch,
              batch_size=batch_size,
              callbacks=[eval_callback_val, eval_callback_test])

    model.save(model_save_path)

    model.evaluate(x_test, y_test)

    return model
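A minimal sketch of how this helper might be driven. The load_dataset loader and all paths below are placeholders, not part of the original example; the BERT checkpoint name is borrowed from Example #2.

# Hypothetical driver for train_ner; loader and paths are assumptions.
if __name__ == '__main__':
    x_train, y_train = load_dataset('data/train.txt')  # placeholder loader and path
    x_valid, y_valid = load_dataset('data/valid.txt')
    x_test, y_test = load_dataset('data/test.txt')

    train_ner(x_train, y_train, x_valid, y_valid, x_test, y_test,
              sequence_length=128,
              epoch=10,
              batch_size=64,
              bert_model_path='chinese_wwm_ext_L-12_H-768_A-12',
              model_save_path='ner_model')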
Example #2
def main():
    train_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_train.utf8'
    dev_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_dev.utf8'
    test_path = '/home/qianlang/WordSeg-master/Data/test/data_generate_test.utf8'
    # dev_path = r'D:\Pycharm\Project\data_analyze\Data_processing\data_generate\data_generate_dev.utf8'

    train_x, train_y = load_dataset(train_path)
    dev_x, dev_y = load_dataset(dev_path)
    test_x, test_y = load_dataset(test_path)

    bert_embed = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)

    model = BiGRU_CRF_Model(bert_embed)

    tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs',
                                                    update_freq=1000)

    # Built-in callback that prints precision, recall and F1 during training
    eval_callback = EvalCallBack(kash_model=model,
                                 valid_x=dev_x,
                                 valid_y=dev_y,
                                 step=5)

    model.fit(train_x,
              train_y,
              dev_x,
              dev_y,
              batch_size=256,
              callbacks=[eval_callback, tf_board_callback])

    model.evaluate(test_x, test_y)

    model.save('cws.h5')
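For inference after training, a hedged sketch of reloading the saved segmenter; kashgari.utils.load_model is the same loader used in Example #7, and the sample sentence is illustrative only.

import kashgari

# Reload the saved model and tag one sample sentence (characters in, labels out).
loaded_model = kashgari.utils.load_model('cws.h5')
sample = list('今天天气不错')  # illustrative input
print(loaded_model.predict([sample]))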
Example #3
    def test_predict_and_callback(self):
        from kashgari.corpus import ChineseDailyNerCorpus
        from kashgari.callbacks import EvalCallBack

        train_x, train_y = ChineseDailyNerCorpus.load_data('train')
        valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

        model = BiGRU_Model(sequence_length=10)

        eval_callback = EvalCallBack(kash_model=model,
                                     x_data=valid_x[:200],
                                     y_data=valid_y[:200],
                                     truncating=True,
                                     step=1)

        model.fit(train_x[:300], train_y[:300],
                  valid_x[:200], valid_y[:200],
                  epochs=1,
                  callbacks=[eval_callback])
        response = model.predict(train_x[:200], truncating=True)
        lengths = [len(i) for i in response]
        assert all([(i <= 10) for i in lengths])

        response = model.predict(train_x[:200])
        lengths = [len(i) for i in response]
        assert not all([(i <= 10) for i in lengths])
Example #4
def main():
    # Note: adjust these paths for your environment
    train_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_train.utf8'
    dev_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_dev.utf8'
    test_path = '/home/qianlang/WordSeg-master/Data/test/data_generate_test.utf8'
    # dev_path = r'D:\Pycharm\Project\data_analyze\Data_processing\data_generate\data_generate_dev.utf8'

    train_x, train_y = load_dataset(train_path)
    dev_x, dev_y = load_dataset(dev_path)
    test_x, test_y = load_dataset(test_path)

    #embed = WordEmbedding("/home/qianlang/CWS_CRF/word2vec/news_12g_baidubaike_20g_novel_90g_embedding_64.bin")
    #embed.analyze_corpus(x_data, y_data)

    #model = CNN_LSTM_Model(sequence_length=128)
    hyper = BiGRU_Model.default_hyper_parameters()
    hyper['layer_dropout']['rate'] = 0.2
    model = BiGRU_Model(hyper_parameters=hyper)

    tf_board_callback = keras.callbacks.TensorBoard(
        log_dir='./logs/bi_gru_aug',
        histogram_freq=1,
        write_graph=True,
        write_images=False,
        embeddings_freq=1,
        embeddings_layer_names=None,
        embeddings_metadata=None,
        update_freq=1000)

    # Built-in callback that prints precision, recall and F1 during training
    eval_callback = EvalCallBack(kash_model=model,
                                 x_data=dev_x,
                                 y_data=dev_y,
                                 truncating=True,
                                 step=2)

    model.fit(train_x,
              train_y,
              dev_x,
              dev_y,
              batch_size=128,
              epochs=10,
              callbacks=[eval_callback, tf_board_callback])

    model.evaluate(test_x, test_y)

    model.save('bi_gru_aug.h5')
Example #5
def main():
    # Note: adjust the corresponding paths for your environment
    train_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_train.utf8'
    dev_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_dev.utf8'
    test_path = '/home/qianlang/WordSeg-master/Data/test/data_generate_test.utf8'
    # dev_path = r'D:\Pycharm\Project\data_analyze\Data_processing\data_generate\data_generate_dev.utf8'

    train_x, train_y = load_dataset(train_path)
    dev_x, dev_y = load_dataset(dev_path)
    test_x, test_y = load_dataset(test_path)

    bert_embed = BertEmbedding('chinese_wwm_ext_L-12_H-768_A-12')

    model = BiGRU_CRF_Model(bert_embed, sequence_length=128)

    tf_board_callback = keras.callbacks.TensorBoard(
        log_dir='./logs/bert_bigru_crf1',
        histogram_freq=1,
        write_graph=True,
        write_images=False,
        embeddings_freq=1,
        embeddings_layer_names=None,
        embeddings_metadata=None,
        update_freq=1000)

    # Built-in callback that prints precision, recall and F1 during training
    eval_callback = EvalCallBack(kash_model=model,
                                 x_data=dev_x,
                                 y_data=dev_y,
                                 truncating=True,
                                 step=1)

    model.fit(train_x,
              train_y,
              dev_x,
              dev_y,
              batch_size=128,
              epochs=20,
              callbacks=[eval_callback, tf_board_callback])

    model.evaluate(test_x, test_y)

    model.save('cws_wwm_bert_bigru_crf.h5')
Example #6
    def run_with_model_class(self, model_class: Type[ABCLabelingModel], epochs: int):
        bert_path = get_bert_path()

        train_x, train_y = ChineseDailyNerCorpus.load_data('train')
        valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')
        test_x, test_y = ChineseDailyNerCorpus.load_data('test')

        bert_embed = BertEmbedding(bert_path)
        model = model_class(bert_embed)

        log_path = os.path.join(log_root, model_class.__name__)
        file_writer = tf.summary.create_file_writer(log_path + "/metrics")
        file_writer.set_as_default()
        callbacks = [EvalCallBack(model, test_x, test_y, step=1, truncating=True)]
        # callbacks = []
        model.fit(train_x, train_y, valid_x, valid_y, epochs=epochs, callbacks=callbacks)

        report = model.evaluate(test_x, test_y)
        del model
        del bert_embed
        return report
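A sketch of how this helper might be invoked from a test in the same class; BiLSTM_CRF_Model and the single epoch are assumptions chosen only to keep the run short.

    # Hypothetical test that exercises the helper with one concrete model class.
    def test_bilstm_crf_model(self):
        report = self.run_with_model_class(BiLSTM_CRF_Model, epochs=1)
        assert report is not None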
Example #7
    #                             sequence_length = 150)
    ''' Word2Vec Embeddings '''
    word2vec_embedding = kashgari.embeddings.WordEmbedding(
        w2v_path="word2vec.model",
        task=kashgari.LABELING,
        w2v_kwargs={
            'binary': True,
            'unicode_errors': 'ignore'
        },
        sequence_length='auto')
    model = BiLSTM_CRF_Model(word2vec_embedding)
    #model = BiLSTM_CRF_Model(embedding)
    tf_board_callback = keras.callbacks_v1.TensorBoard(log_dir='.\\logs',
                                                       update_freq=1000)
    eval_callback = EvalCallBack(kash_model=model,
                                 valid_x=test_x,
                                 valid_y=test_y,
                                 step=4)

    model.fit(train_x,
              train_y,
              test_x,
              test_y,
              batch_size=20,
              epochs=4,
              callbacks=[eval_callback, tf_board_callback])
    model.evaluate(test_x, test_y)
    model.save('./model_8')

    # Prediction results
    # df_out columns: original text, primary tumor site, primary lesion size, metastasis site
    df_out = pd.DataFrame(columns=['原文', '肿瘤原发部位', '原发病灶大小', '转移部位'])
    loaded_model = kashgari.utils.load_model('model_8')
Example #8
                           sequence_length=128)


from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.python import keras
from kashgari.callbacks import EvalCallBack

# patience is the number of epochs without improvement before early stopping triggers
stop_callback = EarlyStopping(patience=5, restore_best_weights=True)
# save_callback = ModelCheckpoint("530test1.h5",save_best_only=True,save_weights_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_lr=1e-6)
# tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', update_freq=1000)
model = BiLSTM_CRF_Model(bert_embed)
eval_callback = EvalCallBack(kash_model=model,
                             valid_x=valid_x,
                             valid_y=valid_y,
                             step=3)


# optimizer = RAdam()
# model.compile_model(optimizer=optimizer)


model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          callbacks=[stop_callback, reduce_lr, eval_callback],
          batch_size=128,
          epochs=10)
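Mirroring the commented-out RAdam lines above (which would run before model.fit), a minimal sketch of swapping in a standard Keras optimizer; Adam and its learning rate are assumptions, not part of the original example.

from tensorflow.keras.optimizers import Adam

# compile_model accepts a custom optimizer, as the RAdam comment hints;
# call it before model.fit so training uses the swapped-in optimizer.
model.compile_model(optimizer=Adam(learning_rate=1e-4))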
Example #9
        if loss is None:
            loss = self.crf_layer.loss
        if metrics is None:
            metrics = [self.crf_layer.accuracy]
        super(BiGRU_CRF_Model, self).compile_model(loss=loss,
                                                   optimizer=optimizer,
                                                   metrics=metrics,
                                                   **kwargs)


if __name__ == "__main__":
    from kashgari.corpus import ChineseDailyNerCorpus
    from kashgari.callbacks import EvalCallBack

    train_x, train_y = ChineseDailyNerCorpus.load_data('train')
    valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')
    test_x, test_y = ChineseDailyNerCorpus.load_data('test')

    model = BiGRU_CRF_Model(sequence_length=10)

    eval_callback = EvalCallBack(kash_model=model,
                                 x_data=valid_x,
                                 y_data=valid_y,
                                 truncating=True,
                                 step=1)

    model.fit(train_x, train_y, valid_x, valid_y, epochs=1, callbacks=[])
    y = model.predict(test_x[:200])
    model.tf_model.summary()
    model.evaluate(test_x, test_y)
Example #10
def main():
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_fold = cp["TRAIN"].get("output_fold")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    sequence_length_max = cp["TRAIN"].getint("sequence_length_max")
    output_model_name = cp["TRAIN"].get("output_model_name")
    save_weights_only = cp["TRAIN"].getboolean("save_weights_only")
    cyclicLR_mode = cp["TRAIN"].get("cyclicLR_mode")
    base_lr = cp["TRAIN"].getfloat("base_lr")
    max_lr = cp["TRAIN"].getfloat("max_lr")
    file_train = cp["TRAIN"].get("file_train")
    file_valid = cp["TRAIN"].get("file_valid")
    file_test = cp["TRAIN"].get("file_test")

    today = datetime.date.today()
    formatted_today = today.strftime('%y%m%d')
    output_dir = os.path.join('experiments', formatted_today, output_fold)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    output_dir_src = os.path.join(output_dir, 'src')
    if not os.path.isdir(output_dir_src):
        os.makedirs(output_dir_src)
    print(f"backup config file to {output_dir_src}")
    shutil.copy(config_file, os.path.join(output_dir_src, os.path.split(config_file)[1]))
    train_file = os.path.basename(__file__)
    shutil.copy(train_file, os.path.join(output_dir_src, train_file))
    logging.basicConfig(level='DEBUG')
    bert_path = get_file('bert_sample_model',
                         "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                         cache_dir=DATA_PATH,
                         untar=True)

    train_x, train_y = preprocess(file_train)
    validate_x, validate_y = preprocess(file_valid)
    test_x, test_y = preprocess(file_test)

    #'bert-large-cased'
    embedding = BERTEmbedding(bert_path,
                              sequence_length=sequence_length_max,
                              task=kashgari.CLASSIFICATION,
                              trainable=True,
                              layer_nums=4)
    #embedding = BERTEmbedding('/home/ys1/pretrained_models/BERT/Japanese_L-12_H-768_A-12_E-30_BPE/', sequence_length=sequence_length_max, task=kashgari.CLASSIFICATION)

    # CNNModel or CNNLSTMModel could be used instead
    # model = BiGRU_Model(embedding)
    hyper = BiLSTM_Model.get_default_hyper_parameters()
    print(f'hyper parameters is:{hyper}')
    # example output: {'layer_bi_lstm': {'units': 128, 'return_sequences': False}, 'layer_dense': {'activation': 'softmax'}}
    # hyper['layer_bi_lstm']['units'] = 32
    model = BiLSTM_Model(embedding, hyper_parameters=hyper)
    # model.build_model(train_x, train_y)
    # model.build_multi_gpu_model(gpus=2)
    # print(model.summary())

    if save_weights_only:
        model_weights = os.path.join(output_dir, output_weights_name)
    else:
        model_weights = os.path.join(output_dir, output_model_name)

    checkpoint = keras.callbacks.ModelCheckpoint(
        model_weights,
        save_weights_only=save_weights_only,
        save_best_only=True,
        verbose=1,
    )
    earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=0, mode='min')
    csv_logger = keras.callbacks.CSVLogger(os.path.join(output_dir, 'training.csv'))
    batch_size_cycliclr = ceil(len(train_x)/batch_size)
    if cyclicLR_mode == 'exp_range':
        gamma = 0.99994
    else:
        gamma = 1.
    clr = CyclicLR(mode=cyclicLR_mode, step_size=batch_size_cycliclr, base_lr=base_lr, max_lr=max_lr, gamma=gamma)
    save_min_loss = SaveMinLoss(filepath=output_dir)
    tb = keras.callbacks.TensorBoard(log_dir=os.path.join(output_dir, "logs"), batch_size=batch_size, update_freq=1000)
    # Kashgari's built-in callback: computes precision, recall and F1 during training
    eval_callback = EvalCallBack(kash_model=model,
                                 valid_x=validate_x,
                                 valid_y=validate_y,
                                 step=5)
    callbacks = [
        eval_callback,
        checkpoint,
        tb,
        csv_logger,
        # clr,
        save_min_loss,
        earlystop,
    ]

    print("** start training **")
    model.fit(train_x,
              train_y,
              x_validate=validate_x,
              y_validate=validate_y,
              epochs=epochs,
              batch_size=batch_size,
              callbacks=callbacks,
              fit_kwargs={
                          'workers': generator_workers,
                          'use_multiprocessing': True,
                          'class_weight': 'auto',
                          }
              )

    model_path = os.path.join(output_dir, 'model')
    model.save(model_path)
    report_evaluate = model.evaluate(test_x, test_y, debug_info=True)

    with open(os.path.join(output_dir, 'report_evaluate.log'), 'w') as f:
        f.write(f"The evaluate report is : \n{str(report_evaluate)}")
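The script pulls every setting from the [TRAIN] section of ./config.ini. A hedged sample with exactly the keys the code reads; every value is a placeholder, not taken from the original project.

[TRAIN]
output_fold = run1
epochs = 50
batch_size = 32
generator_workers = 4
output_weights_name = weights.h5
sequence_length_max = 128
output_model_name = model.h5
save_weights_only = true
cyclicLR_mode = triangular
base_lr = 1e-5
max_lr = 1e-3
file_train = data/train.txt
file_valid = data/valid.txt
file_test = data/test.txt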