Example #1
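The helper read_data_file is not defined in this snippet. A minimal sketch, assuming each cnews line holds a tab-separated "label\ttext" pair and character-level tokens (the actual helper may differ):

def read_data_file(path):
    # Hypothetical loader: one "label\ttext" pair per line
    x_list, y_list = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            label, text = line.strip().split('\t', 1)
            x_list.append(list(text))  # character-level tokens
            y_list.append(label)
    return x_list, y_list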
test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

# Initialize the ERNIE (BERT-style) embedding

import kashgari
from kashgari.tasks.classification import BiLSTM_Model
from kashgari.embeddings import BERTEmbedding

bert_embed = BERTEmbedding('baidu_ernie',
                           task=kashgari.CLASSIFICATION,
                           sequence_length=600)
model = BiLSTM_Model(bert_embed)
# Alternative: initialize a BERT embedding instead
# from kashgari.embeddings import BERTEmbedding
# embedding = BERTEmbedding('bert-base-chinese', sequence_length=600)
#
# # Build the model with the embedding
# from kashgari.tasks.classification import CNNModel
# model = CNNModel(embedding)

# Visualization: run `$ tensorboard --logdir logs` from this directory
model.fit(train_x,
          train_y,
          val_x,
          val_y,
          epochs=5)
model.evaluate(test_x, test_y)
model.save('./model')
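The saved model can be reloaded for inference with kashgari.utils.load_model; a minimal sketch (the sample sentence is illustrative, and input must be tokenized the same way as the training data):

# Reload the saved model and classify a new text (hypothetical input)
loaded_model = kashgari.utils.load_model('./model')
print(loaded_model.predict([list('这是一条测试新闻')]))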

Example #2

# Prepare the train / validation / test splits
train_x = list(df['cutted'][:int(len(df) * 0.7)])
train_y = list(df['label'][:int(len(df) * 0.7)])

valid_x = list(df['cutted'][int(len(df) * 0.7):int(len(df) * 0.85)])
valid_y = list(df['label'][int(len(df) * 0.7):int(len(df) * 0.85)])

test_x = list(df['cutted'][int(len(df) * 0.85):])
test_y = list(df['label'][int(len(df) * 0.85):])

import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import BiLSTM_Model

BERT_PATH = '<path to the BERT/ERNIE model>'

# Initialize the embedding
embed = BERTEmbedding(BERT_PATH,
                      task=kashgari.CLASSIFICATION,
                      sequence_length=64)

# Build the model with the embedding
model = BiLSTM_Model(embed)
# Train for just one epoch first
model.fit(train_x, train_y, valid_x, valid_y, batch_size=1024, epochs=1)
model.evaluate(test_x, test_y, batch_size=512)
model.save('bert_model')
# Convert to a TensorFlow SavedModel for serving
kashgari.utils.convert_to_saved_model(model, 'tf_bert_model', version=1)
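convert_to_saved_model writes the model under tf_bert_model/1 (the version subdirectory). Its serving signatures can be inspected with TensorFlow's saved_model_cli, for example:

# $ saved_model_cli show --dir tf_bert_model/1 --all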
Example #3
# read_data_file was truncated in the listing; reconstructed here under the
# same tab-separated "label\ttext" assumption as the sketch in Example #1
def read_data_file(path):
    x_list, y_list = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            label, text = line.strip().split('\t', 1)
            x_list.append(list(text))
            y_list.append(label)
    return x_list, y_list

test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

# Train a BiLSTM with the default (randomly initialized) embedding

import kashgari
from kashgari.tasks.classification import BiLSTM_Model
# from kashgari.embeddings import BERTEmbedding
#
# bert_embed = BERTEmbedding('chinese_L-12_H-768_A-12',
#                            task=kashgari.CLASSIFICATION,
#                            sequence_length=600)
model = BiLSTM_Model()
# Alternative: initialize a BERT embedding
# from kashgari.embeddings import BERTEmbedding
# embedding = BERTEmbedding('bert-base-chinese', sequence_length=600)
#
# # Build the model with the embedding
# from kashgari.tasks.classification import CNNModel
# model = CNNModel(embedding)

model.fit(train_x, train_y, val_x, val_y)
model.evaluate(test_x, test_y)
model.save('./model_BILSTM')

Example #4
test_x, test_y = read_data_file('topic_data/test.txt')
train_x, train_y = read_data_file('topic_data/train.txt')
val_x, val_y = read_data_file('topic_data/dev.txt')

# Initialize the GPT-2 embedding

import kashgari
from kashgari.embeddings import GPT2Embedding
from kashgari.tasks.classification import BiLSTM_Model

# BiLSTM accuracy: 0.8291
embed = GPT2Embedding('GPT-2',
                      task=kashgari.CLASSIFICATION,
                      sequence_length=100)
model = BiLSTM_Model(embed)
# Alternative: initialize a BERT embedding
# from kashgari.embeddings import BERTEmbedding
# embedding = BERTEmbedding('bert-base-chinese', sequence_length=600)
#
# # Build the model with the embedding
# from kashgari.tasks.classification import CNNModel
# model = CNNModel(embedding)


model.fit(train_x, train_y, val_x, val_y, batch_size=16, epochs=5)
model.evaluate(test_x, test_y)
model.save('./topic_gpt-2+bilstm')

Example #5

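This training script is a fragment; its import block is not shown. The following imports cover every name used below, a sketch whose exact module paths may vary with the Kashgari version (CyclicLR, SaveMinLoss, and preprocess are assumed to be project-local helpers):

import os
import shutil
import logging
import datetime
from math import ceil
from configparser import ConfigParser

import keras
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import BiLSTM_Model
from kashgari.callbacks import EvalCallBack
from kashgari.macros import DATA_PATH
from tensorflow.python.keras.utils import get_file
# CyclicLR, SaveMinLoss, and preprocess come from this project (not shown)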
def main():
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_fold = cp["TRAIN"].get("output_fold")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    sequence_length_max = cp["TRAIN"].getint("sequence_length_max")
    output_model_name = cp["TRAIN"].get("output_model_name")
    save_weights_only = cp["TRAIN"].getboolean("save_weights_only")
    cyclicLR_mode = cp["TRAIN"].get("cyclicLR_mode")
    base_lr = cp["TRAIN"].getfloat("base_lr")
    max_lr = cp["TRAIN"].getfloat("max_lr")
    file_train = cp["TRAIN"].get("file_train")
    file_valid = cp["TRAIN"].get("file_valid")
    file_test = cp["TRAIN"].get("file_test")

    today = datetime.date.today()
    formatted_today = today.strftime('%y%m%d')
    output_dir = os.path.join('experiments', formatted_today, output_fold)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    output_dir_src = os.path.join(output_dir, 'src')
    if not os.path.isdir(output_dir_src):
        os.makedirs(output_dir_src)
    print(f"backup config file to {output_dir_src}")
    shutil.copy(config_file, os.path.join(output_dir_src, os.path.split(config_file)[1]))
    train_file = os.path.basename(__file__)
    shutil.copy(train_file, os.path.join(output_dir_src, train_file))
    logging.basicConfig(level='DEBUG')
    bert_path = get_file('bert_sample_model',
                         "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                         cache_dir=DATA_PATH,
                         untar=True)

    train_x, train_y = preprocess(file_train)
    validate_x, validate_y = preprocess(file_valid)
    test_x, test_y = preprocess(file_test)

    # Alternative checkpoint: 'bert-large-cased'
    embedding = BERTEmbedding(bert_path,
                              sequence_length=sequence_length_max,
                              task=kashgari.CLASSIFICATION,
                              trainable=True,
                              layer_nums=4)
    #embedding = BERTEmbedding('/home/ys1/pretrained_models/BERT/Japanese_L-12_H-768_A-12_E-30_BPE/', sequence_length=sequence_length_max, task=kashgari.CLASSIFICATION)

    # CNNModel and CNNLSTMModel are also available
    # model = BiGRU_Model(embedding)
    hyper = BiLSTM_Model.get_default_hyper_parameters()
    print(f'hyper parameters: {hyper}')
    # hyper parameters: {'layer_bi_lstm': {'units': 128, 'return_sequences': False}, 'layer_dense': {'activation': 'softmax'}}
    # hyper['layer_bi_lstm']['units'] = 32
    model = BiLSTM_Model(embedding, hyper_parameters=hyper)
    # model.build_model(train_x, train_y)
    # model.build_multi_gpu_model(gpus=2)
    # print(model.summary())

    if save_weights_only:
        model_weights = os.path.join(output_dir, output_weights_name)
    else:
        model_weights = os.path.join(output_dir, output_model_name)

    checkpoint = keras.callbacks.ModelCheckpoint(
        model_weights,
        save_weights_only=save_weights_only,
        save_best_only=True,
        verbose=1,
    )
    earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=0, mode='min')
    csv_logger = keras.callbacks.CSVLogger(os.path.join(output_dir, 'training.csv'))
    batch_size_cycliclr = ceil(len(train_x)/batch_size)
    if cyclicLR_mode == 'exp_range':
        gamma = 0.99994
    else:
        gamma = 1.
    clr = CyclicLR(mode=cyclicLR_mode, step_size=batch_size_cycliclr, base_lr=base_lr, max_lr=max_lr, gamma=gamma)
    save_min_loss = SaveMinLoss(filepath=output_dir)
    tb = keras.callbacks.TensorBoard(log_dir=os.path.join(output_dir, "logs"), batch_size=batch_size, update_freq=1000)
    # Kashgari's built-in callback; it computes precision, recall, and F1 during training
    eval_callback = EvalCallBack(kash_model=model,
                                 valid_x=validate_x,
                                 valid_y=validate_y,
                                 step=5)
    callbacks = [
        eval_callback,
        checkpoint,
        tb,
        csv_logger,
        # clr,
        save_min_loss,
        earlystop,
    ]

    print("** start training **")
    model.fit(train_x,
              train_y,
              x_validate=validate_x,
              y_validate=validate_y,
              epochs=epochs,
              batch_size=batch_size,
              callbacks=callbacks,
              fit_kwargs={
                          'workers': generator_workers,
                          'use_multiprocessing': True,
                          'class_weight': 'auto',
                          }
              )

    model_path = os.path.join(output_dir, 'model')
    model.save(model_path)
    report_evaluate = model.evaluate(test_x, test_y, debug_info=True)

    with open(os.path.join(output_dir, 'report_evaluate.log'), 'w') as f:
        f.write(f"The evaluation report is:\n{report_evaluate}")
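The preprocess helper called above is also not part of the fragment. A compact sketch, assuming the same tab-separated "label\ttext" layout as the cnews loaders in the earlier examples (the real project may tokenize differently):

def preprocess(path):
    # Hypothetical loader, mirroring read_data_file from Example #1
    pairs = [line.strip().split('\t', 1) for line in open(path, encoding='utf-8')]
    return [list(text) for _, text in pairs], [label for label, _ in pairs]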
Example #6
    MODELFILENAME = 'WORD2VEC'

elif args.embedding.lower() == 'pytorchbert':
    model, tokenizer = train_Pytorch_BERT(tweets,
                                          input_data['target'].tolist())
    MODELFILENAME = 'pytorch_BERT'

else:
    raise ValueError(
        "Parameter --embedding for the model type must be either \"BERT\" or \"W2V\"!"
    )

# Save the model
timestamp = str(datetime.now().strftime("%Y%m%d_%H-%M-%S"))
modelfilePath = 'models/' + MODELFILENAME + '-' + timestamp

if args.embedding.lower() == 'pytorchbert':
    os.mkdir(modelfilePath)
    output_model_file = os.path.join(modelfilePath, WEIGHTS_NAME)
    output_config_file = os.path.join(modelfilePath, CONFIG_NAME)
    torch.save(model.state_dict(), output_model_file)
    model.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(modelfilePath)
else:
    model.save(modelfilePath)

#%%
#Code for reloading the PyTorch model; this will be implemented in the final application, since it's the best-performing model.
#model_state_dict = torch.load(output_model_file)
#loaded_model = BertForSequenceClassification.from_pretrained(modelfilePath, state_dict=model_state_dict, num_labels=2)
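To finish the reload for inference, the tokenizer must be restored and the model switched to eval mode; a sketch in the same commented style, assuming the pytorch_pretrained_bert package whose WEIGHTS_NAME/CONFIG_NAME constants are used above:

#from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
#loaded_tokenizer = BertTokenizer.from_pretrained(modelfilePath)
#loaded_model.eval()  # disable dropout before predicting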