test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

import kashgari
from kashgari.tasks.classification import BiLSTM_Model
from kashgari.embeddings import BERTEmbedding

# Initialize the ERNIE embedding (BERTEmbedding also loads ERNIE checkpoints)
bert_embed = BERTEmbedding('baidu_ernie',
                           task=kashgari.CLASSIFICATION,
                           sequence_length=600)
model = BiLSTM_Model(bert_embed)

# Alternative: initialize a BERT embedding and use it with a CNN model
# from kashgari.embeddings import BERTEmbedding
# embedding = BERTEmbedding('bert-base-chinese', sequence_length=600)
# from kashgari.tasks.classification import CNNModel
# model = CNNModel(embedding)

# Visualization: run `tensorboard --logdir logs` from this directory
model.fit(train_x, train_y, val_x, val_y, epochs=5)
model.evaluate(test_x, test_y)
model.save('./model')
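# The script above saves the trained model to './model'. As a quick sanity
# check, Kashgari 1.x can reload it and classify raw text. This is only a
# sketch; the sample sentence and character-level tokenization are
# illustrative assumptions, not part of the original script.
import kashgari

loaded_model = kashgari.utils.load_model('./model')
# predict() expects a list of token lists; Chinese text is split into characters here
print(loaded_model.predict([list('球队昨晚赢得比赛')]))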
# Prepare the train / validation / test splits (70% / 15% / 15%)
train_x = list(df['cutted'][:int(len(df) * 0.7)])
train_y = list(df['label'][:int(len(df) * 0.7)])
valid_x = list(df['cutted'][int(len(df) * 0.7):int(len(df) * 0.85)])
valid_y = list(df['label'][int(len(df) * 0.7):int(len(df) * 0.85)])
test_x = list(df['cutted'][int(len(df) * 0.85):])
test_y = list(df['label'][int(len(df) * 0.85):])

import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import BiLSTM_Model

BERT_PATH = '<path to the BERT/ERNIE checkpoint>'

# Initialize the embedding
embed = BERTEmbedding(BERT_PATH,
                      task=kashgari.CLASSIFICATION,
                      sequence_length=64)

# Initialize the model with the embedding
model = BiLSTM_Model(embed)

# Train for a single epoch first
model.fit(train_x, train_y, valid_x, valid_y, batch_size=1024, epochs=1)
model.evaluate(test_x, test_y, batch_size=512)
model.save('bert_model')

# Export a TensorFlow SavedModel (e.g. for TF Serving)
kashgari.utils.convert_to_saved_model(model, 'tf_bert_model', version=1)
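# The DataFrame `df` above is assumed to already hold a tokenized text column
# ('cutted') and a label column ('label'). A minimal sketch of how it might be
# built with pandas and jieba; the file name and the 'text' column are
# placeholders, not taken from the original script.
import pandas as pd
import jieba

df = pd.read_csv('data.csv')                  # hypothetical file with 'text' and 'label' columns
df['cutted'] = df['text'].apply(jieba.lcut)   # list of tokens per row, as Kashgari expects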
    return x_list, y_list


test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

import kashgari
from kashgari.tasks.classification import BiLSTM_Model

# Optional: use a BERT embedding instead of the default randomly initialized one
# from kashgari.embeddings import BERTEmbedding
# bert_embed = BERTEmbedding('chinese_L-12_H-768_A-12',
#                            task=kashgari.CLASSIFICATION,
#                            sequence_length=600)
model = BiLSTM_Model()

# Alternative: a BERT embedding with a CNN model
# from kashgari.embeddings import BERTEmbedding
# embedding = BERTEmbedding('bert-base-chinese', sequence_length=600)
# from kashgari.tasks.classification import CNNModel
# model = CNNModel(embedding)

model.fit(train_x, train_y, val_x, val_y)
model.evaluate(test_x, test_y)
model.save('./model_BILSTM')
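# The body of read_data_file() is truncated above (only its final `return`
# survives). A minimal sketch, assuming each line of the cnews files is
# "label<TAB>text" and that characters are used as tokens:
def read_data_file(path):
    x_list, y_list = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            rows = line.strip().split('\t', 1)
            if len(rows) != 2:
                continue
            y_list.append(rows[0])
            x_list.append(list(rows[1]))  # character-level tokens
    return x_list, y_list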
test_x, test_y = read_data_file('topic_data/test.txt')
train_x, train_y = read_data_file('topic_data/train.txt')
val_x, val_y = read_data_file('topic_data/dev.txt')

import kashgari
from kashgari.embeddings import WordEmbedding
from kashgari.embeddings import GPT2Embedding
from kashgari.tasks.classification import BiLSTM_Model

# Initialize the GPT-2 embedding (BiLSTM accuracy: 0.8291)
embed = GPT2Embedding('GPT-2',
                      task=kashgari.CLASSIFICATION,
                      sequence_length=100)
model = BiLSTM_Model(embed)

# Alternative: a BERT embedding with a CNN model
# from kashgari.embeddings import BERTEmbedding
# embedding = BERTEmbedding('bert-base-chinese', sequence_length=600)
# from kashgari.tasks.classification import CNNModel
# model = CNNModel(embedding)

model.fit(train_x, train_y, val_x, val_y, batch_size=16, epochs=5)
model.evaluate(test_x, test_y)
model.save('./topic_gpt-2+bilstm')
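# WordEmbedding is imported above but never used. For reference, a pre-trained
# word2vec embedding could be swapped in as sketched below; the vectors path is
# a placeholder and this variant is not part of the original script.
# w2v_embed = WordEmbedding('<path to word2vec vectors .txt>',
#                           task=kashgari.CLASSIFICATION,
#                           sequence_length=100)
# model = BiLSTM_Model(w2v_embed)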
def main():
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_fold = cp["TRAIN"].get("output_fold")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    sequence_length_max = cp["TRAIN"].getint("sequence_length_max")
    output_model_name = cp["TRAIN"].get("output_model_name")
    save_weights_only = cp["TRAIN"].getboolean("save_weights_only")
    cyclicLR_mode = cp["TRAIN"].get("cyclicLR_mode")
    base_lr = cp["TRAIN"].getfloat("base_lr")
    max_lr = cp["TRAIN"].getfloat("max_lr")
    file_train = cp["TRAIN"].get("file_train")
    file_valid = cp["TRAIN"].get("file_valid")
    file_test = cp["TRAIN"].get("file_test")

    today = datetime.date.today()
    formatted_today = today.strftime('%y%m%d')
    output_dir = os.path.join('experiments', formatted_today, output_fold)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_dir_src = os.path.join(output_dir, 'src')
    if not os.path.isdir(output_dir_src):
        os.makedirs(output_dir_src)

    print(f"backup config file to {output_dir_src}")
    shutil.copy(config_file, os.path.join(output_dir_src, os.path.split(config_file)[1]))
    train_file = os.path.basename(__file__)
    shutil.copy(train_file, os.path.join(output_dir_src, train_file))

    logging.basicConfig(level='DEBUG')

    bert_path = get_file('bert_sample_model',
                         "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                         cache_dir=DATA_PATH,
                         untar=True)

    train_x, train_y = preprocess(file_train)
    validate_x, validate_y = preprocess(file_valid)
    test_x, test_y = preprocess(file_test)

    # 'bert-large-cased'
    embedding = BERTEmbedding(bert_path,
                              sequence_length=sequence_length_max,
                              task=kashgari.CLASSIFICATION,
                              trainable=True,
                              layer_nums=4)
    # embedding = BERTEmbedding('/home/ys1/pretrained_models/BERT/Japanese_L-12_H-768_A-12_E-30_BPE/',
    #                           sequence_length=sequence_length_max,
    #                           task=kashgari.CLASSIFICATION)

    # CNNModel or CNNLSTMModel could be used here instead
    # model = BiGRU_Model(embedding)
    hyper = BiLSTM_Model.get_default_hyper_parameters()
    print(f'hyper parameters is:{hyper}')
    # hyper parameters is: {'layer_bi_lstm': {'units': 128, 'return_sequences': False},
    #                       'layer_dense': {'activation': 'softmax'}}
    # hyper['layer_bi_lstm']['units'] = 32
    model = BiLSTM_Model(embedding, hyper_parameters=hyper)
    # model.build_model(train_x, train_y)
    # model.build_multi_gpu_model(gpus=2)
    # print(model.summary())

    if save_weights_only:
        model_weights = os.path.join(output_dir, output_weights_name)
    else:
        model_weights = os.path.join(output_dir, output_model_name)

    checkpoint = keras.callbacks.ModelCheckpoint(
        model_weights,
        save_weights_only=save_weights_only,
        save_best_only=True,
        verbose=1,
    )
    earlystop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                              min_delta=0,
                                              patience=20,
                                              verbose=0,
                                              mode='min')
    csv_logger = keras.callbacks.CSVLogger(os.path.join(output_dir, 'training.csv'))

    batch_size_cycliclr = ceil(len(train_x) / batch_size)
    if cyclicLR_mode == 'exp_range':
        gamma = 0.99994
    else:
        gamma = 1.
    clr = CyclicLR(mode=cyclicLR_mode,
                   step_size=batch_size_cycliclr,
                   base_lr=base_lr,
                   max_lr=max_lr,
                   gamma=gamma)
    save_min_loss = SaveMinLoss(filepath=output_dir)
    tb = keras.callbacks.TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                                     batch_size=batch_size,
                                     update_freq=1000)
    # Kashgari's built-in callback: computes precision, recall and F1 during training
    eval_callback = EvalCallBack(kash_model=model,
                                 valid_x=validate_x,
                                 valid_y=validate_y,
                                 step=5)
    callbacks = [
        eval_callback,
        checkpoint,
        tb,
        csv_logger,
        # clr,
        save_min_loss,
        earlystop,
    ]

    print("** start training **")
    model.fit(train_x, train_y,
              x_validate=validate_x,
              y_validate=validate_y,
              epochs=epochs,
              batch_size=batch_size,
              callbacks=callbacks,
              fit_kwargs={
                  'workers': generator_workers,
                  'use_multiprocessing': True,
                  'class_weight': 'auto',
              })

    model_path = os.path.join(output_dir, 'model')
    model.save(model_path)
    report_evaluate = model.evaluate(test_x, test_y, debug_info=True)
    with open(os.path.join(output_dir, 'report_evaluate.log'), 'w') as f:
        f.write(f"The evaluate report is:\n{str(report_evaluate)}")
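# main() reads every hyper-parameter from a [TRAIN] section in ./config.ini.
# A hypothetical example of generating such a file with configparser; all
# values below are placeholders, not the settings used by the original author.
from configparser import ConfigParser

sample = ConfigParser()
sample['TRAIN'] = {
    'output_fold': 'bilstm_bert',
    'epochs': '50',
    'batch_size': '64',
    'generator_workers': '4',
    'output_weights_name': 'weights.h5',
    'output_model_name': 'model.h5',
    'sequence_length_max': '128',
    'save_weights_only': 'True',
    'cyclicLR_mode': 'exp_range',
    'base_lr': '1e-5',
    'max_lr': '1e-3',
    'file_train': 'data/train.tsv',
    'file_valid': 'data/valid.tsv',
    'file_test': 'data/test.tsv',
}
with open('./config.ini', 'w') as f:
    sample.write(f)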
    MODELFILENAME = 'WORD2VEC'
elif args.embedding.lower() == 'pytorchbert':
    model, tokenizer = train_Pytorch_BERT(tweets, input_data['target'].tolist())
    MODELFILENAME = 'pytorch_BERT'
else:
    raise ValueError(
        "Parameter --embedding for the model type must be "
        "\"BERT\", \"W2V\" or \"pytorchBERT\"!"
    )

# Save model
timestamp = str(datetime.now().strftime("%Y%m%d_%H-%M-%S"))
modelfilePath = 'models/' + MODELFILENAME + '-' + timestamp

if args.embedding.lower() == 'pytorchbert':
    os.mkdir(modelfilePath)
    output_model_file = os.path.join(modelfilePath, WEIGHTS_NAME)
    output_config_file = os.path.join(modelfilePath, CONFIG_NAME)
    torch.save(model.state_dict(), output_model_file)
    model.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(modelfilePath)
else:
    model.save(modelfilePath)

#%%
# Code for reloading the PyTorch model -- will be implemented in the final
# application, since it is the best-performing model.
# model_state_dict = torch.load(output_model_file)
# loaded_model = BertForSequenceClassification.from_pretrained(modelfilePath,
#                                                              state_dict=model_state_dict,
#                                                              num_labels=2)
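# A minimal sketch of completing that reload for inference. The import assumes
# the same pytorch_pretrained_bert API whose WEIGHTS_NAME/CONFIG_NAME constants
# are used above; the sample tweet and the tokenizer class are assumptions,
# not taken from the original script.
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

model_state_dict = torch.load(output_model_file)
loaded_model = BertForSequenceClassification.from_pretrained(
    modelfilePath, state_dict=model_state_dict, num_labels=2)
loaded_tokenizer = BertTokenizer.from_pretrained(modelfilePath)
loaded_model.eval()

# Classify a single example tweet
tokens = loaded_tokenizer.tokenize("example tweet text")
ids = loaded_tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])
with torch.no_grad():
    logits = loaded_model(torch.tensor([ids]))
print(logits.argmax(dim=-1).item())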