def do_train(df_train):
    # n-fold stratified split
    skf = StratifiedKFold(n_splits=n, random_state=seed, shuffle=True)
    # enumerate(..., 1) starts the fold counter at 1
    for fold, (train_idx, valid_idx) in enumerate(
            skf.split(df_train['content'], df_train['label_id']), 1):
        print(f'Fold {fold}')
        # Load this fold's data
        train_data = load_data(df_train.iloc[train_idx])
        valid_data = load_data(df_train.iloc[valid_idx])
        # Wrap the data in generators
        train_generator = data_generator(train_data, batch_size, random=True)
        valid_generator = data_generator(valid_data, batch_size)

        model = build_model()  # build the model
        # For multi-GPU training, wrap build_model() in a
        # tf.distribute.MirroredStrategy().scope() instead.

        # Enable adversarial training on the token-embedding layer
        adversarial_training(model, 'Embedding-Token', 0.5)

        # Callbacks
        callbacks = [
            Evaluator(valid_generator),  # run validation at the end of each epoch
            EarlyStopping(
                monitor='val_f1', patience=5, verbose=1,
                mode='max'),  # stop if val_f1 has not improved for 5 epochs
            ReduceLROnPlateau(
                monitor='val_f1', factor=0.5, patience=2, verbose=1,
                mode='max'),  # halve the learning rate when val_f1 plateaus
            ModelCheckpoint(
                f'weights-{fold}.h5',  # per-fold path so files are not overwritten
                monitor='val_f1',
                save_weights_only=True,
                save_best_only=True,
                verbose=1,
                mode='max'),  # keep only the weights of the best-val_f1 epoch
        ]

        # Train the model
        model.fit_generator(train_generator.forfit(),
                            steps_per_epoch=len(train_generator),
                            epochs=epochs,
                            callbacks=callbacks,
                            validation_data=valid_generator.forfit(),
                            validation_steps=len(valid_generator))

        del model          # free the model
        K.clear_session()  # clear the Keras session between folds
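# The three callbacks above all monitor 'val_f1', which Keras does not compute
# natively: the custom Evaluator callback has to inject it into the epoch logs.
# Below is a minimal sketch of such a callback (an assumption, not the original
# implementation; the generator is assumed to yield (inputs, labels) batches).
# Because Evaluator sits first in the callbacks list, the 'val_f1' it writes is
# visible to EarlyStopping / ReduceLROnPlateau / ModelCheckpoint the same epoch.
import numpy as np
from sklearn.metrics import f1_score
from bert4keras.backend import keras


class Evaluator(keras.callbacks.Callback):
    def __init__(self, valid_generator):
        super(Evaluator, self).__init__()
        self.valid_generator = valid_generator

    def on_epoch_end(self, epoch, logs=None):
        y_true, y_pred = [], []
        for x, y in self.valid_generator:
            y_pred.extend(self.model.predict(x).argmax(axis=-1).reshape(-1))
            y_true.extend(np.asarray(y).reshape(-1))
        logs = logs if logs is not None else {}
        logs['val_f1'] = f1_score(y_true, y_pred, average='macro')
        print('val_f1: %.5f' % logs['val_f1'])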
def run_cv(nfolds, data, data_label, data_test, data_valid,
           epochs=10, date_str='1107'):
    skf = KFold(n_splits=nfolds, shuffle=True, random_state=48).split(data)
    for k, (train_fold, test_fold) in enumerate(skf):
        print('Fold: ', k)
        # Data: split this fold into train / valid lists of tuples
        train_data = [tuple(i) for i in np.array(data)[train_fold]]
        valid_data = [tuple(i) for i in np.array(data)[test_fold]]

        # Model: build a fresh BERT+CRF model for this fold
        model, CRF = build_bert(num_labels)
        file_path = date_str + str(k) + '.weights'
        evaluator = Evaluator(valid_data, model, CRF, file_path)

        # Train only if this fold's weights do not exist yet
        if not os.path.exists(file_path):
            train_generator = data_generator(train_data, batch_size)
            valid_generator = data_generator(valid_data, batch_size)
            model.fit_generator(train_generator.forfit(),
                                steps_per_epoch=len(train_generator),
                                validation_data=valid_generator.forfit(),
                                validation_steps=len(valid_generator),
                                epochs=epochs,
                                verbose=1,
                                callbacks=[evaluator])
        model.load_weights(file_path)

        # Per-fold predictions on the test set
        data_test['submit_' + str(k)] = data_test['context'].apply(
            lambda x: extract_arguments(x, model, CRF))
        print(data_test['submit_' + str(k)])
        print('Non-empty extractions: ',
              sum(data_test['submit_' + str(k)].apply(len) > 0))

        # Free memory between folds
        del model
        del CRF
        gc.collect()
        K.clear_session()
    return data_test, data_valid
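# Hypothetical usage of run_cv (the DataFrame names and fold count below are
# placeholders, not from the original project): each fold adds a 'submit_k'
# column of extractions to the test frame, which can then be merged, e.g. by
# counting how many folds produced a non-empty extraction per row.
df_test, df_valid = run_cv(10, train, train_label, df_test, df_valid,
                           epochs=10, date_str='1107')
submit_cols = [c for c in df_test.columns if c.startswith('submit_')]
df_test['n_hits'] = df_test[submit_cols].applymap(len).gt(0).sum(axis=1)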
def do_train(df_train):
    skf = StratifiedKFold(n_splits=n, random_state=SEED, shuffle=True)
    for fold, (trn_idx, val_idx) in enumerate(
            skf.split(df_train['text'], df_train['label']), 1):
        print(f'Fold {fold}')
        train_data = load_data(df_train.iloc[trn_idx])
        valid_data = load_data(df_train.iloc[val_idx])
        train_generator = data_generator(train_data, batch_size, random=True)
        valid_generator = data_generator(valid_data, batch_size)

        model = build_model()
        adversarial_training(model, 'Embedding-Token', 0.5)

        callbacks = [
            Evaluator(valid_generator),
            EarlyStopping(monitor='val_f1', patience=5, verbose=1, mode='max'),
            ReduceLROnPlateau(monitor='val_f1', factor=0.5, patience=2,
                              verbose=1, mode='max'),
            ModelCheckpoint(f'weights-{fold}.h5',
                            monitor='val_f1',
                            save_weights_only=True,
                            save_best_only=True,
                            verbose=1,
                            mode='max'),
        ]

        model.fit_generator(train_generator.forfit(),
                            steps_per_epoch=len(train_generator),
                            epochs=epochs,
                            callbacks=callbacks,
                            validation_data=valid_generator.forfit(),
                            validation_steps=len(valid_generator))

        del model
        K.clear_session()
def do_train2(df_train):
    from sklearn.model_selection import train_test_split

    X_train, X_valid, y_train, y_valid = train_test_split(
        df_train['text'].values,
        df_train['label'].values,
        shuffle=True,
        test_size=0.1,
        random_state=2021,
        stratify=df_train['label'].values)

    train_data = load_data2(X_train, y_train)
    valid_data = load_data2(X_valid, y_valid)
    train_generator = data_generator(train_data, batch_size, random=True)
    valid_generator = data_generator(valid_data, batch_size)

    model = build_model()
    adversarial_training(model, 'Embedding-Token', 0.5)

    fold = 0
    callbacks = [
        Evaluator(valid_generator),
        EarlyStopping(monitor='val_f1', patience=5, verbose=1, mode='max'),
        ReduceLROnPlateau(monitor='val_f1', factor=0.5, patience=2,
                          verbose=1, mode='max'),
        ModelCheckpoint(f'weights-{fold}.h5',
                        monitor='val_f1',
                        save_weights_only=True,
                        save_best_only=True,
                        verbose=1,
                        mode='max'),
    ]

    model.fit_generator(train_generator.forfit(),
                        steps_per_epoch=len(train_generator),
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=valid_generator.forfit(),
                        validation_steps=len(valid_generator))

    del model
    K.clear_session()
def do_train(mode='bert', filename='roberta', lastfour=False,
             LR=1e-5, DR=0.2, ext=False, batch_size=16):
    skf = StratifiedKFold(5, shuffle=True, random_state=2020)
    nfold = 1
    # Optionally include the externally augmented training set
    if ext:
        data = np.concatenate([train_ext_data, valid_data], axis=0)
    else:
        data = np.concatenate([train_data, valid_data], axis=0)

    for train_index, valid_index in skf.split(data[:, :2],
                                              data[:, 2:].astype('int')):
        train = data[train_index, :]
        valid = data[valid_index, :]
        train_generator = data_generator(train, batch_size)
        valid_generator = data_generator(valid, batch_size)

        model = build_model(mode=mode, filename=filename, lastfour=lastfour,
                            LR=LR, DR=DR)
        # Enabling adversarial training takes a single line
        adversarial_training(model, 'Embedding-Token', 0.5)

        early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
        suffix = '_ext.weights' if ext else '.weights'
        checkpoint = ModelCheckpoint('../user_data/model_data/' + filename +
                                     '_weights/' + str(nfold) + suffix,
                                     monitor='val_loss',
                                     save_weights_only=True,
                                     save_best_only=True,
                                     verbose=1)

        model.fit_generator(train_generator.forfit(),
                            steps_per_epoch=len(train_generator),
                            epochs=5,
                            validation_data=valid_generator.forfit(),
                            validation_steps=len(valid_generator),
                            callbacks=[early_stopping, checkpoint],
                            verbose=2)

        del model
        K.clear_session()
        nfold += 1
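# adversarial_training(model, 'Embedding-Token', 0.5) perturbs the token
# embeddings during training. Below is a toy numeric sketch of the underlying
# FGM step (an illustration of the idea, not the bert4keras implementation):
import numpy as np


def fgm_perturbation(grad, epsilon=0.5):
    """FGM: a step of size epsilon along the L2-normalized loss gradient."""
    norm = np.linalg.norm(grad)
    if norm == 0:
        return np.zeros_like(grad)
    return epsilon * grad / norm


# One adversarial training step would: save the embedding weights, add the
# perturbation, run the usual gradient update on the perturbed batch, then
# restore the saved weights.
embedding = np.random.randn(4, 8)  # toy embedding matrix
grad = np.random.randn(4, 8)       # stand-in for dLoss/dEmbedding
perturbed = embedding + fgm_perturbation(grad, epsilon=0.5)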
#! -*- coding:utf-8 -*-
# Sentiment-analysis example, loading albert_zh weights
# (https://github.com/brightmart/albert_zh)
import datetime
from bert4keras.backend import keras, set_gelu, K, tf
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator, to_array
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
from config.pathconfig.dirconfig import albert_dir, data_dir, model_dir

K.clear_session()
tf.random.set_seed(2020)
set_gelu('tanh')  # switch the GeLU variant

num_classes = 2
maxlen = 128
batch_size = 64
epochs = 10

config_path = albert_dir + '/albert_config.json'
checkpoint_path = albert_dir + '/model.ckpt-best'
dict_path = albert_dir + '/vocab_chinese.txt'


def load_data(filename):
    """Read tab-separated "text<TAB>label" lines into a list of tuples."""
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D
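# Minimal self-check for load_data (a sketch: the sample rows and the
# temp-file path are illustrative, not from the original project).
if __name__ == '__main__':
    import os
    import tempfile
    sample = '这家店的服务很好\t1\n菜品一般,不会再来了\t0\n'
    with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False,
                                     encoding='utf-8') as tmp:
        tmp.write(sample)
    print(load_data(tmp.name))  # [('这家店的服务很好', 1), ('菜品一般,不会再来了', 0)]
    os.remove(tmp.name)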
def albert_model(self):
    # Rebuild path: clear the Keras session, then drop the stale model.
    K.clear_session()
    del self._model