def do_train(df_train):
    # n-fold stratified cross-validation
    skf = StratifiedKFold(n_splits=n, random_state=seed, shuffle=True)
    # skf.split(...) yields (train_idx, valid_idx) index pairs for each fold;
    # enumerate(..., 1) starts the index at 1, so fold runs from 1 to n
    for fold, (train_idx, valid_idx) in enumerate(
            skf.split(df_train['content'], df_train['label_id']), 1):
        print(f'Fold {fold}')
        # load this fold's train/valid data
        train_data = load_data(df_train.iloc[train_idx])
        valid_data = load_data(df_train.iloc[valid_idx])
        # wrap the data in batch generators (shuffle only the training set)
        train_generator = data_generator(train_data, batch_size, random=True)
        valid_generator = data_generator(valid_data, batch_size)

        model = build_model()  # build the model
        # Multi-GPU variant:
        # strategy = tf.distribute.MirroredStrategy()
        # print('Number of devices: %d' % strategy.num_replicas_in_sync)
        # with strategy.scope():
        #     model = build_model()
        #     model.summary()

        # add adversarial training on the token embeddings
        adversarial_training(model, 'Embedding-Token', 0.5)
        # callbacks
        callbacks = [
            Evaluator(valid_generator),  # run validation at the end of each epoch
            EarlyStopping(
                monitor='val_f1', patience=5, verbose=1,
                mode='max'),  # stop if val_f1 fails to beat the best for 5 epochs
            ReduceLROnPlateau(monitor='val_f1',
                              factor=0.5,
                              patience=2,
                              verbose=1,
                              mode='max'),  # when val_f1 plateaus, cut the LR to try to keep improving
            ModelCheckpoint(
                f'weights-{fold}.h5',  # per-fold path, so fold checkpoints are not overwritten
                monitor='val_f1',
                save_weights_only=True,
                save_best_only=True,
                verbose=1,
                mode='max'),  # save weights only, and only for the best-val_f1 epoch
        ]
        # train the model
        model.fit_generator(train_generator.forfit(),
                            steps_per_epoch=len(train_generator),
                            epochs=epochs,
                            callbacks=callbacks,
                            validation_data=valid_generator.forfit(),
                            validation_steps=len(valid_generator))

        del model  # free the model
        K.clear_session()  # reset the Keras session before the next fold
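Note that val_f1 is not a metric Keras computes by itself: the Evaluator callback is listed first so it can compute F1 on the validation generator and write it into logs, where the EarlyStopping / ReduceLROnPlateau / ModelCheckpoint callbacks behind it can monitor that key. A minimal sketch of such a callback, assuming a single-label classifier whose generator yields (inputs, labels) batches (the project's real Evaluator may differ):

import numpy as np
from sklearn.metrics import f1_score
from bert4keras.backend import keras


class Evaluator(keras.callbacks.Callback):
    """Compute macro-F1 on the validation set at the end of each epoch
    and inject it into logs['val_f1'] for downstream callbacks."""

    def __init__(self, valid_generator):
        super(Evaluator, self).__init__()
        self.valid_generator = valid_generator

    def on_epoch_end(self, epoch, logs=None):
        y_true, y_pred = [], []
        for x_batch, y_batch in self.valid_generator:  # one pass per epoch
            y_pred.extend(self.model.predict(x_batch).argmax(axis=1))
            y_true.extend(np.asarray(y_batch).reshape(-1))
        logs = logs if logs is not None else {}
        logs['val_f1'] = f1_score(y_true, y_pred, average='macro')
        print('val_f1: %.5f' % logs['val_f1'])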
Example #2
def run_cv(nfolds,
           data,
           data_label,
           data_test,
           data_valid,
           epochs=10,
           date_str='1107'):
    skf = KFold(n_splits=nfolds, shuffle=True, random_state=48).split(data)
    #     train_model_pred = np.zeros((len(data), n_class))
    #     test_model_pred = np.zeros((len(data_test), n_class))

    for k, (train_fold, test_fold) in enumerate(skf):
        print('Fold: ', k)
        if k != 10:  # debug switch for skipping folds; always true here
            # if k in [0,1,2]:
            '''Data'''
            # split the rows into this fold's train/valid sets
            train_data, valid_data = [
                tuple(i) for i in list(np.array(data)[train_fold])
            ], [tuple(i) for i in list(np.array(data)[test_fold])]
            '''Model'''
            # build the BERT+CRF model
            model, CRF = build_bert(num_labels)
            file_path = date_str + str(k) + '.weights'

            evaluator = Evaluator(valid_data, model, CRF, file_path)
            if not os.path.exists(file_path):
                train_generator = data_generator(train_data, batch_size)
                valid_generator = data_generator(valid_data, batch_size)

                model.fit_generator(train_generator.forfit(),
                                    steps_per_epoch=len(train_generator),
                                    validation_data=valid_generator.forfit(),
                                    validation_steps=len(valid_generator),
                                    epochs=epochs,
                                    verbose=1,
                                    callbacks=[evaluator])
                model.load_weights(file_path)
            else:
                model.load_weights(file_path)
            data_test['submit_' + str(k)] = data_test['context'].apply(
                lambda x: extract_arguments(x, model, CRF))
            print(data_test['submit_' + str(k)])
            # count of test rows with at least one extracted argument
            print('Fold %d non-empty:' % k,
                  sum(data_test['submit_' + str(k)].apply(len) > 0))
            #         data_valid['submit_'+str(k)] = data_valid['content'].apply(lambda x:extract_arguments(x,model,CRF))
            del model
            del CRF
            gc.collect()
            K.clear_session()
    return data_test, data_valid
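The per-fold predictions stored in the submit_k columns still have to be merged into a final answer. A hypothetical sketch, assuming each cell holds a list of hashable extractions as produced by extract_arguments (merge_folds and min_votes are illustrative names, not from the source):

from collections import Counter


def merge_folds(row, nfolds=10, min_votes=6):
    """Keep an extraction only if at least min_votes of the
    nfolds fold models produced it (simple majority voting)."""
    votes = Counter()
    for k in range(nfolds):
        votes.update(row['submit_%d' % k])
    return [item for item, c in votes.items() if c >= min_votes]


# data_test['submit_final'] = data_test.apply(merge_folds, axis=1)

Example #3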
def do_train(df_train):
    skf = StratifiedKFold(n_splits=n, random_state=SEED, shuffle=True)
    for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train['text'], df_train['label']), 1):
        print(f'Fold {fold}')

        train_data = load_data(df_train.iloc[trn_idx])
        valid_data = load_data(df_train.iloc[val_idx])

        train_generator = data_generator(train_data, batch_size, random=True)
        valid_generator = data_generator(valid_data, batch_size)

        model = build_model()

        adversarial_training(model, 'Embedding-Token', 0.5)

        callbacks = [
            Evaluator(valid_generator),
            EarlyStopping(
                monitor='val_f1',
                patience=5,
                verbose=1,
                mode='max'),
            ReduceLROnPlateau(
                monitor='val_f1',
                factor=0.5,
                patience=2,
                verbose=1,
                mode='max'),
            ModelCheckpoint(
                f'weights-{fold}.h5',
                monitor='val_f1',
                save_weights_only=True,
                save_best_only=True,
                verbose=1,
                mode='max'),
        ]

        model.fit_generator(
            train_generator.forfit(),
            steps_per_epoch=len(train_generator),
            epochs=epochs,
            callbacks=callbacks,
            validation_data=valid_generator.forfit(),
            validation_steps=len(valid_generator)
        )

        del model
        K.clear_session()
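Each fold leaves behind a weights-{fold}.h5 checkpoint, so test-time inference can average the fold models. A minimal sketch, reusing build_model and K from the snippet above; predict_ensemble is an illustrative name, not from the source:

import numpy as np


def predict_ensemble(test_generator, n_folds):
    """Average class probabilities over the per-fold checkpoints
    and return the argmax class ids."""
    probs = None
    for fold in range(1, n_folds + 1):
        model = build_model()
        model.load_weights(f'weights-{fold}.h5')
        preds = []
        for x_batch, _ in test_generator:  # one ordered pass over the test set
            preds.append(model.predict(x_batch))
        p = np.concatenate(preds, axis=0)
        probs = p if probs is None else probs + p
        del model
        K.clear_session()  # free the graph before the next fold's model
    return (probs / n_folds).argmax(axis=1)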
Example #4
def do_train2(df_train):
    from sklearn.model_selection import train_test_split
    X_train, X_valid, y_train, y_valid = train_test_split(
        df_train['text'].values,
        df_train['label'].values,
        shuffle=True,
        test_size=0.1,
        random_state=2021,
        stratify=df_train['label'].values)

    train_data = load_data2(X_train, y_train)
    valid_data = load_data2(X_valid, y_valid)

    train_generator = data_generator(train_data, batch_size, random=True)
    valid_generator = data_generator(valid_data, batch_size)

    model = build_model()

    adversarial_training(model, 'Embedding-Token', 0.5)

    fold = 0
    callbacks = [
        Evaluator(valid_generator),
        EarlyStopping(monitor='val_f1', patience=5, verbose=1, mode='max'),
        ReduceLROnPlateau(monitor='val_f1',
                          factor=0.5,
                          patience=2,
                          verbose=1,
                          mode='max'),
        ModelCheckpoint(f'weights-{fold}.h5',
                        monitor='val_f1',
                        save_weights_only=True,
                        save_best_only=True,
                        verbose=1,
                        mode='max'),
    ]

    model.fit_generator(train_generator.forfit(),
                        steps_per_epoch=len(train_generator),
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=valid_generator.forfit(),
                        validation_steps=len(valid_generator))

    del model
    K.clear_session()
Example #5
def do_train(mode='bert', filename='roberta', lastfour=False, LR=1e-5, DR=0.2, ext=False, batch_size=16):

    skf = StratifiedKFold(5, shuffle=True, random_state=2020)
    nfold = 1

    if ext:
        data = np.concatenate([train_ext_data, valid_data], axis=0)
    else:
        data = np.concatenate([train_data, valid_data], axis=0)

    for train_index, valid_index in skf.split(data[:, :2], data[:, 2:].astype('int')):
        train = data[train_index, :]
        valid = data[valid_index, :]

        train_generator = data_generator(train, batch_size)
        valid_generator = data_generator(valid, batch_size)

        model = build_model(mode=mode, filename=filename, lastfour=lastfour, LR=LR, DR=DR)

        # enabling adversarial training takes just one line of code
        adversarial_training(model, 'Embedding-Token', 0.5)

        early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

        if ext:
            checkpoint = ModelCheckpoint('../user_data/model_data/' + filename + '_weights/' + str(nfold) + '_ext.weights',
                                         monitor='val_loss', save_weights_only=True, save_best_only=True, verbose=1)
        else:
            checkpoint = ModelCheckpoint('../user_data/model_data/' + filename + '_weights/' + str(nfold) + '.weights',
                                         monitor='val_loss', save_weights_only=True, save_best_only=True, verbose=1)

        model.fit_generator(train_generator.forfit(),
                            steps_per_epoch=len(train_generator),
                            epochs=5,
                            validation_data=valid_generator.forfit(),
                            validation_steps=len(valid_generator),
                            callbacks=[early_stopping, checkpoint],
                            verbose=2,
                            )

        del model
        K.clear_session()
        nfold += 1
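adversarial_training is not a Keras built-in; in these snippets it comes from Su Jianlin's bert4keras example code, which implements FGM (Fast Gradient Method): before each update, the token-embedding matrix is perturbed along the gradient of the loss, and the perturbation is removed again afterwards. A condensed sketch of that function, assuming old-style (TF1-era) Keras training internals and search_layer from bert4keras.backend:

import numpy as np
from bert4keras.backend import K, search_layer


def adversarial_training(model, embedding_name, epsilon=1):
    """Add FGM adversarial training to a compiled Keras model.
    embedding_name is the name of the model's Embedding layer."""
    if model.train_function is None:  # build the train function if needed
        model._make_train_function()
    old_train_function = model.train_function  # keep the original step

    # locate the embedding layer by name
    for output in model.outputs:
        embedding_layer = search_layer(output, embedding_name)
        if embedding_layer is not None:
            break
    if embedding_layer is None:
        raise Exception('Embedding layer not found')

    embeddings = embedding_layer.embeddings  # the embedding matrix
    gradients = K.gradients(model.total_loss, [embeddings])
    gradients = K.zeros_like(embeddings) + gradients[0]  # densify

    inputs = (
        model._feed_inputs + model._feed_targets + model._feed_sample_weights
    )
    embedding_gradients = K.function(
        inputs=inputs, outputs=[gradients], name='embedding_gradients'
    )

    def train_function(inputs):  # replacement train step
        grads = embedding_gradients(inputs)[0]
        delta = epsilon * grads / (np.sqrt((grads ** 2).sum()) + 1e-8)
        K.set_value(embeddings, K.eval(embeddings) + delta)  # inject
        outputs = old_train_function(inputs)  # normal gradient step
        K.set_value(embeddings, K.eval(embeddings) - delta)  # restore
        return outputs

    model.train_function = train_function  # override the train step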
Example #6
#! -*- coding:utf-8 -*-
# Sentiment analysis example, loading albert_zh weights (https://github.com/brightmart/albert_zh)
import datetime

from bert4keras.backend import keras, set_gelu, K, tf
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator, to_array
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
from config.pathconfig.dirconfig import albert_dir, data_dir, model_dir

K.clear_session()
tf.random.set_seed(2020)
set_gelu('tanh')  # switch the gelu implementation

num_classes = 2
maxlen = 128
batch_size = 64
epochs = 10
config_path = albert_dir + '/albert_config.json'
checkpoint_path = albert_dir + '/model.ckpt-best'
dict_path = albert_dir + '/vocab_chinese.txt'


def load_data(filename):
    """Load tab-separated (text, label) pairs, one example per line."""
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D
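The snippet breaks off here; the canonical bert4keras sentiment example that these imports belong to continues roughly as below. This is a reconstruction sketch, not the original file's exact content:

tokenizer = Tokenizer(dict_path, do_lower_case=True)


class data_generator(DataGenerator):
    """Batch (text, label) pairs into padded token/segment-id arrays."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# ALBERT encoder with a [CLS]-pooled softmax head
bert = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='albert',
    return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(units=num_classes, activation='softmax')(output)
model = keras.models.Model(bert.model.input, output)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),
    metrics=['accuracy'],
)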
    def albert_model(self):
        K.clear_session()  # drop the current graph before rebuilding
        del self._model