def build_transformer_model_for_pretraining():
    """Build the training model; works on both TPU and GPU.

    Note: standard Keras layer usage is required throughout; some of the
    more flexible "cut-and-paste" style constructions may fail to train
    on TPU. Also, TPUs do not support every TensorFlow op, in particular
    dynamic (variable-length) ops, so take extra care when writing such
    computations.
    """
    bert, train_model, loss = build_transformer_model_with_mlm()

    # Optimizer: Adam with weight decay, optionally LAMB-style layer
    # adaptation, plus a piecewise-linear learning-rate schedule.
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Compile the model
    train_model.compile(loss=loss, optimizer=optimizer)

    # If a checkpoint path is given, load it. Note: the weights must be
    # loaded here (after compiling) to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
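# The function above reads its hyperparameters from module-level globals
# that are not shown in this snippet. A minimal sketch of what that
# configuration might look like; every value below is a hypothetical
# placeholder following bert4keras conventions.
which_optimizer = 'lamb'        # 'adam' or 'lamb'
learning_rate = 1e-4
# Piecewise-linear schedule, {step: multiplier}: warm up to 1.0x the base
# learning rate by step 10000, then decay linearly to 0.1x by step 1000000.
lr_schedule = {10000: 1.0, 1000000: 0.1}
weight_decay_rate = 0.01
exclude_from_weight_decay = ['Norm', 'bias']
grad_accum_steps = 4            # >1 enables gradient accumulation
checkpoint_path = None          # or the path to an existing BERT checkpoint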
def build_model():
    """Build the classification model."""
    token_ids = Input(shape=(max_segment, maxlen), dtype='int32')
    segment_ids = Input(shape=(max_segment, maxlen), dtype='int32')

    # Mask out all-padding segments: K.any() first reduces over the token
    # axis, then the boolean result is cast to float32. The Lambda takes a
    # Keras tensor and returns a Keras tensor of the new dtype.
    input_mask = Masking(mask_value=0)(token_ids)
    input_mask = Lambda(
        lambda x: K.cast(K.any(x, axis=2, keepdims=True), 'float32')
    )(input_mask)

    # Reshape: merge the batch and segment axes so that each segment
    # passes through BERT as an independent sequence.
    token_ids1 = Lambda(lambda x: K.reshape(x, shape=(-1, maxlen)))(token_ids)
    segment_ids1 = Lambda(lambda x: K.reshape(x, shape=(-1, maxlen)))(segment_ids)

    # Load the pretrained model
    bert = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        return_keras_model=False,
    )

    output = bert.model([token_ids1, segment_ids1])
    output = Lambda(lambda x: x[:, 0])(output)  # take the [CLS] vector of each segment
    hidden_size = K.int_shape(output)[-1]  # portable alternative to .shape[-1].value

    # Restore the (batch, max_segment, hidden) shape
    output = Lambda(
        lambda x: K.reshape(x, shape=(-1, max_segment, hidden_size))
    )(output)
    # Zero out padded segments (elementwise multiply by the mask; shape unchanged)
    output = Multiply()([output, input_mask])
    output = Dropout(drop)(output)
    output = Attention(hidden_size)([output, input_mask])  # attention pooling over segments
    output = Dropout(drop)(output)
    # Fully connected classification head
    output = Dense(units=num_classes,
                   activation='softmax',
                   kernel_initializer=bert.initializer)(output)

    model = keras.models.Model([token_ids, segment_ids], output)

    # Optimizer with gradient accumulation
    optimizer_params = {
        'learning_rate': lr,
        'grad_accum_steps': grad_accum_steps
    }
    optimizer = extend_with_gradient_accumulation(Adam)
    optimizer = optimizer(**optimizer_params)

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['sparse_categorical_accuracy'],
    )
    return model
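# The Attention layer called in build_model() is a custom pooling layer
# that is not defined in this snippet. As a rough illustration only, here
# is a minimal sketch of masked additive attention pooling; the class
# name and internals are assumptions, not the repo's actual layer.
from tensorflow import keras
from tensorflow.keras import backend as K


class AttentionPooling(keras.layers.Layer):
    """Scores each segment vector, masks out padding, and returns the
    attention-weighted sum over segments."""
    def __init__(self, hidden_size, **kwargs):
        super(AttentionPooling, self).__init__(**kwargs)
        self.proj = keras.layers.Dense(hidden_size, activation='tanh')
        self.score = keras.layers.Dense(1, use_bias=False)

    def call(self, inputs):
        x, mask = inputs                       # x: (B, S, H), mask: (B, S, 1)
        scores = self.score(self.proj(x))      # (B, S, 1)
        scores = scores - (1.0 - mask) * 1e12  # suppress padded segments
        weights = K.softmax(scores, axis=1)
        return K.sum(weights * x, axis=1)      # (B, H)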
def build_train_bert_model():
    """Build the training model; works on both TPU and GPU.

    Note: standard Keras layer usage is required throughout; some of the
    more flexible "cut-and-paste" style constructions may fail to train
    on TPU. Also, TPUs do not support every TensorFlow op, in particular
    dynamic (variable-length) ops, so take extra care when writing such
    computations.
    """
    bert = build_bert_model(config_path,
                            with_mlm='linear',
                            application='lm',
                            return_keras_model=False)
    token_ids = bert.model.input[0]
    proba = bert.model.output

    def lm_loss(inputs):
        """Compute the LM loss; must be wrapped in a layer.

        Shift-by-one convention: position i of y_pred predicts token i+1
        of y_true, and padded positions are masked out of the average.
        """
        y_true, y_pred, mask = inputs
        y_true = y_true[:, 1:]
        y_pred = y_pred[:, :-1]
        mask = mask[:, 1:]
        loss = K.sparse_categorical_crossentropy(y_true, y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def lm_acc(inputs):
        """Compute the LM accuracy; must be wrapped in a layer."""
        y_true, y_pred, mask = inputs
        y_true = K.cast(y_true, K.floatx())
        y_true = y_true[:, 1:]
        y_pred = y_pred[:, :-1]
        mask = mask[:, 1:]
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    mask = bert.model.get_layer('Sequence-Mask').output
    loss = Lambda(lm_loss, name='lm_loss')([token_ids, proba, mask])
    acc = Lambda(lm_acc, name='lm_acc')([token_ids, proba, mask])
    train_model = Model(bert.model.inputs, [loss, acc])

    # Optimizer: Adam with weight decay, optionally LAMB-style layer
    # adaptation, plus a piecewise-linear learning-rate schedule.
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Compile. The loss and accuracy are already computed inside the
    # graph, so the "loss functions" here simply pass the layer outputs
    # through; stop_gradient keeps the accuracy head out of backprop.
    train_model.compile(
        loss={
            'lm_loss': lambda y_true, y_pred: y_pred,
            'lm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # If a checkpoint path is given, load it. Note: the weights must be
    # loaded here (after compiling) to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
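# Because lm_loss and lm_acc are computed inside the graph and merely
# passed through at compile time, the targets fed to fit() are dummies.
# A hypothetical smoke-test call (all shapes and data are placeholders;
# whether the dummy target shapes must match the scalar outputs exactly
# depends on the Keras version):
import numpy as np

train_model = build_train_bert_model()

batch_size, seq_len = 8, 128
token_ids = np.random.randint(1, 1000, size=(batch_size, seq_len))
segment_ids = np.zeros_like(token_ids)
dummy = np.zeros((batch_size,))  # ignored by the pass-through losses

train_model.fit([token_ids, segment_ids],
                {'lm_loss': dummy, 'lm_acc': dummy},
                epochs=1)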
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='nezha',
    application='lm',
    keep_tokens=keep_tokens,          # keep only the tokens in keep_tokens (trimmed vocab)
    compound_tokens=compound_tokens,  # extra tokens to add to the vocab
)

output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])

model = Model(model.inputs, output)
model.summary()

AdamW = extend_with_weight_decay(Adam, 'AdamW')
AdamWG = extend_with_gradient_accumulation(AdamW, 'AdamWG')
optimizer = AdamWG(learning_rate=2e-5,
                   weight_decay_rate=0.01,
                   exclude_from_weight_decay=['Norm', 'bias'],
                   grad_accum_steps=16)
model.compile(optimizer=optimizer)


class ChatBot(AutoRegressiveDecoder):
    """Chatbot based on random sampling.
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        curr_segment_ids = np.ones_like(output_ids) - segment_ids[0, -1]
        # The original snippet is truncated here; the assumed standard
        # continuation extends the segment ids in step with the generated
        # tokens and returns the last-step token probabilities.
        segment_ids = np.concatenate([segment_ids, curr_segment_ids], 1)
        return model.predict([token_ids, segment_ids])[:, -1]
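# The CrossEntropy layer used above is not defined in this snippet. In
# the bert4keras examples it is conventionally a subclass of the Loss
# layer, whose constructor argument (here 1) is the index of the input
# to pass through as the layer's output. A sketch along those lines;
# the masking details are assumptions:
from bert4keras.backend import K
from bert4keras.layers import Loss


class CrossEntropy(Loss):
    """In-graph LM cross-entropy with the shift-by-one convention:
    position i of the prediction is scored against token i+1."""
    def compute_loss(self, inputs, mask=None):
        y_true, y_pred = inputs
        y_mask = K.cast(K.greater(y_true, 0), K.floatx())[:, 1:]  # skip padding
        y_true = y_true[:, 1:]    # target: the next token
        y_pred = y_pred[:, :-1]   # prediction: the current step
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        return K.sum(loss * y_mask) / K.sum(y_mask)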
def build_train_bert_model():
    """Build the training model; works on both TPU and GPU.

    Note: standard Keras layer usage is required throughout; some of the
    more flexible "cut-and-paste" style constructions may fail to train
    on TPU. Also, TPUs do not support every TensorFlow op, in particular
    dynamic (variable-length) ops, so take extra care when writing such
    computations.
    """
    bert = build_bert_model(config_path,
                            with_mlm='linear',
                            return_keras_model=False)
    bert_model = bert.model
    proba = bert_model.output

    # Auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None,), dtype='bool', name='is_masked')   # mask flags

    def mlm_loss(inputs):
        """Compute the MLM loss; must be wrapped in a layer.

        Only the masked positions contribute to the average.
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true, y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * is_masked) / (K.sum(is_masked) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """Compute the MLM accuracy; must be wrapped in a layer."""
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * is_masked) / (K.sum(is_masked) + K.epsilon())
        return acc

    loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    train_model = Model(bert_model.inputs + [token_ids, is_masked],
                        [loss, acc])

    # Optimizer: Adam with weight decay, optionally LAMB-style layer
    # adaptation, plus a piecewise-linear learning-rate schedule.
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Compile. The loss and accuracy are already computed inside the
    # graph, so the "loss functions" here simply pass the layer outputs
    # through; stop_gradient keeps the accuracy head out of backprop.
    train_model.compile(
        loss={
            'mlm_loss': lambda y_true, y_pred: y_pred,
            'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # If a checkpoint path is given, load it. Note: the weights must be
    # loaded here (after compiling) to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
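# A hypothetical illustration of the four inputs this training model
# expects (values are placeholders; 103 is [MASK] in the standard BERT
# vocabulary):
import numpy as np

orig_ids = np.array([[101, 2769, 812, 102]])         # target ids (y_true)
is_masked = np.array([[False, True, False, False]])  # which positions were masked
masked_ids = orig_ids.copy()
masked_ids[is_masked] = 103                          # replace with [MASK]
segment_ids = np.zeros_like(orig_ids)

# BERT's own inputs come first, then the two auxiliary inputs.
inputs = [masked_ids, segment_ids, orig_ids, is_masked]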
-------------------------------------------------
Change Activity:
    2021/3/30:
-------------------------------------------------
"""
from tensorflow.keras.optimizers import (Adadelta, Adagrad, Adamax, Nadam,
                                         RMSprop, SGD, Adam)
from bert4keras.optimizers import (
    extend_with_exponential_moving_average,
    extend_with_piecewise_linear_lr,
    extend_with_gradient_accumulation,
)

# Adam with an exponential moving average of the weights
AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
# Adam with a piecewise-linear learning-rate schedule
AdamLR = extend_with_piecewise_linear_lr(Adam, 'AdamLR')
# Adam with gradient accumulation
AdamAcc = extend_with_gradient_accumulation(Adam, 'AdamAcc')
# Adam with gradient accumulation and a piecewise-linear schedule
AdamAccLR = extend_with_piecewise_linear_lr(AdamAcc, 'AdamAccLR')


class OptimizerFactory:
    _BUILDERS = {
        'sgd': SGD,
        'rmsprop': RMSprop,
        'adagrad': Adagrad,
        'adadelta': Adadelta,
        'adam': Adam,
        'adamax': Adamax,
        'nadam': Nadam,
        'adamema': AdamEMA,
        'adam_lr': AdamLR,
        # Assumed remainder of the truncated mapping, completed from the
        # optimizers defined above:
        'adam_acc': AdamAcc,
        'adam_acc_lr': AdamAccLR,
    }
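# Hypothetical usage of the factory mapping; the truncated class
# presumably exposes a builder method, but the mapping can be used
# directly for illustration. lr_schedule follows bert4keras's
# {step: multiplier} convention: warm up to the full learning rate by
# step 1000, then decay linearly to 10% of it by step 10000.
opt_cls = OptimizerFactory._BUILDERS['adam_lr']
optimizer = opt_cls(
    learning_rate=1e-3,
    lr_schedule={1000: 1.0, 10000: 0.1},
)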
# Assumed opening of the truncated snippet: the call below originally
# began before this excerpt; config_path points to the model's config.
bert = build_transformer_model(
    config_path,
    model='roformer',
    with_mlm='linear',
    ignore_invalid_weights=True,
    return_keras_model=False
)
model = bert.model  # base model used for training

# Training model: feed the labels in as an extra input and compute the
# loss inside the graph.
y_in = keras.layers.Input(shape=(None,), name='Input-Label')
outputs = CrossEntropy(1)([y_in, model.output])

train_model = keras.models.Model(model.inputs + [y_in], outputs)

AdamW = extend_with_weight_decay(Adam, name='AdamW')
AdamWLR = extend_with_piecewise_linear_lr(AdamW, name='AdamWLR')
AdamWLRG = extend_with_gradient_accumulation(AdamWLR, name='AdamWLRG')
optimizer = AdamWLRG(
    learning_rate=1e-5,
    weight_decay_rate=0.01,
    exclude_from_weight_decay=['Norm', 'bias'],
    grad_accum_steps=4,
    lr_schedule={20000: 1}
)
train_model.compile(optimizer=optimizer)
train_model.summary()
bert.load_weights_from_checkpoint(checkpoint_path)


class Evaluator(keras.callbacks.Callback):
    """Training callback.
    """
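# The Evaluator body is truncated above. As a sketch of what such a
# bert4keras callback typically does (the save path is a placeholder):
from tensorflow import keras


class EvaluatorSketch(keras.callbacks.Callback):
    """Checkpoints the latest weights at the end of every epoch."""
    def __init__(self, save_path='latest_model.weights'):
        super(EvaluatorSketch, self).__init__()
        self.save_path = save_path

    def on_epoch_end(self, epoch, logs=None):
        self.model.save_weights(self.save_path)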