Example #1
def build_transformer_model_for_pretraining():
    """构建训练模型,通用于TPU/GPU
    注意全程要用keras标准的层写法,一些比较灵活的“移花接木”式的
    写法可能会在TPU上训练失败。此外,要注意的是TPU并非支持所有
    tensorflow算子,尤其不支持动态(变长)算子,因此编写相应运算
    时要格外留意。
    """
    bert, train_model, loss = build_transformer_model_with_mlm()

    # Optimizer
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Compile the model
    train_model.compile(loss=loss, optimizer=optimizer)

    # If a checkpoint path was given, load the weights. Note: they must be loaded here to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
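
The docstring's TPU caveat is easiest to see in a concrete contrast. A minimal sketch (the helper names are hypothetical, not from this codebase): tf.boolean_mask produces a dynamic output shape that the TPU compiler typically rejects, while the fixed-shape multiply-and-sum form computes the same masked mean and compiles fine.

import tensorflow as tf

def masked_mean_dynamic(x, mask):
    # dynamic output shape -- usually fails to compile on TPU
    return tf.reduce_mean(tf.boolean_mask(x, mask))

def masked_mean_static(x, mask):
    # fixed shapes throughout -- the TPU-friendly equivalent
    mask = tf.cast(mask, x.dtype)
    return tf.reduce_sum(x * mask) / (tf.reduce_sum(mask) + 1e-8)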
Example #2
def build_model():
    """ 模型构建 """
    token_ids = Input(shape=(max_segment, maxlen), dtype='int32')
    segment_ids = Input(shape=(max_segment, maxlen), dtype='int32')

    input_mask = Masking(mask_value=0)(token_ids)  # mask zero-padded token_ids
    # K.any() reduces over the token axis first, then the result is cast;
    # casting works on a Keras variable and still returns a Keras tensor
    input_mask = Lambda(lambda x: K.cast(K.any(x, axis=2, keepdims=True),
                                         'float32'))(input_mask)

    # Reshape: merge the batch and segment dimensions into one
    token_ids1 = Lambda(lambda x: K.reshape(x, shape=(-1, maxlen)))(token_ids)
    segment_ids1 = Lambda(lambda x: K.reshape(x, shape=(-1, maxlen)))(
        segment_ids)

    # Load the pretrained model
    bert = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        return_keras_model=False,
    )
    output = bert.model([token_ids1, segment_ids1])
    output = Lambda(lambda x: x[:, 0])(output)  # take the [CLS] vector (first token)
    # Reshape back to (batch, max_segment, hidden)
    output = Lambda(lambda x: K.reshape(
        x, shape=(-1, max_segment, output.shape[-1])))(output)
    output = Multiply()([output, input_mask])  # zero out padded segments (elementwise; shape unchanged)
    output = Dropout(drop)(output)

    output = Attention(output.shape[-1].value)([output, input_mask])  # apply attention over segments
    output = Dropout(drop)(output)
    # Fully connected classification layer
    output = Dense(units=num_classes,
                   activation='softmax',
                   kernel_initializer=bert.initializer)(output)

    model = keras.models.Model([token_ids, segment_ids], output)
    # Multi-GPU setup would go here
    # Optimizer and its parameters
    optimizer_params = {
        'learning_rate': lr,
        'grad_accum_steps': grad_accum_steps
    }

    optimizer = extend_with_gradient_accumulation(Adam)  # add gradient accumulation
    optimizer = optimizer(**optimizer_params)

    # multi gpu

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['sparse_categorical_accuracy'],
    )

    return model
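
To make the reshape trick above concrete, here is a pure-numpy walk-through of the tensor shapes (batch=2, max_segment=4, maxlen=128, hidden=768 are illustrative assumptions, not values from this code):

import numpy as np

batch, max_segment, maxlen, hidden = 2, 4, 128, 768
token_ids = np.zeros((batch, max_segment, maxlen), dtype='int32')
flat = token_ids.reshape(-1, maxlen)             # (8, 128): one BERT row per segment
cls = np.zeros((flat.shape[0], hidden))          # (8, 768): the [CLS] vector of each segment
restored = cls.reshape(-1, max_segment, hidden)  # (2, 4, 768): back to per-document layout
print(flat.shape, cls.shape, restored.shape)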
Example #3
def build_model():
    """构建模型。"""
    token_ids = Input(shape=(max_segment, maxlen), dtype='int32')
    segment_ids = Input(shape=(max_segment, maxlen), dtype='int32')

    input_mask = Masking(mask_value=0)(token_ids)
    input_mask = Lambda(
        lambda x: K.cast(K.any(x, axis=2, keepdims=True), 'float32')
    )(input_mask)

    token_ids1 = Lambda(
        lambda x: K.reshape(x, shape=(-1, maxlen))
    )(token_ids)
    segment_ids1 = Lambda(
        lambda x: K.reshape(x, shape=(-1, maxlen))
    )(segment_ids)

    # Load the pretrained model
    bert = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        return_keras_model=False,
    )
    output = bert.model([token_ids1, segment_ids1])
    output = Lambda(lambda x: x[:, 0])(output)
    output = Lambda(
        lambda x: K.reshape(x, shape=(-1, max_segment, output.shape[-1]))
    )(output)
    output = Multiply()([output, input_mask])
    output = Dropout(drop)(output)

    output = Attention(output.shape[-1].value)([output, input_mask])
    output = Dropout(drop)(output)

    output = Dense(
        units=num_classes,
        activation='softmax',
        kernel_initializer=bert.initializer
    )(output)

    model = keras.models.Model([token_ids, segment_ids], output)

    optimizer_params = {
        'learning_rate': lr,
        'grad_accum_steps': grad_accum_steps
    }
    optimizer = extend_with_gradient_accumulation(Adam)
    optimizer = optimizer(**optimizer_params)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['sparse_categorical_accuracy'],
    )

    return model
Example #4
def build_train_bert_model():
    """构建训练模型,通用于TPU/GPU
    注意全程要用keras标准的层写法,一些比较灵活的“移花接木”式的
    写法可能会在TPU上训练失败。此外,要注意的是TPU并非支持所有
    tensorflow算子,尤其不支持动态(变长)算子,因此编写相应运算
    时要格外留意。
    """
    bert = build_bert_model(config_path,
                            with_mlm='linear',
                            application='lm',
                            return_keras_model=False)
    token_ids = bert.model.input[0]
    proba = bert.model.output

    def lm_loss(inputs):
        """计算loss的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        y_true = y_true[:, 1:]
        y_pred = y_pred[:, :-1]
        mask = mask[:, 1:]
        loss = K.sparse_categorical_crossentropy(y_true,
                                                 y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def lm_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        y_true = K.cast(y_true, K.floatx())
        y_true = y_true[:, 1:]
        y_pred = y_pred[:, :-1]
        mask = mask[:, 1:]
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    mask = bert.model.get_layer('Sequence-Mask').output
    loss = Lambda(lm_loss, name='lm_loss')([token_ids, proba, mask])
    acc = Lambda(lm_acc, name='lm_acc')([token_ids, proba, mask])

    train_model = Model(bert.model.inputs, [loss, acc])

    # Optimizer
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Compile the model
    train_model.compile(
        loss={
            'lm_loss': lambda y_true, y_pred: y_pred,
            'lm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # If a checkpoint path was given, load the weights. Note: they must be loaded here to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
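
Because lm_loss and lm_acc are computed inside the graph, the compile call above uses identity "losses" that merely forward the model outputs, and the targets passed to fit are placeholders. A hedged usage sketch (batch size, sequence length, and vocab range are assumptions):

import numpy as np

# assumes train_model = build_train_bert_model() has already been called
batch, seqlen = 8, 128
x = [np.random.randint(1, 1000, (batch, seqlen)),  # token ids
     np.zeros((batch, seqlen), dtype='int32')]     # segment ids
dummy = np.zeros((batch, 1))  # ignored by the identity losses
train_model.fit(x, {'lm_loss': dummy, 'lm_acc': dummy}, epochs=1)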
Example #5
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='nezha',
    application='lm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocab
    compound_tokens=compound_tokens,  # extra tokens to add to the vocab
)

output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])

model = Model(model.inputs, output)
model.summary()

AdamW = extend_with_weight_decay(Adam, 'AdamW')
AdamWG = extend_with_gradient_accumulation(AdamW, 'AdamWG')
optimizer = AdamWG(learning_rate=2e-5,
                   weight_decay_rate=0.01,
                   exclude_from_weight_decay=['Norm', 'bias'],
                   grad_accum_steps=16)
model.compile(optimizer=optimizer)


class ChatBot(AutoRegressiveDecoder):
    """基于随机采样对话机器人
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        curr_segment_ids = np.ones_like(output_ids) - segment_ids[0, -1]
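
One practical note on the AdamWG settings above: with grad_accum_steps=16 the optimizer applies a weight update only once every 16 batches, so the effective batch size is 16x whatever the data pipeline yields (the pipeline batch size below is an illustrative assumption):

pipeline_batch_size = 8  # assumption: whatever the data generator yields
grad_accum_steps = 16    # from the optimizer above
print(pipeline_batch_size * grad_accum_steps)  # effective batch size: 128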
Example #6
def build_train_bert_model():
    """构建训练模型,通用于TPU/GPU
    注意全程要用keras标准的层写法,一些比较灵活的“移花接木”式的
    写法可能会在TPU上训练失败。此外,要注意的是TPU并非支持所有
    tensorflow算子,尤其不支持动态(变长)算子,因此编写相应运算
    时要格外留意。
    """
    bert = build_bert_model(config_path, with_mlm='linear', return_keras_model=False)
    bert_model = bert.model
    proba = bert_model.output

    # Auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype='bool', name='is_masked')  # mask flags

    def mlm_loss(inputs):
        """计算loss的函数,需要封装为一个层
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * is_masked) / (K.sum(is_masked) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * is_masked) / (K.sum(is_masked) + K.epsilon())
        return acc

    loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])

    train_model = Model(bert_model.inputs + [token_ids, is_masked], [loss, acc])

    # Optimizer
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Compile the model
    train_model.compile(
        loss={
            'mlm_loss': lambda y_true, y_pred: y_pred,
            'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # If a checkpoint path was given, load the weights. Note: they must be loaded here to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
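
The auxiliary token_ids/is_masked inputs above carry the original tokens (the targets) and a flag marking which positions enter the loss. A sketch of how such inputs are typically built (the 15% rate and the [MASK] id are standard BERT conventions assumed here, not read from this code):

import numpy as np

MASK_ID = 103  # assumption: the [MASK] token id
tokens = np.random.randint(5, 1000, (2, 16))      # original token ids (targets)
is_masked = np.random.rand(*tokens.shape) < 0.15  # ~15% of positions enter the loss
corrupted = np.where(is_masked, MASK_ID, tokens)  # what the model actually sees
# feed `corrupted` to the BERT inputs, and `tokens` plus `is_masked`
# to the auxiliary inputs of train_model above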
Example #7
-------------------------------------------------
   Change Activity:
                   2021/3/30:
-------------------------------------------------
"""

from tensorflow.keras.optimizers import (Adadelta, Adagrad, Adamax, Nadam,
                                         RMSprop, SGD, Adam)
from bert4keras.optimizers import extend_with_exponential_moving_average, extend_with_piecewise_linear_lr, \
    extend_with_gradient_accumulation

# Adam with an exponential moving average of the weights
AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
# Adam with a piecewise-linear learning-rate schedule
AdamLR = extend_with_piecewise_linear_lr(Adam, 'AdamLR')
# Adam with gradient accumulation
AdamAcc = extend_with_gradient_accumulation(Adam, 'AdamAcc')
# Adam with gradient accumulation and a piecewise-linear learning rate
AdamAccLR = extend_with_piecewise_linear_lr(AdamAcc, 'AdamAccLR')


class OptimizerFactory:
    _BUILDERS = {
        'sgd': SGD,
        'rmsprop': RMSprop,
        'adagrad': Adagrad,
        'adadelta': Adadelta,
        'adam': Adam,
        'adamax': Adamax,
        'nadam': Nadam,
        "adamema": AdamEMA,
        "adam_lr": AdamLR,
Example #8
        model='roformer',
        with_mlm='linear',
        ignore_invalid_weights=True,
        return_keras_model=False
    )
    model = bert.model

    # Training model
    y_in = keras.layers.Input(shape=(None,), name='Input-Label')
    outputs = CrossEntropy(1)([y_in, model.output])

    train_model = keras.models.Model(model.inputs + [y_in], outputs)

    AdamW = extend_with_weight_decay(Adam, name='AdamW')
    AdamWLR = extend_with_piecewise_linear_lr(AdamW, name='AdamWLR')
    AdamWLRG = extend_with_gradient_accumulation(AdamWLR, name='AdamWLRG')
    optimizer = AdamWLRG(
        learning_rate=1e-5,
        weight_decay_rate=0.01,
        exclude_from_weight_decay=['Norm', 'bias'],
        grad_accum_steps=4,
        lr_schedule={20000: 1}
    )
    train_model.compile(optimizer=optimizer)
    train_model.summary()
    bert.load_weights_from_checkpoint(checkpoint_path)


class Evaluator(keras.callbacks.Callback):
    """训练回调
    """