            p_token_ids, p_segment_ids = tokenizer.encode(passage,
                                                          max_length=max_p_len)
            token_ids = p_token_ids + qa_token_ids[1:]
            segment_ids = p_segment_ids + qa_segment_ids[1:]
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids], None
                batch_token_ids, batch_segment_ids = [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    application='seq2seq',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
)
model.summary()

# Cross-entropy as the loss, masking out predictions over the input part
y_true = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y_pred = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
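
With the loss attached via add_loss, training usually just streams batches from a data generator like the one above. A minimal sketch, assuming the surrounding data_generator class takes the raw samples plus a batch size and exposes a forfit()-style endless iterator (as the later examples on this page do); train_data, the step count and the epoch count are placeholders:

# Hypothetical training call; data_generator's constructor signature is assumed.
train_generator = data_generator(train_data, batch_size=16)

model.fit_generator(
    train_generator.forfit(),  # endless iterator of ([batch_token_ids, batch_segment_ids], None)
    steps_per_epoch=1000,
    epochs=10,
)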

Example #2
    def build_input(self):
        """Build input placeholder and prepare embedding for ner model.

        Returns: Tuple of 2 tensors:
            1). Input tensor(s), depending on whether multiple inputs are used;
            2). Embedding tensor, which will be passed to the following layers of the NER model.

        """
        model_inputs = []
        input_embed = []

        # TODO: consider masking
        if self.use_char:
            if self.char_embeddings is not None:
                char_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.char_vocab_size,
                    output_dim=self.char_embed_dim,
                    weights=[self.char_embeddings],
                    trainable=self.char_embed_trainable)
            else:
                char_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.char_vocab_size,
                    output_dim=self.char_embed_dim)
            input_char = tf.keras.layers.Input(shape=(self.max_len, ))
            model_inputs.append(input_char)

            char_embed = char_embedding_layer(input_char)
            input_embed.append(
                tf.keras.layers.SpatialDropout1D(self.dropout)(char_embed))

        if self.use_bert:
            bert_model = build_bert_model(
                config_path=self.bert_config_file,
                checkpoint_path=self.bert_checkpoint_file)
            if not self.bert_trainable:
                # manually set every layer in bert model to be non-trainable
                for layer in bert_model.layers:
                    layer.trainable = False

            model_inputs.extend(bert_model.inputs)
            bert_embed = NonMaskingLayer()(bert_model.output)
            input_embed.append(
                tf.keras.layers.SpatialDropout1D(0.2)(bert_embed))

        if self.use_word:
            if self.word_embeddings is not None:
                word_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.word_vocab_size,
                    output_dim=self.word_embed_dim,
                    weights=[self.word_embeddings],
                    trainable=self.word_embed_trainable)
            else:
                word_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.word_vocab_size,
                    output_dim=self.word_embed_dim)
            input_word = tf.keras.layers.Input(shape=(self.max_len, ))
            model_inputs.append(input_word)

            word_embed = word_embedding_layer(input_word)
            input_embed.append(
                tf.keras.layers.SpatialDropout1D(self.dropout)(word_embed))

        if len(input_embed) > 1:
            input_embed = tf.keras.layers.concatenate(input_embed)
        else:
            input_embed = input_embed[0]
        return model_inputs, input_embed
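
The returned (model_inputs, input_embed) pair is meant to be consumed by a downstream tagging head. A minimal sketch of one possible head, assuming a num_labels tag set and plain softmax decoding (the actual NER models built on top of this may use a CRF layer instead):

import tensorflow as tf

def build_softmax_ner_head(model_inputs, input_embed, num_labels, lstm_units=128):
    """Hypothetical BiLSTM + softmax tagging head on top of build_input's outputs."""
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_units, return_sequences=True))(input_embed)
    tag_probs = tf.keras.layers.Dense(num_labels, activation='softmax')(x)
    ner_model = tf.keras.models.Model(model_inputs, tag_probs)
    ner_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return ner_model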
Example #3
            token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# Load the pre-trained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='albert',
    return_keras_model=False,
)

output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(units=num_classes,
               activation='softmax',
               kernel_initializer=bert.initializer)(output)

model = keras.models.Model(bert.model.input, output)
model.summary()
AdamLR = extend_with_piecewise_linear_lr(Adam)

model.compile(
    loss='sparse_categorical_crossentropy',
    # optimizer=Adam(1e-5),  # use a sufficiently small learning rate

Example #4
# Dataset generation
print(" === Dataset generation ===\n")
X_train, y_train, weights = create_dataset(train_filename)
maxlen = len(y_train[0])

#embedding_model = api.load("glove-twitter-25")

BATCH_SIZE = 64
INIT_LR = 10e-5
NB_EPOCHS = 100

# ALBERT model
print("\n === ALBERT model configuration ===\n")
bert = build_bert_model(config_path,
                        checkpoint_path,
                        with_pool=True,
                        albert=True,
                        return_keras_model=False)

output = Dropout(rate=0.1)(bert.model.output)
output_list = [
    Dense(1,
          activation='sigmoid',
          kernel_initializer=bert.initializer,
          name="Output_" + str(i + 1))(output) for i in range(0, maxlen)
]
model = Model(bert.model.input, output_list)

lossWeights = {}
losses = {}
for i in range(0, maxlen):
Example #5
            token_ids += [tokenizer._token_sep_id]
            labels += [0]
            segment_ids = [0] * len(token_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
)

output_layer = 'Encoder-%s-FeedForward-Norm' % bert_layers
output = model.get_layer(output_layer).output
output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output, mask='Sequence-Mask')

model = Model(model.input, output)
model.summary()

model.compile(loss=CRF.sparse_loss,
              optimizer=Adam(learing_rate),
              metrics=[CRF.sparse_accuracy])
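
At inference time the learned CRF transition matrix is needed to decode tag paths from the per-token scores. A minimal NumPy Viterbi sketch, assuming CRF.trans exposes the (num_labels, num_labels) transition matrix (the attribute name may differ between bert4keras versions) and nodes are the emission scores returned by model.predict for one sentence:

import numpy as np

def viterbi_decode(nodes, trans):
    """Return the best label path for emission scores nodes (seq_len, num_labels)
    given transition scores trans (num_labels, num_labels)."""
    scores = nodes[0]  # best score so far for paths ending in each label
    paths = [[i] for i in range(nodes.shape[1])]
    for t in range(1, nodes.shape[0]):
        # score of extending each previous path with each next label
        all_scores = scores[:, None] + trans + nodes[t][None, :]
        best_prev = all_scores.argmax(axis=0)
        scores = all_scores.max(axis=0)
        paths = [paths[best_prev[j]] + [j] for j in range(nodes.shape[1])]
    return paths[int(scores.argmax())]

# Hypothetical usage:
# nodes = model.predict([token_ids, segment_ids])[0]
# labels = viterbi_decode(nodes, K.eval(CRF.trans))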
Example #6
def build_train_bert_model():
    """构建训练模型,通用于TPU/GPU
    注意全程要用keras标准的层写法,一些比较灵活的“移花接木”式的
    写法可能会在TPU上训练失败。此外,要注意的是TPU并非支持所有
    tensorflow算子,尤其不支持动态(变长)算子,因此编写相应运算
    时要格外留意。
    """
    bert = build_bert_model(config_path, with_mlm=True, return_keras_model=False)
    bert_model = bert.model
    proba = bert_model.output

    # auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype='bool', name='is_masked')  # mask flags

    def mlm_loss(inputs):
        """计算loss的函数,需要封装为一个层
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * is_masked) / (K.sum(is_masked) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        y_true = K.cast(y_true, K.floatx())
        acc = sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * is_masked) / (K.sum(is_masked) + K.epsilon())
        return acc

    loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])

    train_model = Model(bert_model.inputs + [token_ids, is_masked], [loss, acc])

    # optimizer
    if which_optimizer == 'adam':
        optimizer = Adam(learning_rate=PiecewiseLinear(lr_schedule))
        learning_rate = optimizer._decayed_lr(tf.float32)
        # add weight decay
        add_weight_decay_into(bert_model, weight_decay_rate * learning_rate,
                              exclude_from_weight_decay)
    else:
        optimizer = LAMB(learning_rate=PiecewiseLinear(lr_schedule),
                         weight_decay_rate=weight_decay_rate,
                         exclude_from_weight_decay=exclude_from_weight_decay)

    # compile the model
    train_model.compile(
        loss={
            'mlm_loss': lambda y_true, y_pred: y_pred,
            'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # Load the pre-trained weights if a checkpoint is given. Note: loading must happen here to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
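
Because the docstring stresses TPU/GPU portability, the whole graph is normally built inside a distribution strategy scope. A minimal sketch, assuming a standard TF 2.x TPU setup; the resolver address is a placeholder and the code falls back to the default strategy when no TPU is available:

import tensorflow as tf

try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')  # placeholder address
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
except (ValueError, tf.errors.NotFoundError):
    strategy = tf.distribute.get_strategy()

with strategy.scope():
    train_model = build_train_bert_model()  # model and optimizer are created under the strategy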

Example #7
            qa_token_ids, qa_segment_ids = tokenizer.encode(
                answer, question, max_length=max_qa_len + 1)
            p_token_ids, p_segment_ids = tokenizer.encode(passage,
                                                          max_length=max_p_len)
            token_ids = p_token_ids + qa_token_ids[1:]
            segment_ids = p_segment_ids + qa_segment_ids[1:]
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids], None
                batch_token_ids, batch_segment_ids = [], []


bert_extract = build_bert_model(config_path, checkpoint_path,
                                model='albert')  # build the model and load its weights
attention_out = MultiHeadAttention(
    heads=8,
    head_size=39,
    # kernel_initializer=self.initializer,
    # max_relative_position=self.max_relative_position,
    name='attention')(
        [bert_extract.output, bert_extract.output, bert_extract.output])
extract_output = Lambda(lambda attention: attention[:, 0])(attention_out)
# model_extract = keras.models.Model(bert_extract.input, extract_output, name='model_extract')
for layer in bert_extract.layers:
    layer.name = layer.name + "_extract"

model = build_bert_model(
    config_path,
    checkpoint_path,

Example #8
            segment_ids = [0] * len(token_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_a_token_ids.append(a_token_ids[1:])
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_a_token_ids = sequence_padding(batch_a_token_ids,
                                                     max_a_len)
                yield [batch_token_ids, batch_segment_ids], batch_a_token_ids
                batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    keep_words=keep_words,  # keep only the tokens in keep_words, trimming the original vocabulary
)
output = Lambda(lambda x: x[:, 1:max_a_len + 1])(model.output)
model = Model(model.input, output)
model.summary()


def masked_cross_entropy(y_true, y_pred):
    """交叉熵作为loss,并mask掉padding部分的预测
    """
    y_true = K.reshape(y_true, [K.shape(y_true)[0], -1])
    y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
    cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
    return cross_entropy
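
The excerpt is cut off here, but a loss defined this way is normally just passed to compile. A minimal, assumed continuation (the original training code is not shown):

model.compile(loss=masked_cross_entropy, optimizer=Adam(1e-5))
# model.fit_generator(train_generator.forfit(), steps_per_epoch=..., epochs=...)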

Example #9
            p_token_ids, p_segment_ids = tokenizer.encode(passage,
                                                          max_length=max_p_len)
            token_ids = p_token_ids + qa_token_ids[1:]
            segment_ids = p_segment_ids + qa_segment_ids[1:]
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids], None
                batch_token_ids, batch_segment_ids = [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    model=model_type,
    application='seq2seq',
    keep_words=keep_words,  # keep only the tokens in keep_words, trimming the original vocabulary
)
model.summary()

# Cross-entropy as the loss, masking out predictions over the input part
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
Example #10
 def _get_model(base_dir, cfg_=None):
     if "albert"in cfg["verbose"].lower():
         from bert4keras.bert import build_bert_model
         config_file = os.path.join(base_dir, 'albert_config.json')
         checkpoint_file = os.path.join(base_dir, 'model.ckpt-best')
         model = build_bert_model(
                 config_path=config_file,
                 checkpoint_path=checkpoint_file,
                 model='albert',
                 return_keras_model=True
         )
         if cfg_["cls_num"] > 1:
             output = Concatenate(axis=-1)([model.get_layer("Encoder-1-FeedForward-Norm").get_output_at(-i) for i in range(1, cfg["cls_num"] + 1)])
             model = Model(model.inputs[: 2], outputs=output)
         model.trainable = cfg_["bert_trainable"]
     elif "nezha_wwm"in cfg["verbose"].lower():
         from bert4keras.bert import build_bert_model
         config_file = os.path.join(base_dir, 'bert_config.json')
         checkpoint_file = os.path.join(base_dir, 'model.ckpt-346400')
         model = build_bert_model(
                 config_path=config_file,
                 checkpoint_path=checkpoint_file,
                 model='nezha',
                 return_keras_model=True
         )
         if bert_summary:
             model.summary()            
         if cfg_["cls_num"] > 1:
             output = Concatenate(axis=-1)([
                 model.get_layer("Encoder-{}-FeedForward-Norm".format(24 - i)).output 
                 for i in range(0, cfg["cls_num"])])
     
             model = Model(model.inputs[: 2], outputs=output)
         model.trainable = cfg_["bert_trainable"]            
     elif "nezha"in cfg["verbose"].lower():
         from bert4keras.bert import build_bert_model
         config_file = os.path.join(base_dir, 'bert_config.json')
         checkpoint_file = os.path.join(base_dir, 'model.ckpt-325810')
         model = build_bert_model(
                 config_path=config_file,
                 checkpoint_path=checkpoint_file,
                 model='nezha',
                 return_keras_model=True, 
         )
         if bert_summary:
             model.summary()
 
         if cfg_["cls_num"] > 1:
             output = Concatenate(axis=-1)(
                 [model.get_layer("Encoder-{}-FeedForward-Norm".format(24 - i)).output 
                  for i in range(0, cfg["cls_num"])])
             model = Model(model.inputs[: 2], outputs=output)
         model.trainable = cfg_["bert_trainable"]             
     else:
         config_file = os.path.join(base_dir, 'bert_config.json')
         checkpoint_file = os.path.join(base_dir, 'bert_model.ckpt')
         if not os.path.exists(config_file):
             config_file = os.path.join(base_dir, 'bert_config_large.json')
             checkpoint_file = os.path.join(base_dir, 'roberta_l24_large_model')            
         model = load_trained_model_from_checkpoint(config_file, 
                                                    checkpoint_file, 
                                                    training=False, 
                                                    trainable=cfg_["bert_trainable"], 
                                                    output_layer_num=cfg_["cls_num"],
                                                    seq_len=cfg_['maxlen'])
         
         # model = Model(inputs=model.inputs[: 2], outputs=model.layers[-7].output)
     print(config_file, checkpoint_file)
     return model
Example #11
#! -*- coding: utf-8 -*-
# Sanity-check script: MLM

from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer
import numpy as np


config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load its weights


token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# predict the masked positions with the MLM head
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
Example #12
#! -*- coding: utf-8 -*-
# Sanity-check script: feature extraction

from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer
import numpy as np


config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path)  # build the model and load its weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166

Example #13
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids], None
                batch_token_ids, batch_segment_ids = [], []

    def forfit(self):
        while True:
            for d in self.__iter__(True):
                yield d


model = build_bert_model(
    config_path,
    checkpoint_path,
    application='seq2seq',
    keep_words=keep_words,  # keep only the tokens in keep_words, trimming the original vocabulary
    albert=args.albert,
)

# Cross-entropy as the loss, masking out predictions over the input part
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))

Example #14
        batch_token_ids, batch_segment_ids = [], []
        for i in idxs:
            text = self.data[i]
            token_ids, segment_ids = tokenizer.encode(text)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids], None
                batch_token_ids, batch_segment_ids = [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    application='lm',
    keep_words=keep_words,  # keep only the tokens in keep_words, trimming the original vocabulary
)

model.summary()

# Cross-entropy as the loss, masking out predictions over the input part
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.get_layer('Sequence-Mask').output_mask[:, 1:]  # target mask
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
Example #15
    def build_input(self):

        # TODO: consider masking
        # build input for bert model
        if self.use_bert_model:
            model_inputs = []
            bert_model = build_bert_model(
                config_path=self.bert_config_file,
                checkpoint_path=self.bert_checkpoint_file)
            if not self.bert_trainable:
                # manually set every layer in bert model to be non-trainable
                for layer in bert_model.layers:
                    layer.trainable = False

            input_bert = tf.keras.layers.Input(shape=(self.max_len, ))
            input_seg = tf.keras.layers.Input(shape=(self.max_len, ))
            model_inputs.append(input_bert)
            model_inputs.append(input_seg)
            bert_embed = NonMaskingLayer()(bert_model([input_bert, input_seg]))
            input_embed = tf.keras.layers.SpatialDropout1D(
                self.dropout)(bert_embed)

            return model_inputs, input_embed

        model_inputs_a = []
        input_embed_a = []
        model_inputs_b = []
        input_embed_b = []

        if self.use_word:
            # add word input
            if self.word_embeddings is not None:
                word_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.word_vocab_size,
                    output_dim=self.word_embed_dim,
                    weights=[self.word_embeddings],
                    trainable=self.word_embed_trainable)
            else:
                word_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.word_vocab_size,
                    output_dim=self.word_embed_dim)

            input_word_a = tf.keras.layers.Input(shape=(self.max_len, ))
            model_inputs_a.append(input_word_a)
            input_embed_a.append(
                tf.keras.layers.SpatialDropout1D(self.dropout)(
                    word_embedding_layer(input_word_a)))
            input_word_b = tf.keras.layers.Input(shape=(self.max_len, ))
            model_inputs_b.append(input_word_b)
            input_embed_b.append(
                tf.keras.layers.SpatialDropout1D(self.dropout)(
                    word_embedding_layer(input_word_b)))

            # add char input
            if self.use_char:
                if self.char_embeddings is not None:
                    char_embedding_layer = tf.keras.layers.Embedding(
                        input_dim=self.char_vocab_size,
                        output_dim=self.char_embed_dim,
                        weights=[self.char_embeddings],
                        trainable=self.char_embed_trainable)
                else:
                    char_embedding_layer = tf.keras.layers.Embedding(
                        input_dim=self.char_vocab_size,
                        output_dim=self.char_embed_dim)

                input_char_a = tf.keras.layers.Input(shape=(self.max_len,
                                                            self.max_word_len))
                model_inputs_a.append(input_char_a)
                input_char_b = tf.keras.layers.Input(shape=(self.max_len,
                                                            self.max_word_len))
                model_inputs_b.append(input_char_b)
                char_embed_a, char_embed_b = self.build_char_embedding(
                    char_embedding_layer, input_char_a, input_char_b)
                input_embed_a.append(
                    tf.keras.layers.SpatialDropout1D(
                        self.dropout)(char_embed_a))
                input_embed_b.append(
                    tf.keras.layers.SpatialDropout1D(
                        self.dropout)(char_embed_b))

        else:
            # add char input
            if self.use_char:
                if self.char_embeddings is not None:
                    char_embedding_layer = tf.keras.layers.Embedding(
                        input_dim=self.char_vocab_size,
                        output_dim=self.char_embed_dim,
                        weights=[self.char_embeddings],
                        trainable=self.char_embed_trainable)
                else:
                    char_embedding_layer = tf.keras.layers.Embedding(
                        input_dim=self.char_vocab_size,
                        output_dim=self.char_embed_dim)

                input_char_a = tf.keras.layers.Input(shape=(self.max_len, ))
                model_inputs_a.append(input_char_a)
                input_embed_a.append(
                    tf.keras.layers.SpatialDropout1D(self.dropout)(
                        char_embedding_layer(input_char_a)))
                input_char_b = tf.keras.layers.Input(shape=(self.max_len, ))
                model_inputs_b.append(input_char_b)
                input_embed_b.append(
                    tf.keras.layers.SpatialDropout1D(self.dropout)(
                        char_embedding_layer(input_char_b)))

            # add bert input
            if self.use_bert:
                bert_model = build_bert_model(
                    config_path=self.bert_config_file,
                    checkpoint_path=self.bert_checkpoint_file)
                if not self.bert_trainable:
                    # manually set every layer in bert model to be non-trainable
                    for layer in bert_model.layers:
                        layer.trainable = False

                input_bert_a = tf.keras.layers.Input(shape=(self.max_len, ))
                input_seg_a = tf.keras.layers.Input(shape=(self.max_len, ))
                model_inputs_a.append(input_bert_a)
                model_inputs_a.append(input_seg_a)
                bert_embed_a = NonMaskingLayer()(bert_model(
                    [input_bert_a, input_seg_a]))
                input_embed_a.append(
                    tf.keras.layers.SpatialDropout1D(
                        self.dropout)(bert_embed_a))

                input_bert_b = tf.keras.layers.Input(shape=(self.max_len, ))
                input_seg_b = tf.keras.layers.Input(shape=(self.max_len, ))
                model_inputs_b.append(input_bert_b)
                model_inputs_b.append(input_seg_b)
                bert_embed_b = NonMaskingLayer()(bert_model(
                    [input_bert_b, input_seg_b]))
                input_embed_b.append(
                    tf.keras.layers.SpatialDropout1D(
                        self.dropout)(bert_embed_b))

        input_embed_a = tf.keras.layers.concatenate(input_embed_a) if len(input_embed_a) > 1 \
            else input_embed_a[0]
        input_embed_b = tf.keras.layers.concatenate(input_embed_b) if len(input_embed_b) > 1 \
            else input_embed_b[0]
        return model_inputs_a + model_inputs_b, input_embed_a, input_embed_b
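
This variant returns two embedding streams for a sentence-pair (matching) model. A minimal sketch of one possible downstream head, assuming a shared encoder and a binary match/no-match objective (the actual matching models built on top of this may differ):

import tensorflow as tf

def build_matching_head(model_inputs, embed_a, embed_b, lstm_units=128):
    """Hypothetical siamese head: shared BiLSTM encoder, pooled vectors, binary classifier."""
    encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units))
    vec_a = encoder(embed_a)
    vec_b = encoder(embed_b)
    # combine the two sentence vectors: concatenation plus absolute difference
    diff = tf.keras.layers.Lambda(lambda t: tf.abs(t[0] - t[1]))([vec_a, vec_b])
    features = tf.keras.layers.concatenate([vec_a, vec_b, diff])
    score = tf.keras.layers.Dense(1, activation='sigmoid')(features)
    matching_model = tf.keras.models.Model(model_inputs, score)
    matching_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return matching_model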
Example #16
#    checkpoint_path=checkpoint_path,
#    return_keras_model=False,
#)

#bert = build_bert_model(
#    config_path=config_path,
#    checkpoint_path=checkpoint_path,
#    with_pool=True,
#    return_keras_model=False,
#)
                
## Load the pre-trained model: Huawei NEZHA
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='nezha',
    with_pool=True,
    return_keras_model=False,
)                

output = Dropout(rate=0.04)(bert.model.output)
## With an adversarial training layer added, this can be expected to be more stable
#output = Lambda(lambda x: x[:, 0])(bert.model.output)

output = Dense(units=2,
               activation='softmax',
               kernel_initializer=bert.initializer)(output)

model = keras.models.Model(bert.model.input, output)
model.summary()
Example #17
import numpy as np
from roberta.tokenizer import RobertaTokenizer
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel

roberta_dir = '/home/hadoop-aipnlp/cephfs/data/tanghongyin/workspace/roberta'
torch_roberta_dir = roberta_dir + '/roberta.base'
tf_roberta_dir = roberta_dir + '/tf_roberta_base'
config_path = tf_roberta_dir + '/bert_config.json'
checkpoint_path = tf_roberta_dir + '/tf_roberta_base.ckpt'

gpt_bpe_vocab = roberta_dir + '/gpt_bpe/encoder.json'
gpt_bpe_merge = roberta_dir + '/gpt_bpe/vocab.bpe'
roberta_dict = roberta_dir + '/roberta.base/dict.txt'

tokenizer = RobertaTokenizer(gpt_bpe_vocab, gpt_bpe_merge, roberta_dict)
model = build_bert_model(config_path, checkpoint_path, roberta=True,
                         return_all_hiddens=True)  # build the model and load its weights
attn_model = tf.keras.Model(inputs=model.input, outputs=[
    model.get_layer('Encoder-{}-MultiHeadSelfAttention'.format(i + 1)).output
    for i in range(12)
])

# encoding test
text = "你好我是中文"
sep = [tokenizer.sep_token]
cls = [tokenizer.cls_token]
# 1. first convert the text into BPE tokens with 'bpe_tokenize'
tokens = tokenizer.bpe_tokenize(text)
# 2. then add the special marker tokens manually
tokens = cls + tokens + sep + sep + tokens + sep
# 3. finally convert the tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokens)
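
The excerpt stops after producing token_ids; running them through the model follows the same pattern as the other feature-extraction examples. A minimal, assumed continuation (the two-input Keras model above suggests RoBERTa here still expects a dummy all-zero segment input):

token_ids_arr = np.array([token_ids])
segment_ids_arr = np.zeros_like(token_ids_arr)  # RoBERTa has no real segment ids; zeros as a placeholder

features = model.predict([token_ids_arr, segment_ids_arr])
attn_outputs = attn_model.predict([token_ids_arr, segment_ids_arr])  # one array per attention layer
print(len(attn_outputs))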
Example #18
def build_train_bert_model():
    """构建训练模型,通用于TPU/GPU
    注意全程要用keras标准的层写法,一些比较灵活的“移花接木”式的
    写法可能会在TPU上训练失败。此外,要注意的是TPU并非支持所有
    tensorflow算子,尤其不支持动态(变长)算子,因此编写相应运算
    时要格外留意。
    """
    bert = build_bert_model(config_path,
                            with_mlm='linear',
                            return_keras_model=False)
    bert_model = bert.model
    proba = bert_model.output

    # auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype='bool', name='is_masked')  # mask flags

    def mlm_loss(inputs):
        """计算loss的函数,需要封装为一个层
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true,
                                                 y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * is_masked) / (K.sum(is_masked) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * is_masked) / (K.sum(is_masked) + K.epsilon())
        return acc

    loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])

    train_model = Model(bert_model.inputs + [token_ids, is_masked],
                        [loss, acc])

    # optimizer
    OPT = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        OPT = extend_with_layer_adaptation(OPT)
    OPT = extend_with_piecewise_linear_lr(OPT)
    opt_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        OPT = extend_with_gradient_accumulation(OPT)
        opt_params['grad_accum_steps'] = grad_accum_steps
    optimizer = OPT(**opt_params)

    # compile the model
    train_model.compile(
        loss={
            'mlm_loss': lambda y_true, y_pred: y_pred,
            'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # Load the pre-trained weights if a checkpoint is given. Note: loading must happen here to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model

Example #19
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids, batch_labels], None
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


c_in = Input(shape=(1, ))
c = Embedding(2, 128)(c_in)
c = Reshape((128, ))(c)

# BERT model
model = build_bert_model(
    config_path,
    checkpoint_path,
    application='lm',
    keep_words=keep_words,  # keep only the tokens in keep_words, trimming the original vocabulary
    layer_norm_cond=c,
    additional_input_layers=c_in,
)

model.summary()

# Cross-entropy as the loss, masking out predictions over the input part
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.get_layer('Sequence-Mask').output_mask[:, 1:]  # target mask
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
Example #20
    '/root/caption/coco/annotations/captions_train2014.json')
valid_data = read_caption(
    '/root/caption/coco/annotations/captions_val2014.json')

# image model
MobileNetV2 = keras.applications.mobilenet_v2.MobileNetV2
preprocess_input = keras.applications.mobilenet_v2.preprocess_input
image_model = MobileNetV2(include_top=False, pooling='avg')
img_size = 299

# BERT model
model = build_bert_model(
    config_path,
    checkpoint_path,
    application='lm',
    keep_words=keep_words,  # keep only the tokens in keep_words, trimming the original vocabulary
    layer_norm_cond=image_model.output,
    layer_norm_cond_hidden_size=128,
    layer_norm_cond_hidden_act='swish',
    additional_input_layers=image_model.input,
)

model.summary()

# Cross-entropy as the loss, masking out predictions over the input part
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.get_layer('Sequence-Mask').output[:, 1:]  # target mask
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
Example #21
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    [X1, X2, Y] = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

model = build_bert_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,  # keep only the tokens in keep_words, trimming the original vocabulary
    albert=True)

output = Lambda(lambda x: x[:, 0])(model.output)
output = Dense(1, activation='sigmoid')(output)
model = Model(model.input, output)

model.compile(
    loss='binary_crossentropy',
    # optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    optimizer=PiecewiseLinearLearningRate(Adam(1e-4), {
        1000: 1,
        2000: 0.1
    }),
    metrics=['accuracy'])