p_token_ids, p_segment_ids = tokenizer.encode(passage, max_length=max_p_len)
token_ids = p_token_ids + qa_token_ids[1:]
segment_ids = p_segment_ids + qa_segment_ids[1:]
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
    batch_token_ids = sequence_padding(batch_token_ids)
    batch_segment_ids = sequence_padding(batch_segment_ids)
    yield [batch_token_ids, batch_segment_ids], None
    batch_token_ids, batch_segment_ids = [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    application='seq2seq',
    keep_tokens=keep_tokens,  # only keep the tokens in keep_tokens, trimming the original vocabulary
)

model.summary()

# Cross entropy as the loss, with predictions over the input part masked out
y_true = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y_pred = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
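# Hedged training sketch for the seq2seq model above. It assumes the surrounding
# data-generator class exposes an infinite iterator such as forfit() plus __len__
# (as in typical bert4keras examples); train_data, batch_size and epochs are
# hypothetical names not shown in this snippet.
train_generator = data_generator(train_data, batch_size)
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=epochs,
)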
def build_input(self):
    """Build input placeholders and prepare embeddings for the NER model.

    Returns:
        Tuple of 2 tensors:
        1) input tensor(s), depending on whether multiple inputs are used;
        2) embedding tensor, which will be passed to the following layers of the NER model.
    """
    model_inputs = []
    input_embed = []

    # TODO: consider masking
    if self.use_char:
        if self.char_embeddings is not None:
            char_embedding_layer = tf.keras.layers.Embedding(
                input_dim=self.char_vocab_size,
                output_dim=self.char_embed_dim,
                weights=[self.char_embeddings],
                trainable=self.char_embed_trainable)
        else:
            char_embedding_layer = tf.keras.layers.Embedding(
                input_dim=self.char_vocab_size,
                output_dim=self.char_embed_dim)
        input_char = tf.keras.layers.Input(shape=(self.max_len, ))
        model_inputs.append(input_char)
        char_embed = char_embedding_layer(input_char)
        input_embed.append(
            tf.keras.layers.SpatialDropout1D(self.dropout)(char_embed))

    if self.use_bert:
        bert_model = build_bert_model(
            config_path=self.bert_config_file,
            checkpoint_path=self.bert_checkpoint_file)
        if not self.bert_trainable:
            # manually set every layer in the bert model to be non-trainable
            for layer in bert_model.layers:
                layer.trainable = False
        model_inputs.extend(bert_model.inputs)
        bert_embed = NonMaskingLayer()(bert_model.output)
        input_embed.append(
            tf.keras.layers.SpatialDropout1D(0.2)(bert_embed))

    if self.use_word:
        if self.word_embeddings is not None:
            word_embedding_layer = tf.keras.layers.Embedding(
                input_dim=self.word_vocab_size,
                output_dim=self.word_embed_dim,
                weights=[self.word_embeddings],
                trainable=self.word_embed_trainable)
        else:
            word_embedding_layer = tf.keras.layers.Embedding(
                input_dim=self.word_vocab_size,
                output_dim=self.word_embed_dim)
        input_word = tf.keras.layers.Input(shape=(self.max_len, ))
        model_inputs.append(input_word)
        word_embed = word_embedding_layer(input_word)
        input_embed.append(
            tf.keras.layers.SpatialDropout1D(self.dropout)(word_embed))

    if len(input_embed) > 1:
        input_embed = tf.keras.layers.concatenate(input_embed)
    else:
        input_embed = input_embed[0]
    return model_inputs, input_embed
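# Hedged usage sketch (not part of the original class): a possible build_model()
# that stacks a BiLSTM/Dense tagging head on top of what build_input() returns.
# num_classes and the head architecture are illustrative assumptions.
def build_model(self, num_classes):
    model_inputs, input_embed = self.build_input()
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True))(input_embed)
    scores = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    return tf.keras.models.Model(model_inputs, scores)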
token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
    batch_token_ids = sequence_padding(batch_token_ids)
    batch_segment_ids = sequence_padding(batch_segment_ids)
    batch_labels = sequence_padding(batch_labels)
    yield [batch_token_ids, batch_segment_ids], batch_labels
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# load the pre-trained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='albert',
    return_keras_model=False,
)

output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(units=num_classes,
               activation='softmax',
               kernel_initializer=bert.initializer)(output)

model = keras.models.Model(bert.model.input, output)
model.summary()

AdamLR = extend_with_piecewise_linear_lr(Adam)

model.compile(
    loss='sparse_categorical_crossentropy',
    # optimizer=Adam(1e-5),  # use a sufficiently small learning rate
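# Hedged sketch of how the piecewise-linear optimizer produced by
# extend_with_piecewise_linear_lr is typically passed; the AdamLR argument names
# and schedule values below are assumptions, not taken from this snippet.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamLR(learning_rate=1e-4, lr_schedule={1000: 1, 2000: 0.1}),
    metrics=['accuracy'],
)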
# Dataset generation
print(" === Dataset generation ===\n")
X_train, y_train, weights = create_dataset(train_filename)
maxlen = len(y_train[0])
# embedding_model = api.load("glove-twitter-25")

BATCH_SIZE = 64
INIT_LR = 10e-5
NB_EPOCHS = 100

# ALBERT model
print("\n === ALBERT model configuration ===\n")
bert = build_bert_model(config_path,
                        checkpoint_path,
                        with_pool=True,
                        albert=True,
                        return_keras_model=False)

output = Dropout(rate=0.1)(bert.model.output)
output_list = [
    Dense(1,
          activation='sigmoid',
          kernel_initializer=bert.initializer,
          name="Output_" + str(i + 1))(output) for i in range(0, maxlen)
]
model = Model(bert.model.input, output_list)

lossWeights = {}
losses = {}
for i in range(0, maxlen):
token_ids += [tokenizer._token_sep_id]
labels += [0]
segment_ids = [0] * len(token_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append(labels)
if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
    batch_token_ids = sequence_padding(batch_token_ids)
    batch_segment_ids = sequence_padding(batch_segment_ids)
    batch_labels = sequence_padding(batch_labels)
    yield [batch_token_ids, batch_segment_ids], batch_labels
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
)

output_layer = 'Encoder-%s-FeedForward-Norm' % bert_layers
output = model.get_layer(output_layer).output
output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output, mask='Sequence-Mask')

model = Model(model.input, output)
model.summary()

model.compile(loss=CRF.sparse_loss,
              optimizer=Adam(learing_rate),
              metrics=[CRF.sparse_accuracy])
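# Hedged decoding sketch: the CRF layer learns a transition matrix (CRF.trans);
# at prediction time the emission scores from model.predict can be combined with
# it via Viterbi decoding. viterbi_decode below is an illustrative helper, not a
# bert4keras function, and token_ids/segment_ids are a hypothetical encoded sample.
import numpy as np

def viterbi_decode(nodes, trans):
    """nodes: [seq_len, num_labels] emission scores; trans: [num_labels, num_labels]."""
    scores = nodes[0].copy()
    paths = [[i] for i in range(nodes.shape[1])]
    for t in range(1, nodes.shape[0]):
        total = scores[:, None] + trans + nodes[t][None, :]
        best_prev = total.argmax(axis=0)
        scores = total.max(axis=0)
        paths = [paths[p] + [j] for j, p in enumerate(best_prev)]
    return paths[scores.argmax()]

trans = K.eval(CRF.trans)  # learned label-transition matrix
nodes = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
best_path = viterbi_decode(nodes, trans)  # most likely label sequence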
def build_train_bert_model():
    """Build the training model; works on both TPU and GPU.

    Note that standard Keras layer style must be used throughout: more flexible
    "cut-and-paste" style code may fail to train on TPU. Also, TPU does not
    support every TensorFlow op, in particular dynamic (variable-length) ops,
    so take extra care when writing the corresponding computations.
    """
    bert = build_bert_model(config_path, with_mlm=True, return_keras_model=False)
    bert_model = bert.model
    proba = bert_model.output

    # auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype='bool', name='is_masked')  # mask flags

    def mlm_loss(inputs):
        """Loss computation; needs to be wrapped as a layer.
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * is_masked) / (K.sum(is_masked) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """Accuracy computation; needs to be wrapped as a layer.
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        y_true = K.cast(y_true, K.floatx())
        acc = sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * is_masked) / (K.sum(is_masked) + K.epsilon())
        return acc

    loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])

    train_model = Model(bert_model.inputs + [token_ids, is_masked], [loss, acc])

    # optimizer
    if which_optimizer == 'adam':
        optimizer = Adam(learning_rate=PiecewiseLinear(lr_schedule))
        learning_rate = optimizer._decayed_lr(tf.float32)
        # add weight decay
        add_weight_decay_into(bert_model, weight_decay_rate * learning_rate,
                              exclude_from_weight_decay)
    else:
        optimizer = LAMB(learning_rate=PiecewiseLinear(lr_schedule),
                         weight_decay_rate=weight_decay_rate,
                         exclude_from_weight_decay=exclude_from_weight_decay)

    # compile the model
    train_model.compile(
        loss={
            'mlm_loss': lambda y_true, y_pred: y_pred,
            'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # If a checkpoint is given, load its weights. Note: loading must happen here
    # to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
qa_token_ids, qa_segment_ids = tokenizer.encode(
    answer, question, max_length=max_qa_len + 1)
p_token_ids, p_segment_ids = tokenizer.encode(passage, max_length=max_p_len)
token_ids = p_token_ids + qa_token_ids[1:]
segment_ids = p_segment_ids + qa_segment_ids[1:]
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
    batch_token_ids = sequence_padding(batch_token_ids)
    batch_segment_ids = sequence_padding(batch_segment_ids)
    yield [batch_token_ids, batch_segment_ids], None
    batch_token_ids, batch_segment_ids = [], []


bert_extract = build_bert_model(config_path, checkpoint_path, model='albert')  # build the model and load weights

attention_out = MultiHeadAttention(
    heads=8,
    head_size=39,
    # kernel_initializer=self.initializer,
    # max_relative_position=self.max_relative_position,
    name='attention')(
        [bert_extract.output, bert_extract.output, bert_extract.output])
extract_output = Lambda(lambda attention: attention[:, 0])(attention_out)
# model_extract = keras.models.Model(bert_extract.input, extract_output, name='model_extract')

for layer in bert_extract.layers:
    layer.name = layer.name + "_extract"

model = build_bert_model(
    config_path,
    checkpoint_path,
segment_ids = [0] * len(token_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_a_token_ids.append(a_token_ids[1:])
if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
    batch_token_ids = sequence_padding(batch_token_ids)
    batch_segment_ids = sequence_padding(batch_segment_ids)
    batch_a_token_ids = sequence_padding(batch_a_token_ids, max_a_len)
    yield [batch_token_ids, batch_segment_ids], batch_a_token_ids
    batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    keep_words=keep_words,  # only keep the tokens in keep_words, trimming the original vocabulary
)

output = Lambda(lambda x: x[:, 1:max_a_len + 1])(model.output)
model = Model(model.input, output)
model.summary()


def masked_cross_entropy(y_true, y_pred):
    """Cross entropy as the loss, with predictions over the padding part masked out.
    """
    y_true = K.reshape(y_true, [K.shape(y_true)[0], -1])
    y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
    cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
    return cross_entropy
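# Hedged sketch: compiling the answer-extraction model with the masked loss
# defined above; the learning rate mirrors the other snippets and is an assumption.
model.compile(loss=masked_cross_entropy, optimizer=Adam(1e-5))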
    max_length=max_p_len)
token_ids = p_token_ids + qa_token_ids[1:]
segment_ids = p_segment_ids + qa_segment_ids[1:]
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
    batch_token_ids = sequence_padding(batch_token_ids)
    batch_segment_ids = sequence_padding(batch_segment_ids)
    yield [batch_token_ids, batch_segment_ids], None
    batch_token_ids, batch_segment_ids = [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    model=model_type,
    application='seq2seq',
    keep_words=keep_words,  # only keep the tokens in keep_words, trimming the original vocabulary
)

model.summary()

# Cross entropy as the loss, with predictions over the input part masked out
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
def _get_model(base_dir, cfg_=None):
    if "albert" in cfg["verbose"].lower():
        from bert4keras.bert import build_bert_model
        config_file = os.path.join(base_dir, 'albert_config.json')
        checkpoint_file = os.path.join(base_dir, 'model.ckpt-best')
        model = build_bert_model(
            config_path=config_file,
            checkpoint_path=checkpoint_file,
            model='albert',
            return_keras_model=True
        )
        if cfg_["cls_num"] > 1:
            output = Concatenate(axis=-1)([
                model.get_layer("Encoder-1-FeedForward-Norm").get_output_at(-i)
                for i in range(1, cfg["cls_num"] + 1)])
            model = Model(model.inputs[:2], outputs=output)
        model.trainable = cfg_["bert_trainable"]
    elif "nezha_wwm" in cfg["verbose"].lower():
        from bert4keras.bert import build_bert_model
        config_file = os.path.join(base_dir, 'bert_config.json')
        checkpoint_file = os.path.join(base_dir, 'model.ckpt-346400')
        model = build_bert_model(
            config_path=config_file,
            checkpoint_path=checkpoint_file,
            model='nezha',
            return_keras_model=True
        )
        if bert_summary:
            model.summary()
        if cfg_["cls_num"] > 1:
            output = Concatenate(axis=-1)([
                model.get_layer("Encoder-{}-FeedForward-Norm".format(24 - i)).output
                for i in range(0, cfg["cls_num"])])
            model = Model(model.inputs[:2], outputs=output)
        model.trainable = cfg_["bert_trainable"]
    elif "nezha" in cfg["verbose"].lower():
        from bert4keras.bert import build_bert_model
        config_file = os.path.join(base_dir, 'bert_config.json')
        checkpoint_file = os.path.join(base_dir, 'model.ckpt-325810')
        model = build_bert_model(
            config_path=config_file,
            checkpoint_path=checkpoint_file,
            model='nezha',
            return_keras_model=True,
        )
        if bert_summary:
            model.summary()
        if cfg_["cls_num"] > 1:
            output = Concatenate(axis=-1)([
                model.get_layer("Encoder-{}-FeedForward-Norm".format(24 - i)).output
                for i in range(0, cfg["cls_num"])])
            model = Model(model.inputs[:2], outputs=output)
        model.trainable = cfg_["bert_trainable"]
    else:
        config_file = os.path.join(base_dir, 'bert_config.json')
        checkpoint_file = os.path.join(base_dir, 'bert_model.ckpt')
        if not os.path.exists(config_file):
            config_file = os.path.join(base_dir, 'bert_config_large.json')
            checkpoint_file = os.path.join(base_dir, 'roberta_l24_large_model')
        model = load_trained_model_from_checkpoint(
            config_file, checkpoint_file,
            training=False,
            trainable=cfg_["bert_trainable"],
            output_layer_num=cfg_["cls_num"],
            seq_len=cfg_['maxlen'])
        # model = Model(inputs=model.inputs[:2], outputs=model.layers[-7].output)
    print(config_file, checkpoint_file)
    return model
#! -*- coding: utf-8 -*-
# Sanity check that the code works: MLM

from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# use the MLM model to predict the masked part
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
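# Hedged follow-up: instead of taking only the argmax, inspect the top-5
# candidates at each masked position (pure numpy post-processing; decoding a
# single id to its token via tokenizer.decode is an assumption here).
for pos in range(3, 5):
    top5 = probas[pos].argsort()[::-1][:5]
    print(pos, [tokenizer.decode([int(t)]) for t in top5])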
#! -*- coding: utf-8 -*-
# Sanity check that the code works: feature extraction

from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path)  # build the model and load weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352  0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154  0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673  0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
        if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            yield [batch_token_ids, batch_segment_ids], None
            batch_token_ids, batch_segment_ids = [], []

    def forfit(self):
        while True:
            for d in self.__iter__(True):
                yield d


model = build_bert_model(
    config_path,
    checkpoint_path,
    application='seq2seq',
    keep_words=keep_words,  # only keep the tokens in keep_words, trimming the original vocabulary
    albert=args.albert,
)

# Cross entropy as the loss, with predictions over the input part masked out
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
batch_token_ids, batch_segment_ids = [], []
for i in idxs:
    text = self.data[i]
    token_ids, segment_ids = tokenizer.encode(text)
    batch_token_ids.append(token_ids)
    batch_segment_ids.append(segment_ids)
    if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
        batch_token_ids = sequence_padding(batch_token_ids)
        batch_segment_ids = sequence_padding(batch_segment_ids)
        yield [batch_token_ids, batch_segment_ids], None
        batch_token_ids, batch_segment_ids = [], []


model = build_bert_model(
    config_path,
    checkpoint_path,
    application='lm',
    keep_words=keep_words,  # only keep the tokens in keep_words, trimming the original vocabulary
)

model.summary()

# Cross entropy as the loss, with predictions over the input part masked out
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.get_layer('Sequence-Mask').output_mask[:, 1:]  # target mask
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
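# Hedged generation sketch for the LM model above: greedy decoding that simply
# appends the argmax token step by step. It ignores end-of-text handling and any
# vocabulary-trimming subtleties; start_text and max_new_tokens are hypothetical.
token_ids, segment_ids = tokenizer.encode(start_text)
token_ids = token_ids[:-1]  # drop the trailing [SEP] so generation continues the text
for _ in range(max_new_tokens):
    probas = model.predict([np.array([token_ids]), np.zeros((1, len(token_ids)))])[0]
    next_id = int(probas[-1].argmax())  # output at the last position predicts the next token
    token_ids.append(next_id)
print(tokenizer.decode(token_ids))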
def build_input(self):
    # TODO: consider masking
    # build input for bert model
    if self.use_bert_model:
        model_inputs = []
        bert_model = build_bert_model(
            config_path=self.bert_config_file,
            checkpoint_path=self.bert_checkpoint_file)
        if not self.bert_trainable:
            # manually set every layer in the bert model to be non-trainable
            for layer in bert_model.layers:
                layer.trainable = False
        input_bert = tf.keras.layers.Input(shape=(self.max_len, ))
        input_seg = tf.keras.layers.Input(shape=(self.max_len, ))
        model_inputs.append(input_bert)
        model_inputs.append(input_seg)
        bert_embed = NonMaskingLayer()(bert_model([input_bert, input_seg]))
        input_embed = tf.keras.layers.SpatialDropout1D(
            self.dropout)(bert_embed)
        return model_inputs, input_embed

    model_inputs_a = []
    input_embed_a = []
    model_inputs_b = []
    input_embed_b = []
    if self.use_word:
        # add word input
        if self.word_embeddings is not None:
            word_embedding_layer = tf.keras.layers.Embedding(
                input_dim=self.word_vocab_size,
                output_dim=self.word_embed_dim,
                weights=[self.word_embeddings],
                trainable=self.word_embed_trainable)
        else:
            word_embedding_layer = tf.keras.layers.Embedding(
                input_dim=self.word_vocab_size,
                output_dim=self.word_embed_dim)
        input_word_a = tf.keras.layers.Input(shape=(self.max_len, ))
        model_inputs_a.append(input_word_a)
        input_embed_a.append(
            tf.keras.layers.SpatialDropout1D(self.dropout)(
                word_embedding_layer(input_word_a)))
        input_word_b = tf.keras.layers.Input(shape=(self.max_len, ))
        model_inputs_b.append(input_word_b)
        input_embed_b.append(
            tf.keras.layers.SpatialDropout1D(self.dropout)(
                word_embedding_layer(input_word_b)))

        # add char input
        if self.use_char:
            if self.char_embeddings is not None:
                char_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.char_vocab_size,
                    output_dim=self.char_embed_dim,
                    weights=[self.char_embeddings],
                    trainable=self.char_embed_trainable)
            else:
                char_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.char_vocab_size,
                    output_dim=self.char_embed_dim)
            input_char_a = tf.keras.layers.Input(shape=(self.max_len,
                                                        self.max_word_len))
            model_inputs_a.append(input_char_a)
            input_char_b = tf.keras.layers.Input(shape=(self.max_len,
                                                        self.max_word_len))
            model_inputs_b.append(input_char_b)
            char_embed_a, char_embed_b = self.build_char_embedding(
                char_embedding_layer, input_char_a, input_char_b)
            input_embed_a.append(
                tf.keras.layers.SpatialDropout1D(self.dropout)(char_embed_a))
            input_embed_b.append(
                tf.keras.layers.SpatialDropout1D(self.dropout)(char_embed_b))
    else:
        # add char input
        if self.use_char:
            if self.char_embeddings is not None:
                char_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.char_vocab_size,
                    output_dim=self.char_embed_dim,
                    weights=[self.char_embeddings],
                    trainable=self.char_embed_trainable)
            else:
                char_embedding_layer = tf.keras.layers.Embedding(
                    input_dim=self.char_vocab_size,
                    output_dim=self.char_embed_dim)
            input_char_a = tf.keras.layers.Input(shape=(self.max_len, ))
            model_inputs_a.append(input_char_a)
            input_embed_a.append(
                tf.keras.layers.SpatialDropout1D(self.dropout)(
                    char_embedding_layer(input_char_a)))
            input_char_b = tf.keras.layers.Input(shape=(self.max_len, ))
            model_inputs_b.append(input_char_b)
            input_embed_b.append(
                tf.keras.layers.SpatialDropout1D(self.dropout)(
                    char_embedding_layer(input_char_b)))

    # add bert input
    if self.use_bert:
        bert_model = build_bert_model(
            config_path=self.bert_config_file,
            checkpoint_path=self.bert_checkpoint_file)
        if not self.bert_trainable:
            # manually set every layer in the bert model to be non-trainable
            for layer in bert_model.layers:
                layer.trainable = False
        input_bert_a = tf.keras.layers.Input(shape=(self.max_len, ))
        input_seg_a = tf.keras.layers.Input(shape=(self.max_len, ))
        model_inputs_a.append(input_bert_a)
        model_inputs_a.append(input_seg_a)
        bert_embed_a = NonMaskingLayer()(bert_model(
            [input_bert_a, input_seg_a]))
        input_embed_a.append(
            tf.keras.layers.SpatialDropout1D(self.dropout)(bert_embed_a))

        input_bert_b = tf.keras.layers.Input(shape=(self.max_len, ))
        input_seg_b = tf.keras.layers.Input(shape=(self.max_len, ))
        model_inputs_b.append(input_bert_b)
        model_inputs_b.append(input_seg_b)
        bert_embed_b = NonMaskingLayer()(bert_model(
            [input_bert_b, input_seg_b]))
        input_embed_b.append(
            tf.keras.layers.SpatialDropout1D(self.dropout)(bert_embed_b))

    input_embed_a = tf.keras.layers.concatenate(input_embed_a) \
        if len(input_embed_a) > 1 else input_embed_a[0]
    input_embed_b = tf.keras.layers.concatenate(input_embed_b) \
        if len(input_embed_b) > 1 else input_embed_b[0]
    return model_inputs_a + model_inputs_b, input_embed_a, input_embed_b
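# Hedged usage sketch (not part of the original class): a possible build_model()
# for the sentence-pair case, feeding both embedding streams through a shared
# encoder and a cosine-similarity head. The BiLSTM/Dot head is illustrative only.
def build_model(self):
    model_inputs, input_embed_a, input_embed_b = self.build_input()
    shared_encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))
    vec_a = shared_encoder(input_embed_a)
    vec_b = shared_encoder(input_embed_b)
    sim = tf.keras.layers.Dot(axes=-1, normalize=True)([vec_a, vec_b])  # cosine similarity
    return tf.keras.models.Model(model_inputs, sim)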
#    checkpoint_path=checkpoint_path,
#    return_keras_model=False,
#)
#bert = build_bert_model(
#    config_path=config_path,
#    checkpoint_path=checkpoint_path,
#    with_pool=True,
#    return_keras_model=False,
#)

## load the pre-trained model: Huawei NEZHA
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='nezha',
    with_pool=True,
    return_keras_model=False,
)

output = Dropout(rate=0.04)(bert.model.output)
## with the adversarial layer added, this should be a bit more stable
#output = Lambda(lambda x: x[:, 0])(bert.model.output)
output = Dense(units=2,
               activation='softmax',
               kernel_initializer=bert.initializer)(output)

model = keras.models.Model(bert.model.input, output)
model.summary()
import numpy as np
import tensorflow as tf
from bert4keras.bert import build_bert_model
from roberta.tokenizer import RobertaTokenizer
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel

roberta_dir = '/home/hadoop-aipnlp/cephfs/data/tanghongyin/workspace/roberta'
torch_roberta_dir = roberta_dir + '/roberta.base'
tf_roberta_dir = roberta_dir + '/tf_roberta_base'
config_path = tf_roberta_dir + '/bert_config.json'
checkpoint_path = tf_roberta_dir + '/tf_roberta_base.ckpt'

gpt_bpe_vocab = roberta_dir + '/gpt_bpe/encoder.json'
gpt_bpe_merge = roberta_dir + '/gpt_bpe/vocab.bpe'
roberta_dict = roberta_dir + '/roberta.base/dict.txt'

tokenizer = RobertaTokenizer(gpt_bpe_vocab, gpt_bpe_merge, roberta_dict)

model = build_bert_model(config_path,
                         checkpoint_path,
                         roberta=True,
                         return_all_hiddens=True)  # build the model and load weights

attn_model = tf.keras.Model(
    inputs=model.input,
    outputs=[
        model.get_layer('Encoder-{}-MultiHeadSelfAttention'.format(i + 1)).output
        for i in range(12)
    ])

# encoding test
text = "你好我是中文"
sep = [tokenizer.sep_token]
cls = [tokenizer.cls_token]
# 1. first convert the text into BPE tokens with 'bpe_tokenize'
tokens = tokenizer.bpe_tokenize(text)
# 2. then add the special marker tokens manually
tokens = cls + tokens + sep + sep + tokens + sep
# 3. finally convert to ids
token_ids = tokenizer.convert_tokens_to_ids(tokens)
def build_train_bert_model():
    """Build the training model; works on both TPU and GPU.

    Note that standard Keras layer style must be used throughout: more flexible
    "cut-and-paste" style code may fail to train on TPU. Also, TPU does not
    support every TensorFlow op, in particular dynamic (variable-length) ops,
    so take extra care when writing the corresponding computations.
    """
    bert = build_bert_model(config_path, with_mlm='linear', return_keras_model=False)
    bert_model = bert.model
    proba = bert_model.output

    # auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype='bool', name='is_masked')  # mask flags

    def mlm_loss(inputs):
        """Loss computation; needs to be wrapped as a layer.
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * is_masked) / (K.sum(is_masked) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """Accuracy computation; needs to be wrapped as a layer.
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * is_masked) / (K.sum(is_masked) + K.epsilon())
        return acc

    loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])

    train_model = Model(bert_model.inputs + [token_ids, is_masked], [loss, acc])

    # optimizer
    OPT = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        OPT = extend_with_layer_adaptation(OPT)
    OPT = extend_with_piecewise_linear_lr(OPT)
    opt_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        OPT = extend_with_gradient_accumulation(OPT)
        opt_params['grad_accum_steps'] = grad_accum_steps
    optimizer = OPT(**opt_params)

    # compile the model
    train_model.compile(
        loss={
            'mlm_loss': lambda y_true, y_pred: y_pred,
            'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # If a checkpoint is given, load its weights. Note: loading must happen here
    # to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids, batch_labels], None
batch_token_ids, batch_segment_ids, batch_labels = [], [], []


c_in = Input(shape=(1, ))
c = Embedding(2, 128)(c_in)
c = Reshape((128, ))(c)

# BERT model
model = build_bert_model(
    config_path,
    checkpoint_path,
    application='lm',
    keep_words=keep_words,  # only keep the tokens in keep_words, trimming the original vocabulary
    layer_norm_cond=c,
    additional_input_layers=c_in,
)

model.summary()

# Cross entropy as the loss, with predictions over the input part masked out
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.get_layer('Sequence-Mask').output_mask[:, 1:]  # target mask
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
    '/root/caption/coco/annotations/captions_train2014.json')
valid_data = read_caption(
    '/root/caption/coco/annotations/captions_val2014.json')

# image model
MobileNetV2 = keras.applications.mobilenet_v2.MobileNetV2
preprocess_input = keras.applications.mobilenet_v2.preprocess_input
image_model = MobileNetV2(include_top=False, pooling='avg')
img_size = 299

# BERT model
model = build_bert_model(
    config_path,
    checkpoint_path,
    application='lm',
    keep_words=keep_words,  # only keep the tokens in keep_words, trimming the original vocabulary
    layer_norm_cond=image_model.output,
    layer_norm_cond_hidden_size=128,
    layer_norm_cond_hidden_act='swish',
    additional_input_layers=image_model.input,
)

model.summary()

# Cross entropy as the loss, with predictions over the input part masked out
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.get_layer('Sequence-Mask').output[:, 1:]  # target mask
y = model.output[:, :-1]  # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
if len(X1) == self.batch_size or i == idxs[-1]:
    X1 = seq_padding(X1)
    X2 = seq_padding(X2)
    Y = seq_padding(Y)
    yield [X1, X2], Y
    [X1, X2, Y] = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

model = build_bert_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,  # only keep the tokens in keep_words, trimming the original vocabulary
    albert=True)

output = Lambda(lambda x: x[:, 0])(model.output)
output = Dense(1, activation='sigmoid')(output)
model = Model(model.input, output)

model.compile(
    loss='binary_crossentropy',
    # optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    optimizer=PiecewiseLinearLearningRate(Adam(1e-4), {
        1000: 1,
        2000: 0.1
    }),
    metrics=['accuracy'])
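# Hedged training sketch in the older-style API used by this snippet: train_D is
# assumed to be an instance of the surrounding data-generator class (whose
# __iter__ yields batches indefinitely and whose __len__ gives steps per epoch);
# train_data and the epoch count are illustrative.
train_D = data_generator(train_data)
model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=5,
)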