def bert_of_theseus(predecessor, successor, classifier):
    """BERT-of-Theseus: freeze the predecessor and the classifier, then train the
    successor by randomly replacing predecessor blocks with the corresponding
    successor layers.
    """
    inputs = predecessor.inputs
    # freeze the already-trained layers
    for layer in predecessor.model.layers:
        layer.trainable = False
    classifier.trainable = False
    # embedding replacement
    predecessor_outputs = predecessor.apply_embeddings(inputs)
    successor_outputs = successor.apply_embeddings(inputs)
    outputs = ProportionalAdd()([predecessor_outputs, successor_outputs])
    # transformer replacement
    layers_per_module = predecessor.num_hidden_layers // successor.num_hidden_layers
    for index in range(successor.num_hidden_layers):
        predecessor_outputs = outputs
        for sub_index in range(layers_per_module):
            predecessor_outputs = predecessor.apply_attention_layers(
                predecessor_outputs, layers_per_module * index + sub_index)
        successor_outputs = successor.apply_attention_layers(outputs, index)
        outputs = ProportionalAdd()([predecessor_outputs, successor_outputs])
    # build the final model
    outputs = classifier(outputs)
    model = Model(inputs, outputs)
    return model

# a second variant of bert_of_theseus (replacement at the granularity of whole
# transformer modules via apply_transformer_layers)
def bert_of_theseus(predecessor, successor, classifier):
    """BERT-of-Theseus
    """
    inputs = predecessor.inputs
    # freeze the already-trained predecessor and classifier
    for layer in predecessor.model.layers:
        layer.trainable = False
    classifier.trainable = False
    # embedding replacement
    # (the embeddings could also be frozen)
    predecessor_outputs = predecessor.apply_embeddings(inputs)
    successor_outputs = successor.apply_embeddings(inputs)
    outputs = ProportionalAdd()([predecessor_outputs, successor_outputs])
    # transformer replacement: each successor layer replaces the corresponding
    # predecessor module
    layers_per_module = predecessor.num_hidden_layers // successor.num_hidden_layers
    for index in range(successor.num_hidden_layers):
        predecessor_outputs = outputs
        for sub_index in range(layers_per_module):
            predecessor_outputs = predecessor.apply_transformer_layers(
                predecessor_outputs, layers_per_module * index + sub_index)
        successor_outputs = successor.apply_transformer_layers(outputs, index)
        outputs = ProportionalAdd()([predecessor_outputs, successor_outputs])
    # build the final model
    outputs = classifier(outputs)
    model = Model(inputs, outputs)
    return model

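# NOTE: the ProportionalAdd layer used above is not defined in this excerpt. What
# follows is only a minimal sketch of what such a BERT-of-Theseus replacement layer
# might look like (class name, replace_rate and exact behaviour are assumptions):
# during training it randomly passes through either the predecessor branch or the
# successor branch; at inference it always returns the successor branch.
from bert4keras.backend import keras, K


class ProportionalAdd(keras.layers.Layer):
    def __init__(self, replace_rate=0.5, **kwargs):
        super(ProportionalAdd, self).__init__(**kwargs)
        self.replace_rate = replace_rate
        self.supports_masking = True  # pass the sequence mask through unchanged

    def call(self, inputs, training=None):
        predecessor_out, successor_out = inputs

        def replaced():
            # Bernoulli switch: 1 keeps the successor output, 0 keeps the predecessor output
            mask = K.random_binomial(shape=[], p=self.replace_rate)
            return mask * successor_out + (1.0 - mask) * predecessor_out

        # outside of training only the successor path is used
        return K.in_train_phase(replaced, lambda: successor_out, training=training)

    def compute_output_shape(self, input_shape):
        return input_shape[0]
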
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# load the pretrained model (3 layers)
successor = build_transformer_model(config_path=config_path,
                                    checkpoint_path=checkpoint_path,
                                    return_keras_model=False,
                                    num_hidden_layers=3,
                                    prefix='Successor-')

# classifier head
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Dense(num_labels)(x_in)
CRF = ConditionalRandomField(lr_multiplier=2)
x = CRF(x)
classifier = Model(x_in, x)

opt = Adam(learning_rate=lr)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.outputs))
predecessor_model.compile(
    loss=predecessor_model.layers[-1].layers[-1].sparse_loss,
    optimizer=opt,
    metrics=[CRF.sparse_accuracy])
predecessor_model.summary()

successor_model = Model(successor.inputs, classifier(successor.outputs))
successor_model.compile(
    loss=successor_model.layers[-1].layers[-1].sparse_loss,
    optimizer=opt,
    metrics=[CRF.sparse_accuracy])
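# A possible three-stage schedule for the theseus setup above (a sketch; the fit
# calls, generator and epoch names are placeholders, not from the original script):
#   1) train predecessor_model (12 layers + CRF) on the task,
#   2) train the theseus model, whose frozen predecessor modules are randomly
#      replaced by successor layers,
#   3) fine-tune successor_model on its own.
theseus_model = bert_of_theseus(predecessor, successor, classifier)
theseus_model.compile(loss=CRF.sparse_loss,
                      optimizer=opt,
                      metrics=[CRF.sparse_accuracy])
# predecessor_model.fit(train_generator.generator(), steps_per_epoch=..., epochs=...)
# theseus_model.fit(train_generator.generator(), steps_per_epoch=..., epochs=...)
# successor_model.fit(train_generator.generator(), steps_per_epoch=..., epochs=...)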
        return loss


bert = build_transformer_model(checkpoint_path=checkpoint_path,
                               config_path=config_path,
                               keep_tokens=keep_tokens,
                               dropout_rate=0.3)

label_inputs = Input(shape=(None,), name='label_inputs')
pooler = Lambda(lambda x: x[:, 0])(bert.output)
x = Dense(units=num_classes, activation='softmax', name='classifier')(pooler)
output = TotalLoss(4)(bert.inputs + [label_inputs, pooler, x])

model = Model(bert.inputs + [label_inputs], output)
classifier = Model(bert.inputs, x)
model.compile(optimizer=Adam(2e-5), metrics=['acc'])
model.summary()


def evaluate(val_data=valid_generator):
    total = 0.
    right = 0.
    for (x, s, y_true), _ in tqdm(val_data):
        y_pred = classifier.predict([x, s]).argmax(axis=-1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    print(total, right)
        y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


# build model
model = build_transformer_model(config_path,
                                checkpoint_path,
                                application='unilm',
                                keep_tokens=keep_tokens)
model.summary()

# train model
o_inputs = Input(shape=(None,))
train_model = Model(model.inputs + [o_inputs], model.outputs + [o_inputs])

y_true = train_model.inputs[2][:, 1:]
y_mask = train_model.inputs[1][:, 1:]
y_pred = train_model.outputs[0][:, :-1]
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

train_model.add_loss(cross_entropy)
train_model.compile(Adam(1e-5))


class QuestionGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
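        # (the excerpt ends here; a typical continuation, as in the seq2seq decoder
        #  shown later in this document, concatenates the generated ids and returns
        #  the distribution over the last position -- only a hedged sketch)
        # token_ids = np.concatenate([token_ids, output_ids], 1)
        # segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        # return model.predict([token_ids, segment_ids])[:, -1]


# usage sketch (maxlen and topk values are placeholders):
# question_generator = QuestionGenerator(start_id=None,
#                                        end_id=tokenizer._token_end_id,
#                                        maxlen=64)
# output_ids = question_generator.beam_search([token_ids, segment_ids], topk=3)
# print(tokenizer.decode(output_ids))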
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            self.model.save_weights(self.savename)
        print(u'val_acc: %.5f, best_val_acc: %.5f\n' % (val_acc, self.best_val_acc))


# load the pretrained model (3 layers)
bert = build_transformer_model(config_path=config_path,
                               checkpoint_path=checkpoint_path,
                               return_keras_model=False,
                               num_hidden_layers=3,
                               prefix='Successor-')

x = Lambda(lambda x: x[:, 0])(bert.output)
x = Dense(units=num_classes, activation='softmax')(x)
model = Model(bert.inputs, x)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdaBelief(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
model.summary()

if __name__ == '__main__':
    # training
    evaluator = Evaluator('best_model.weights')
    model.fit_generator(train_generator.generator(),
                        steps_per_epoch=len(train_generator),
                        epochs=5,
                        callbacks=[evaluator])
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_mlm=True,
    # model='bert',  # load bert/RoBERTa/ERNIE
    model='nezha')

target_in = Input(shape=(None,))
output = CrossEntropy(1)([target_in, model.output])
train_model = Model(model.inputs + [target_in], output)

AdamW = extend_with_weight_decay(Adam)
AdamWG = extend_with_gradient_accumulation(AdamW)
opt = AdamWG(learning_rate=1e-5,
             exclude_from_weight_decay=['Norm', 'bias'],
             grad_accum_steps=4)
train_model.compile(opt)
train_model.summary()

label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])


def predict(x):
    if len(x) == 3:
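# The predict() above is truncated in this excerpt. A hedged sketch of the usual
# MLM-as-classifier scoring (mask_idx, labels and label_ids as used in the
# evaluate() shown later in this document): read the output distribution at the
# masked positions and score each label by the product of its token probabilities.
# def predict(x):
#     if len(x) == 3:
#         x = x[:2]  # drop the target input if present
#     y_pred = model.predict(x)[:, mask_idx]
#     y_pred = y_pred[:, 0, label_ids[:, 0]] * y_pred[:, 1, label_ids[:, 1]]
#     return y_pred.argmax(axis=1)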
train_model.fit(
    pretrain_generator.generator(),
    steps_per_epoch=len(pretrain_generator),
    epochs=pretrain_epochs,
    callbacks=[checkpoint, csv_logger],
)

# build task fine-tune model
# reload weights without mlm
# bert_without_mlm = build_transformer_model(checkpoint_path=model_saved_path,
#                                            config_path=config_path, with_mlm=False)
# take the FeedForward output of the last transformer block (idx = 11) as the encoder output
idx = 11
feed_forward_name = 'Transformer-%d-FeedForward' % idx
bert_without_mlm = bert.layers[feed_forward_name]
output = Lambda(lambda x: x[:, 0])(bert_without_mlm.output)
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.inputs, output)
model.summary()
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(fine_tune_lr),
              metrics=['acc'])

evaluator = Evaluator()
model.fit_generator(train_generator.generator(),
                    steps_per_epoch=len(train_generator),
                    epochs=fine_tune_epochs,
                    callbacks=[evaluator])
def build_transformer_model_with_mlm():
    """BERT model with the MLM head
    """
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        # with_nsp=True,
        model='bert',
        return_keras_model=False,
        # keep_tokens=keep_tokens
    )
    proba = bert.model.output
    # print(proba)

    # auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None,), dtype=K.floatx(), name='is_masked')  # mask flags
    # nsp_label = Input(shape=(None,), dtype='int64', name='nsp')  # nsp labels

    def mlm_loss(inputs):
        """loss computation, wrapped as a layer
        """
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """nsp loss computation, wrapped as a layer
        """
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """accuracy computation, wrapped as a layer
        """
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """nsp accuracy computation, wrapped as a layer
        """
        y_true, y_pred = inputs
        y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    # nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    # nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])
    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        # 'nsp_loss': lambda y_true, y_pred: y_pred,
        # 'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }
    return bert, train_model, loss
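# A minimal usage sketch (the pretraining generator and hyper-parameter names are
# placeholders): each named Lambda output gets an identity loss, so the model
# literally minimizes the mlm_loss tensor it emits, while mlm_acc is only logged.
# bert, train_model, loss = build_transformer_model_with_mlm()
# train_model.compile(loss=loss, optimizer=Adam(1e-5))
# train_model.fit(pretrain_generator.generator(),
#                 steps_per_epoch=len(pretrain_generator),
#                 epochs=pretrain_epochs)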
                      K.shape(y_true)[0], K.floatx())


bert = build_transformer_model(checkpoint_path=checkpoint_path,
                               config_path=config_path,
                               with_pool='linear',
                               application='unilm',
                               keep_tokens=keep_tokens,
                               return_keras_model=False)

label_inputs = Input(shape=(None,), name='label_inputs')
pooler = bert.model.outputs[0]
classification_output = Dense(units=num_classes,
                              activation='softmax',
                              name='classifier')(pooler)
classifier = Model(bert.model.inputs, classification_output)
seq2seq = Model(bert.model.inputs, bert.model.outputs[1])

outputs = TotalLoss([2])(bert.model.inputs + bert.model.outputs)
# outputs = Dense(num_classes, activation='softmax')(outputs)
train_model = Model(bert.model.inputs, [classification_output, outputs])
train_model.compile(loss=['sparse_categorical_crossentropy', None],
                    optimizer=Adam(1e-5),
                    metrics=['acc'])
train_model.summary()


def evaluate(val_data=valid_generator):
    total = 0.
    right = 0.
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# load the pretrained model (3 layers)
successor = build_transformer_model(config_path=config_path,
                                    checkpoint_path=checkpoint_path,
                                    return_keras_model=False,
                                    num_hidden_layers=3,
                                    prefix='Successor-')

# classifier head
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()

successor_model = Model(successor.inputs, classifier(successor.output))
successor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


train_generator = data_generator(data=train_data, batch_size=batch_size)
val_generator = data_generator(valid_data, batch_size)

# build model
bert = build_transformer_model(config_path=config_path,
                               checkpoint_path=checkpoint_path,
                               num_hidden_layers=num_hidden_layers)

output = Lambda(lambda x: x[:, 0])(bert.output)
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.inputs, output)
model.summary()
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(lr),
              metrics=['acc'])


def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in tqdm(data):
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    def call(self, inputs):
        sim_loss = self.compute_loss_of_similarity(inputs)
        self.add_loss(sim_loss)
        self.add_metric(sim_loss, name='similarity loss')
        return super(DenseSimLoss, self).call(inputs)


# build model
model = build_transformer_model(
    config_path,
    checkpoint_path,
)
output = Lambda(lambda x: x[:, 0])(model.output)
output = DenseSimLoss(scale=1, units=num_classes, activation='softmax')(output)
model = Model(model.inputs, output)
model.summary()


def evaluate(data, model):
    total, right = 0., 0.
    for x_true, y_true in tqdm(data):
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


class Evaluator(keras.callbacks.Callback):
    def __init__(self, savename):
        y_mask = y_mask[:, 1:]  # segment_ids, which happen to mark the part to predict
        y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


# build model
model = build_transformer_model(config_path,
                                checkpoint_path,
                                application='unilm',
                                keep_tokens=keep_tokens)

output = CrossEntropy(2)(model.inputs + model.outputs)
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()


class QuestionAnswerGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
        ret = model.predict([token_ids, segment_ids])[:, -1]
        return ret
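# A usage sketch for the decoder above (start/end ids follow the usual bert4keras
# convention; maxlen and topk values are placeholders, not from the original script):
# qa_generator = QuestionAnswerGenerator(start_id=None,
#                                        end_id=tokenizer._token_end_id,
#                                        maxlen=32)
# def gen_answer(question, passage, topk=3):
#     token_ids, segment_ids = tokenizer.encode(question, passage)
#     output_ids = qa_generator.beam_search([token_ids, segment_ids], topk=topk)
#     return tokenizer.decode(output_ids)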
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * y_mask) / K.sum(y_mask)
        self.add_metric(acc, name='acc')
        return loss


model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                with_mlm=True)

target_in = Input(shape=(None,))
output = CrossEntropy(1)([target_in, model.output])
train_model = Model(model.inputs + [target_in], output)
train_model.compile(optimizer=Adam(1e-5))
train_model.summary()


def evaluate(data):
    label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])
    # print(label_ids)
    total, right = 0., 0.
    for x, _ in tqdm(data):
        x, y_true = x[:2], x[2]
        y_pred = model.predict(x)[:, mask_idx]
        y_pred = y_pred[:, 0, label_ids[:, 0]] * y_pred[:, 1, label_ids[:, 1]]
        y_pred = y_pred.argmax(axis=1)
        y_true = np.array(
            [labels.index(tokenizer.decode(y)) for y in y_true[:, mask_idx]])
            if len(batch_tokens) >= self.batch_size or is_end:
                batch_tokens = pad_sequences(batch_tokens)
                batch_segs = pad_sequences(batch_segs)
                batch_labels = pad_sequences(batch_labels)
                yield [batch_tokens, batch_segs], batch_labels
                batch_tokens, batch_segs, batch_labels = [], [], []


model = build_transformer_model(config_path=bert_config,
                                checkpoint_path=bert_checkpoint)

output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
output = model.get_layer(output_layer).output
output = Dense(num_labes)(output)
CRF = ConditionalRandomField(lr_multi)
output = CRF(output)
model = Model(model.input, output)
model.summary()


class WordSeg(ViterbiDecoder):
    def segment(self, data):
        tokens = tokenizer.tokenize(data)
        while len(tokens) > 512:
            tokens.pop(-2)
        mapping = tokenizer.rematch(data, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segs = [0] * len(token_ids)
        pre = model.predict([[token_ids], [segs]])[0]
        labels = self.decode(pre)
        words = []
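# A wiring sketch for the decoder above (assumption, not part of this excerpt): the
# ViterbiDecoder base class takes the learned CRF transition matrix, read out with
# K.eval(CRF.trans), plus the allowed start/end label indices (here assumed to be 0).
# word_seg = WordSeg(trans=K.eval(CRF.trans), starts=[0], ends=[0])
# print(word_seg.segment(u'...'))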
bert = build_transformer_model(config_path=config_path,
                               checkpoint_path=checkpoint_path,
                               num_hidden_layers=num_hidden_layers,
                               return_keras_model=False)

output = Lambda(lambda x: x[:, 0])(bert.output)
y_in = Input(shape=(None,))
# scale_output = Dense(256, kernel_initializer=bert.initializer)(output)
# logits = Dense(num_classes)(output)
scl_output = SupervisedContrastiveLearning(alpha=0.05, T=0.05,
                                           output_idx=0)([output, y_in])
clf_output = Dense(num_classes, activation='softmax')(output)
clf_ce = CrossEntropy(output_idx=0, alpha=0.95)([clf_output, y_in])

model = Model(bert.inputs, clf_output)
model.summary()
train_model = Model(bert.inputs + [y_in], [scl_output, clf_ce])
train_model.compile(optimizer=Adam(lr))

if __name__ == '__main__':
    evaluator = Evaluator()
    train_model.fit_generator(train_generator.generator(),
                              steps_per_epoch=len(train_generator),
                              epochs=epochs,
                              callbacks=[evaluator])

    # tsne
    from sklearn.manifold import TSNE
    def call(self, inputs):
        return super(ScaleDense, self).call(inputs)


# load the pretrained model (12 layers)
predecessor = build_transformer_model(config_path=config_path,
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# classifier head
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()

# predecessor_model_3
output = predecessor_model.layers[31].output  # output of the 3rd transformer layer
output = Lambda(lambda x: x[:, 0])(output)
dense = ScaleDense(lr_multiplier=5,
                   units=num_classes,
                   activation='softmax',
def build_transformer_model_with_mlm(version='pre'):
    """BERT model with the MLM head
    """
    assert version in ['pre', 'post', 'rezero']
    if version == 'rezero':
        attention_name = 'Transformer-%d-MultiHeadSelfAttention'
        feed_forward_name = 'Transformer-%d-FeedForward'
        skip_weights = []
        for i in range(12):
            skip_weights.append(feed_forward_name % i + '-Norm')
            skip_weights.append(feed_forward_name % i + '-ReWeight')
            skip_weights.append(attention_name % i + '-Norm')
            skip_weights.append(attention_name % i + '-ReWeight')

        bert = build_transformer_model(
            config_path,
            with_mlm='linear',
            model='rezero',
            return_keras_model=False,
            skip_weights_from_checkpoints=skip_weights,
            use_layernorm=None,
            reweight_trainable=True,
            init_reweight=0.,
        )
    else:
        bert = build_transformer_model(
            config_path,
            with_mlm='linear',
            model='rezero',
            return_keras_model=False,
            # skip_weights_from_checkpoints=skip_weights,
            use_layernorm=version,
            reweight_trainable=False,
            init_reweight=1.,
        )
    proba = bert.model.output
    # print(proba)

    # auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None,), dtype=K.floatx(), name='is_masked')  # mask flags
    # nsp_label = Input(shape=(None,), dtype='int64', name='nsp')  # nsp labels

    def mlm_loss(inputs):
        """loss computation, wrapped as a layer
        """
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """nsp loss computation, wrapped as a layer
        """
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """accuracy computation, wrapped as a layer
        """
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """nsp accuracy computation, wrapped as a layer
        """
        y_true, y_pred = inputs
        y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    # nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    # nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])
    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        # 'nsp_loss': lambda y_true, y_pred: y_pred,
        # 'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }
    return bert, train_model, loss
output = DGCNN(dilation_rate=5, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=2, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=1, dropout_rate=0.1)(output)
output = SinCosPositionEmbedding(K.int_shape(output)[-1])(output)
att = AttentionPooling1D()(output)
output = ConcatSeq2Vec()([output, att])
# att = K.expand_dims(att, 1)
# output = Add()([output, att])
output = Dropout(0.3)(output)
output = Dense(2)(output)
output = MaskedSoftmax()(output)
output = Permute((2, 1), name='permute')(output)

model = Model(inputs, output)
model.summary()


def sparse_categorical_crossentropy(y_true, y_pred):
    # re-establish the shape and dtype of y_true
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    y_true = K.one_hot(y_true, K.shape(y_pred)[2])
    # compute the cross-entropy
    return K.mean(K.categorical_crossentropy(y_true, y_pred))


def sparse_accuracy(y_true, y_pred):
    # re-establish the shape and dtype of y_true
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])