)
output = Lambda(lambda x: x[:, 0])(bert.output)  # take the [CLS] vector
y_in = Input(shape=(None,))
# scale_output = Dense(256, kernel_initializer=bert.initializer)(output)
# logits = Dense(num_classes)(output)

# supervised contrastive loss on the [CLS] features, plus weighted cross entropy
scl_output = SupervisedContrastiveLearning(alpha=0.05, T=0.05,
                                           output_idx=0)([output, y_in])
clf_output = Dense(num_classes, activation='softmax')(output)
clf_ce = CrossEntropy(output_idx=0, alpha=0.95)([clf_output, y_in])

model = Model(bert.inputs, clf_output)  # inference model
model.summary()

train_model = Model(bert.inputs + [y_in], [scl_output, clf_ce])
train_model.compile(optimizer=Adam(lr))  # losses are added inside the loss layers

if __name__ == '__main__':
    evaluator = Evaluator()
    train_model.fit_generator(train_generator.generator(),
                              steps_per_epoch=len(train_generator),
                              epochs=epochs,
                              callbacks=[evaluator])

    # t-SNE visualization of the sentence features
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt
    f = K.function(bert.inputs, output)
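# SupervisedContrastiveLearning (used above) is a custom loss layer that is
# not defined in this excerpt. For reference, a minimal sketch of a supervised
# contrastive loss (Khosla et al., 2020) as a Keras layer; the alpha/T/
# output_idx semantics are assumptions read off the call site (alpha weights
# the loss, T is a temperature, output_idx selects the pass-through input).
# A sketch of the idea, not the original layer:
import tensorflow as tf

class SupervisedContrastiveLearning(keras.layers.Layer):
    def __init__(self, alpha=0.05, T=0.05, output_idx=0, **kwargs):
        super(SupervisedContrastiveLearning, self).__init__(**kwargs)
        self.alpha, self.T, self.output_idx = alpha, T, output_idx

    def call(self, inputs):
        feats, labels = inputs
        feats = K.l2_normalize(feats, axis=-1)
        sims = K.dot(feats, K.transpose(feats)) / self.T  # pairwise cosine / T
        eye = tf.eye(K.shape(feats)[0])
        logits = sims - 1e12 * eye  # mask out self-similarity
        # positives: pairs with the same label, excluding the anchor itself
        pos = K.cast(K.equal(labels[:, :1], K.transpose(labels[:, :1])),
                     K.floatx())
        pos = pos * (1. - eye)
        log_prob = logits - tf.reduce_logsumexp(logits, axis=1, keepdims=True)
        per_anchor = -K.sum(pos * log_prob, axis=1) \
            / K.maximum(K.sum(pos, axis=1), 1.)
        self.add_loss(self.alpha * K.mean(per_anchor))
        return inputs[self.output_idx]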
train_model.fit(
    pretrain_generator.generator(),
    steps_per_epoch=len(pretrain_generator),
    epochs=pretrain_epochs,
    callbacks=[checkpoint, csv_logger],
)

# build task fine-tune model
# reload weights without mlm
# bert_without_mlm = build_transformer_model(checkpoint_path=model_saved_path,
#                                            config_path=config_path,
#                                            with_mlm=False)
idx = 11
feed_forward_name = 'Transformer-%d-FeedForward' % idx
bert_without_mlm = bert.layers[feed_forward_name]  # last block's feed-forward layer
output = Lambda(lambda x: x[:, 0])(bert_without_mlm.output)  # [CLS] vector
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.inputs, output)
model.summary()
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(fine_tune_lr),
              metrics=['acc'])

evaluator = Evaluator()
model.fit_generator(train_generator.generator(),
                    steps_per_epoch=len(train_generator),
                    epochs=fine_tune_epochs,
                    callbacks=[evaluator])
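# `checkpoint` and `csv_logger` in the pre-training fit above are not defined
# in this excerpt; plausible definitions using the standard Keras callbacks
# (the CSV path is an assumption):
from keras.callbacks import ModelCheckpoint, CSVLogger
checkpoint = ModelCheckpoint(model_saved_path, save_weights_only=True)
csv_logger = CSVLogger('pretrain_log.csv')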
                                    return_keras_model=False,
                                    num_hidden_layers=3,
                                    prefix='Successor-')

# classifier head (shared by predecessor and successor)
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Dense(num_labels)(x_in)
CRF = ConditionalRandomField(lr_multiplier=2)
x = CRF(x)
classifier = Model(x_in, x)

opt = Adam(learning_rate=lr)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.outputs))
predecessor_model.compile(
    loss=CRF.sparse_loss,  # the CRF is the classifier's last layer
    optimizer=opt,
    metrics=[CRF.sparse_accuracy])
predecessor_model.summary()

successor_model = Model(successor.inputs, classifier(successor.outputs))
successor_model.compile(loss=CRF.sparse_loss,
                        optimizer=opt,
                        metrics=[CRF.sparse_accuracy])
successor_model.summary()

theseus_model = bert_of_theseus(predecessor, successor, classifier)
theseus_model.compile(loss=CRF.sparse_loss,
                      optimizer=opt,
                      metrics=[CRF.sparse_accuracy])
theseus_model.summary()
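# bert_of_theseus() is referenced above but not defined in this excerpt. Its
# core ingredient is a layer that randomly swaps predecessor modules for
# successor modules during training; a sketch of such a layer, following the
# BERT-of-Theseus idea (an assumption, not the original code):
class BinaryRandomChoice(keras.layers.Layer):
    """Randomly choose one of two inputs at train time; at inference,
    always return the second (successor) input."""
    def __init__(self, **kwargs):
        super(BinaryRandomChoice, self).__init__(**kwargs)
        self.supports_masking = True

    def compute_mask(self, inputs, mask=None):
        if mask is not None:
            return mask[1]

    def call(self, inputs):
        source, target = inputs
        mask = K.random_binomial(shape=[1], p=0.5)
        output = mask * source + (1 - mask) * target
        return K.in_train_phase(output, target)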
        trans = K.eval(CRF.trans)
        wordseg.trans = trans
        print(trans)
        acc = evaluate(val_data)
        if acc > self.best_acc:
            self.best_acc = acc
            model.save_weights('./best_model.weights')
        print('acc is: {:.3f}, best acc is: {:.4f}'.format(acc, self.best_acc))

    def on_train_end(self, logs=None):
        model.load_weights('./best_model.weights')
        public_evaluate(test_path, test_result_path, test_score_path)


# Adam with gradient accumulation
opt = extend_with_gradient_accumulation(Adam)
opt = opt(learning_rate=lr)
model.compile(loss=CRF.sparse_loss,
              optimizer=opt,
              metrics=[CRF.sparse_accuracy])

if __name__ == '__main__':
    evaluator = Evaluator()
    train_generator = data_generator(train_data, batch_size)
    model.fit_generator(train_generator.generator(),
                        steps_per_epoch=len(train_generator),
                        epochs=epochs,
                        callbacks=[evaluator])
else:
    model.load_weights('./best_model.weights')
model = build_transformer_model(config_path,
                                checkpoint_path,
                                application='unilm',
                                keep_tokens=keep_tokens)
model.summary()

# train model: mask the loss so only target tokens (segment 1) contribute
o_inputs = Input(shape=(None,))
train_model = Model(model.inputs + [o_inputs], model.outputs + [o_inputs])

y_true = train_model.inputs[2][:, 1:]    # target sequence, shifted by one
y_mask = train_model.inputs[1][:, 1:]    # segment ids mark the target part
y_pred = train_model.outputs[0][:, :-1]  # predictions, shifted the other way
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

train_model.add_loss(cross_entropy)
train_model.compile(Adam(1e-5))


class QuestionGenerator(AutoRegressiveDecoder):
    """seq2seq decoder"""
    @AutoRegressiveDecoder.wraps('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
        ret = model.predict([token_ids, segment_ids])[:, -1]
        return ret

    def generate(self, context, answer, topk=2, random=False):
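        # (body truncated in the original; a plausible completion built on
        # bert4keras's beam_search/random_sample API; an assumption, not the
        # original code)
        token_ids, segment_ids = tokenizer.encode(context, answer,
                                                  maxlen=maxlen)
        if random:
            output_ids = self.random_sample([token_ids, segment_ids], 1,
                                            topk)[0]
        else:
            output_ids = self.beam_search([token_ids, segment_ids], topk)
        return tokenizer.decode(output_ids)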
bert = build_transformer_model(checkpoint_path=checkpoint_path,
                               config_path=config_path,
                               keep_tokens=keep_tokens,
                               dropout_rate=0.3)

label_inputs = Input(shape=(None,), name='label_inputs')
pooler = Lambda(lambda x: x[:, 0])(bert.output)  # [CLS] vector
x = Dense(units=num_classes, activation='softmax', name='classifier')(pooler)

output = TotalLoss(4)(bert.inputs + [label_inputs, pooler, x])
model = Model(bert.inputs + [label_inputs], output)
classifier = Model(bert.inputs, x)  # inference model

model.compile(optimizer=Adam(2e-5), metrics=['acc'])  # loss added inside TotalLoss
model.summary()


def evaluate(val_data=valid_generator):
    total = 0.
    right = 0.
    for (x, s, y_true), _ in tqdm(val_data):
        y_pred = classifier.predict([x, s]).argmax(axis=-1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    print(total, right)
    return right / total
    y_pred = K.cast(K.argmax(y_pred, axis=2), 'int32')
    return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx()))


# optimizer: Adam with weight decay and gradient accumulation
optimizer = extend_with_weight_decay(Adam)
optimizer = extend_with_gradient_accumulation(optimizer)
params = {
    'learning_rate': learning_rate,
    'weight_decay_rate': 1e-5,
    'exclude_from_weight_decay': ['norm', 'bias'],
    'grad_accum_steps': 4
}
optimizer = optimizer(**params)

model.compile(loss=sparse_categorical_crossentropy,
              optimizer=optimizer,
              metrics=[sparse_accuracy])


def extract_answer(question, context, max_a_len=32):
    """Extract the answer span for a question from the context."""
    max_q_len = 64
    q_token_ids = tokenizer.encode(question, maxlen=max_q_len)[0]
    c_token_ids = tokenizer.encode(context,
                                   maxlen=maxlen - len(q_token_ids) + 1)[0]
    token_ids = q_token_ids + c_token_ids[1:]
    segment_ids = [0] * len(q_token_ids) + [1] * (len(c_token_ids) - 1)
    c_tokens = tokenizer.tokenize(context)[1:-1]
    mapping = tokenizer.rematch(context, c_tokens)
    token_ids = np.array([token_ids])  # TF 2.x requires an np.array here
        print(u'val_acc: %.5f, best_val_acc: %.5f\n' %
              (val_acc, self.best_val_acc))


# load the pretrained model (3 layers)
bert = build_transformer_model(config_path=config_path,
                               checkpoint_path=checkpoint_path,
                               return_keras_model=False,
                               num_hidden_layers=3,
                               prefix='Successor-')

x = Lambda(lambda x: x[:, 0])(bert.output)
x = Dense(units=num_classes, activation='softmax')(x)
model = Model(bert.inputs, x)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdaBelief(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
model.summary()

if __name__ == '__main__':
    # train
    evaluator = Evaluator('best_model.weights')
    model.fit_generator(train_generator.generator(),
                        steps_per_epoch=len(train_generator),
                        epochs=5,
                        callbacks=[evaluator])
else:
    model.load_weights('best_model.weights')
                               return_keras_model=False)

label_inputs = Input(shape=(None,), name='label_inputs')
pooler = bert.model.outputs[0]
classification_output = Dense(units=num_classes,
                              activation='softmax',
                              name='classifier')(pooler)
classifier = Model(bert.model.inputs, classification_output)
seq2seq = Model(bert.model.inputs, bert.model.outputs[1])

outputs = TotalLoss([2])(bert.model.inputs + bert.model.outputs)
# outputs = Dense(num_classes, activation='softmax')(outputs)
train_model = Model(bert.model.inputs, [classification_output, outputs])
# the second output's loss is added inside TotalLoss, hence None here
train_model.compile(loss=['sparse_categorical_crossentropy', None],
                    optimizer=Adam(1e-5),
                    metrics=['acc'])
train_model.summary()


def evaluate(val_data=valid_generator):
    total = 0.
    right = 0.
    for x, y_true in tqdm(val_data):
        y_pred = classifier.predict(x).argmax(axis=-1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    print(total, right)
    return right / total
# load the pretrained model (12 layers)
predecessor = build_transformer_model(config_path=config_path,
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# classifier head
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()

# predecessor_model_3: classify from the 3rd transformer layer's output
output = predecessor_model.layers[31].output  # 3rd transformer layer
output = Lambda(lambda x: x[:, 0])(output)
dense = ScaleDense(lr_multiplier=5,
                   units=num_classes,
                   activation='softmax',
                   weights=predecessor_model.layers[-1].get_weights())
output = dense(output)
predecessor_3_model = Model(predecessor_model.inputs, output)
predecessor_3_model.compile(
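    # (call truncated in the original; plausible arguments, mirroring
    # predecessor_model.compile above; an assumption, not the original)
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(1e-5),
    metrics=['sparse_categorical_accuracy'],
)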
        y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


# build model
model = build_transformer_model(config_path,
                                checkpoint_path,
                                application='unilm',
                                keep_tokens=keep_tokens)

output = CrossEntropy(2)(model.inputs + model.outputs)
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()


class QuestionAnswerGenerator(AutoRegressiveDecoder):
    """seq2seq decoder"""
    @AutoRegressiveDecoder.wraps('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
        ret = model.predict([token_ids, segment_ids])[:, -1]
        return ret
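    # generate() is not shown in this excerpt; a plausible version, mirroring
    # QuestionGenerator in the sibling example (maxlen and the id arguments
    # below are assumptions):
    def generate(self, passage, topk=2):
        token_ids, segment_ids = tokenizer.encode(passage, maxlen=maxlen)
        output_ids = self.beam_search([token_ids, segment_ids], topk)
        return tokenizer.decode(output_ids)


qa_generator = QuestionAnswerGenerator(start_id=None,
                                       end_id=tokenizer._token_end_id,
                                       maxlen=64)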
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * y_mask) / K.sum(y_mask)
        self.add_metric(acc, name='acc')
        return loss


model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                with_mlm=True)

target_in = Input(shape=(None,))
output = CrossEntropy(1)([target_in, model.output])

train_model = Model(model.inputs + [target_in], output)
train_model.compile(optimizer=Adam(1e-5))
train_model.summary()


def evaluate(data):
    # token ids of the label words, without [CLS]/[SEP]
    label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])
    # print(label_ids)
    total, right = 0., 0.
    for x, _ in tqdm(data):
        x, y_true = x[:2], x[2]
        y_pred = model.predict(x)[:, mask_idx]
        # joint probability of the two [MASK] positions for each label word
        y_pred = y_pred[:, 0, label_ids[:, 0]] * y_pred[:, 1, label_ids[:, 1]]
        y_pred = y_pred.argmax(axis=1)
        y_true = np.array(
            [labels.index(tokenizer.decode(y)) for y in y_true[:, mask_idx]])
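        # (function truncated in the original; a plausible ending, mirroring
        # the other evaluate helpers in this collection; an assumption)
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total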
                                with_mlm=True,
                                # model='bert',  # to load BERT/RoBERTa/ERNIE
                                model='nezha')

target_in = Input(shape=(None,))
output = CrossEntropy(1)([target_in, model.output])
train_model = Model(model.inputs + [target_in], output)

# Adam with weight decay and gradient accumulation
AdamW = extend_with_weight_decay(Adam)
AdamWG = extend_with_gradient_accumulation(AdamW)
opt = AdamWG(learning_rate=1e-5,
             exclude_from_weight_decay=['Norm', 'bias'],
             grad_accum_steps=4)
train_model.compile(optimizer=opt)
train_model.summary()

# token ids of the label words, without [CLS]/[SEP]
label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])


def predict(x):
    if len(x) == 3:
        x = x[:2]
    y_pred = model.predict(x)[:, mask_idx]
    y_pred = y_pred[:, 0, label_ids[:, 0]]
    y_pred = y_pred.argmax(axis=1)
    return y_pred


def evaluate(data):
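    # (body truncated in the original; a plausible completion, mirroring
    # evaluate in the sibling MLM example; an assumption, not the original)
    total, right = 0., 0.
    for x, _ in tqdm(data):
        y_pred = predict(x)
        y_true = np.array(
            [labels.index(tokenizer.decode(y)) for y in x[2][:, mask_idx]])
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total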
# create the optimizer before building the model
opt = Adam(lr)
# enable mixed precision (graph-rewrite API; deprecated in newer TF in
# favour of tf.keras.mixed_precision)
opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)

# build model
bert = build_transformer_model(config_path=config_path,
                               checkpoint_path=checkpoint_path,
                               num_hidden_layers=num_hidden_layers)
output = Lambda(lambda x: x[:, 0])(bert.output)
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.inputs, output)
model.summary()
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['acc'])


def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in tqdm(data):
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


class Evaluator(keras.callbacks.Callback):
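    # (body truncated in the original; a plausible completion, mirroring the
    # Evaluator callbacks in the sibling examples; `valid_generator` is an
    # assumption, not the original code)
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.weights')
        print(u'val_acc: %.5f, best_val_acc: %.5f\n' %
              (val_acc, self.best_val_acc))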
    y_true = K.one_hot(y_true, K.shape(y_pred)[2])
    # compute cross entropy
    return K.mean(K.categorical_crossentropy(y_true, y_pred))


def sparse_accuracy(y_true, y_pred):
    # re-establish y_true's shape and dtype explicitly
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # compute accuracy
    y_pred = K.cast(K.argmax(y_pred, axis=2), 'int32')
    return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx()))


model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(learning_rate),
              metrics=[sparse_accuracy])


def extract_answer(question, context, max_a_len=16):
    """Extract the answer span for a question from the context."""
    max_q_len = 48
    q_token_ids = tokenizer.encode(question, maxlen=max_q_len)[0]
    c_token_ids = tokenizer.encode(context,
                                   maxlen=maxlen - len(q_token_ids) + 1)[0]
    token_ids = q_token_ids + c_token_ids[1:]
    segment_ids = [0] * len(q_token_ids) + [1] * (len(c_token_ids) - 1)
    c_tokens = tokenizer.tokenize(context)[1:-1]
    mapping = tokenizer.rematch(context, c_tokens)
    probas = model.predict([[token_ids], [segment_ids]])[0]