)
output = Lambda(lambda x: x[:, 0])(bert.output)  # [CLS] vector
y_in = Input(shape=(None,))  # labels fed in as an extra input for the loss layers
# scale_output = Dense(256, kernel_initializer=bert.initializer)(output)
# logits = Dense(num_classes)(output)
scl_output = SupervisedContrastiveLearning(alpha=0.05, T=0.05, output_idx=0)([output, y_in])
clf_output = Dense(num_classes, activation='softmax')(output)
clf_ce = CrossEntropy(output_idx=0, alpha=0.95)([clf_output, y_in])

model = Model(bert.inputs, clf_output)
model.summary()

train_model = Model(bert.inputs + [y_in], [scl_output, clf_ce])
train_model.compile(optimizer=Adam(lr))

if __name__ == '__main__':
    evaluator = Evaluator()
    train_model.fit_generator(train_generator.generator(),
                              steps_per_epoch=len(train_generator),
                              epochs=epochs,
                              callbacks=[evaluator])

    # t-SNE visualization of the [CLS] features
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt
    f = K.function(bert.inputs, output)
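    # A minimal visualization sketch (not part of the original script): project the
    # [CLS] features onto 2D with t-SNE and scatter-plot them coloured by label.
    # Assumptions: `valid_generator` is built like `train_generator` and yields
    # ([token_ids, segment_ids], labels) batches; a Model wrapper is used instead
    # of the K.function handle above so that predict() returns a plain array.
    import numpy as np
    encoder = Model(bert.inputs, output)
    feats, labels = [], []
    for x_true, y_true in valid_generator:
        feats.append(encoder.predict(x_true))
        labels.append(y_true[:, 0])
    feats = np.concatenate(feats, axis=0)
    labels = np.concatenate(labels, axis=0)
    emb_2d = TSNE(n_components=2).fit_transform(feats)
    plt.scatter(emb_2d[:, 0], emb_2d[:, 1], c=labels, s=3)
    plt.savefig('tsne.png')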
teacher = build_transformer_model(config_path=config_path,
                                  checkpoint_path=checkpoint_path,
                                  return_keras_model=False,
                                  num_hidden_layers=num_hidden_layers,
                                  model='bert')

# classifier head
x_in = Input(shape=K.int_shape(teacher.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

teacher_model = Model(teacher.inputs, classifier(teacher.output))
teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
teacher_model.summary()


class FastbertClassifierLayer(Layer):
    """Classification layer used in FastBERT: to keep the classifier expressive
    while limiting its parameter count, the authors use a transformer with a
    smaller hidden size.
    """
    def __init__(self, labels_num, hidden_size=128, head_nums=2, head_size=64,
model = build_transformer_model(config_path,
                                checkpoint_path,
                                application='unilm',
                                keep_tokens=keep_tokens)
model.summary()

# train model
o_inputs = Input(shape=(None,))
train_model = Model(model.inputs + [o_inputs], model.outputs + [o_inputs])

y_true = train_model.inputs[2][:, 1:]   # target tokens, shifted by one position
y_mask = train_model.inputs[1][:, 1:]   # segment ids mark the positions to be predicted
y_pred = train_model.outputs[0][:, :-1]
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

train_model.add_loss(cross_entropy)
train_model.compile(Adam(1e-5))


class QuestionGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
        ret = model.predict([token_ids, segment_ids])[:, -1]
        return ret

    def generate(self, context, answer, topk=2, random=False):
# Load the pre-trained model (3 layers)
successor = build_transformer_model(config_path=config_path,
                                    checkpoint_path=checkpoint_path,
                                    return_keras_model=False,
                                    num_hidden_layers=3,
                                    prefix='Successor-')

# classifier head (shared by predecessor and successor)
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Dense(num_labels)(x_in)
CRF = ConditionalRandomField(lr_multiplier=2)
x = CRF(x)
classifier = Model(x_in, x)

opt = Adam(learning_rate=lr)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.outputs))
predecessor_model.compile(
    loss=predecessor_model.layers[-1].layers[-1].sparse_loss,
    optimizer=opt,
    metrics=[CRF.sparse_accuracy])
predecessor_model.summary()

successor_model = Model(successor.inputs, classifier(successor.outputs))
successor_model.compile(loss=successor_model.layers[-1].layers[-1].sparse_loss,
                        optimizer=opt,
                        metrics=[CRF.sparse_accuracy])
successor_model.summary()
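# A minimal decoding sketch (not from the original file): read the learned
# transition matrix out of the shared CRF layer and decode label paths with
# bert4keras's ViterbiDecoder (bert4keras.snippets). `decode_batch` and its
# arguments are illustrative names, not part of the repo.
from bert4keras.snippets import ViterbiDecoder

trans = K.eval(CRF.trans)             # (num_labels, num_labels) transition scores
decoder = ViterbiDecoder(trans=trans)

def decode_batch(tagger, token_ids, segment_ids):
    """Return the best label path for every sequence in the batch."""
    nodes = tagger.predict([token_ids, segment_ids])  # (batch, seq_len, num_labels) emissions
    return [decoder.decode(n) for n in nodes]

# e.g. paths = decode_batch(successor_model, batch_token_ids, batch_segment_ids)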
                               return_keras_model=False)

label_inputs = Input(shape=(None,), name='label_inputs')

pooler = bert.model.outputs[0]
classification_output = Dense(units=num_classes,
                              activation='softmax',
                              name='classifier')(pooler)
classifier = Model(bert.model.inputs, classification_output)

seq2seq = Model(bert.model.inputs, bert.model.outputs[1])

outputs = TotalLoss([2])(bert.model.inputs + bert.model.outputs)
# outputs = Dense(num_classes, activation='softmax')(outputs)

train_model = Model(bert.model.inputs, [classification_output, outputs])
train_model.compile(loss=['sparse_categorical_crossentropy', None],
                    optimizer=Adam(1e-5),
                    metrics=['acc'])
train_model.summary()


def evaluate(val_data=valid_generator):
    total = 0.
    right = 0.
    for x, y_true in tqdm(val_data):
        y_pred = classifier.predict(x).argmax(axis=-1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    print(total, right)
    return right / total
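# A minimal training sketch (not part of the original script): the second output's
# loss is None (its loss is assumed to be added inside the TotalLoss layer), so the
# generator only needs to supply targets for the classification output. Assumes a
# `train_generator` yielding ([token_ids, segment_ids], labels) and an `epochs`
# setting like the other classification scripts in this repo.
if __name__ == '__main__':
    train_model.fit_generator(train_generator.generator(),
                              steps_per_epoch=len(train_generator),
                              epochs=epochs)
    print('val acc: %.5f' % evaluate())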
# Load the pre-trained model (12 layers)
predecessor = build_transformer_model(config_path=config_path,
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# classifier head
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()

# predecessor_model_3
output = predecessor_model.layers[31].output  # output of the 3rd transformer layer
output = Lambda(lambda x: x[:, 0])(output)
dense = ScaleDense(lr_multiplier=5,
                   units=num_classes,
                   activation='softmax',
                   weights=predecessor_model.layers[-1].get_weights())
output = dense(output)
predecessor_3_model = Model(predecessor_model.inputs, output)
predecessor_3_model.compile(
train_generator = data_generator(data=train_data, batch_size=batch_size)
val_generator = data_generator(valid_data, batch_size)

# build model
bert = build_transformer_model(config_path=config_path,
                               checkpoint_path=checkpoint_path,
                               num_hidden_layers=num_hidden_layers)

output = Lambda(lambda x: x[:, 0])(bert.output)
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.inputs, output)
model.summary()

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(lr),
              metrics=['acc'])


def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in tqdm(data):
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


class Evaluator(keras.callbacks.Callback):
# teacher model (12 layers)
teacher = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=True,
    num_hidden_layers=12,
    prefix='Teacher-'
)

output = Lambda(lambda x: x[:, 0])(teacher.output)
logits = Dense(num_classes)(output)
soften = Activation(activation='softmax')(logits)

teacher_logits = Model(teacher.inputs, logits)
teacher_soften = Model(teacher.inputs, soften)
teacher_soften.compile(loss='categorical_crossentropy',
                       optimizer=Adam(2e-5),
                       metrics=['acc'])
teacher_soften.summary()


class StudentDataGenerator(DataGenerator):
    """data generator
    """
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_labels, batch_logits = [], [], [], []
        for is_end, (text, label, logits) in self.get_sample(shuffle):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(label)
              (val_acc, self.best_val_acc))


# teacher model (12 layers)
teacher = build_transformer_model(config_path=config_path,
                                  checkpoint_path=checkpoint_path,
                                  return_keras_model=True,
                                  num_hidden_layers=12,
                                  prefix='Teacher-')

output = Lambda(lambda x: x[:, 0])(teacher.output)
logits = Dense(num_classes)(output)
soften = Activation(activation='softmax')(logits)

teacher_logits = Model(teacher.inputs, logits)
teacher_soften = Model(teacher.inputs, soften)
teacher_soften.compile(loss='categorical_crossentropy',
                       optimizer=Adam(2e-5),
                       metrics=['acc'])
teacher_soften.summary()


class StudentDataGenerator(DataGenerator):
    """data generator
    """
    def __iter__(self):
        batch_token_ids, batch_segment_ids, batch_labels, batch_logits = [], [], [], []
        for is_end, (text, label, logits) in self.get_sample():
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(label)
        right += (y_true == y_pred).sum()
    return right / total


class Evaluator(keras.callbacks.Callback):
    def __init__(self, savename):
        self.best_val_acc = 0.
        self.savename = savename

    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator, self.model)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            self.model.save_weights(self.savename)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f\n' %
            (val_acc, self.best_val_acc)
        )


if __name__ == '__main__':
    evaluator = Evaluator('best_clf.weights')
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(1e-5),
                  metrics=['acc'])
    model.fit_generator(train_sim_generator.generator(),
                        steps_per_epoch=len(train_sim_generator) * 2,
                        epochs=5,
                        callbacks=[evaluator])
else:
    model.load_weights('best_clf.weights')
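# A minimal inference sketch (not part of the original script): after the best
# weights have been loaded, classify a single piece of text. Assumes `tokenizer`
# and `maxlen` are defined as in the training code; `predict_label` is an
# illustrative helper name.
def predict_label(text):
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    probas = model.predict([[token_ids], [segment_ids]])[0]
    return probas.argmax()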
        y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


# build model
model = build_transformer_model(config_path,
                                checkpoint_path,
                                application='unilm',
                                keep_tokens=keep_tokens)

output = CrossEntropy(2)(model.inputs + model.outputs)

model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()


class QuestionAnswerGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
        ret = model.predict([token_ids, segment_ids])[:, -1]
        return ret
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


train_generator = data_generator(data=train_data, batch_size=batch_size)
val_generator = data_generator(valid_data, batch_size)

# create the optimizer before building the model
opt = Adam(lr)
opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)  # enable mixed precision

# build model
bert = build_transformer_model(config_path=config_path,
                               checkpoint_path=checkpoint_path,
                               num_hidden_layers=num_hidden_layers)

output = Lambda(lambda x: x[:, 0])(bert.output)
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.inputs, output)
model.summary()

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['acc'])
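# A minimal training sketch (not part of the original script): with the
# graph-rewrite mixed-precision optimizer above, training is invoked exactly
# as in the FP32 scripts; dynamic loss scaling is handled inside the rewritten
# optimizer. `epochs` is assumed to be defined alongside `lr` and `batch_size`.
if __name__ == '__main__':
    model.fit_generator(train_generator.generator(),
                        steps_per_epoch=len(train_generator),
                        epochs=epochs)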
    y_true = K.one_hot(y_true, K.shape(y_pred)[2])
    # compute the cross entropy
    return K.mean(K.categorical_crossentropy(y_true, y_pred))


def sparse_accuracy(y_true, y_pred):
    # re-assert the shape and dtype of y_true
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # compute the accuracy
    y_pred = K.cast(K.argmax(y_pred, axis=2), 'int32')
    return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx()))


model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(learing_rate),
              metrics=[sparse_accuracy])


def extract_answer(question, context, max_a_len=16):
    """Answer extraction function
    """
    max_q_len = 48
    q_token_ids = tokenizer.encode(question, maxlen=max_q_len)[0]
    c_token_ids = tokenizer.encode(context,
                                   maxlen=maxlen - len(q_token_ids) + 1)[0]
    token_ids = q_token_ids + c_token_ids[1:]
    segment_ids = [0] * len(q_token_ids) + [1] * (len(c_token_ids) - 1)
    c_tokens = tokenizer.tokenize(context)[1:-1]
    mapping = tokenizer.rematch(context, c_tokens)
    probas = model.predict([[token_ids], [segment_ids]])[0]