def call(self, inputs, mask=None, a_mask=None, position_bias=None):
    """Multi-head attention.
    :param inputs: [q, k, v, a_mask, position_bias]
    :param mask: the Keras masks of the inputs, i.e. [q_mask, k_mask, v_mask];
        q_mask masks the query sequence (padding positions), v_mask masks the
        value sequence so that padded positions cannot be attended to
    :param a_mask: Boolean, whether to apply an attention mask
    :param position_bias: type of position bias; shifts the attention scores
        with the specified kind of positional encoding
    :return:
    """
    q, k, v = inputs[:3]
    q_mask, v_mask, idx = None, None, 3
    if mask is not None:
        if mask[0] is not None:
            q_mask = K.cast(mask[0], K.floatx())
        if mask[2] is not None:
            v_mask = K.cast(mask[2], K.floatx())
    if a_mask is not None:
        a_mask = inputs[idx]
        idx += 1
    # linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, [-1, K.shape(q)[1], self.head_nums, self.key_size])
    kw = K.reshape(kw, [-1, K.shape(k)[1], self.head_nums, self.key_size])
    vw = K.reshape(vw, [-1, K.shape(v)[1], self.head_nums, self.head_size])
    # attention scores
    att = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # relative position bias on the scores
    if position_bias == 'relative':
        position_embeddings = inputs[idx]
        att = att + tf.einsum('bjhd,jkd->bhjk', qw, position_embeddings)
    if self.attention_scale:
        att = att / self.key_size ** 0.5
    # value mask
    att = sequence_masking(att, v_mask, 'add', -1)
    # attention mask
    if a_mask is not None:
        att = att - (1 - a_mask) * 1e12
    att = K.softmax(att)
    output = tf.einsum('bhjk,bkhd->bjhd', att, vw)
    # relative position bias on the values
    if position_bias == 'relative':
        output = output + tf.einsum('bhjk,jkd->bjhd', att, position_embeddings)
    output = K.reshape(output, (-1, K.shape(output)[1], self.output_dim))
    output = self.combine_dense(output)
    # query mask
    output = sequence_masking(output, q_mask, 'mul')
    return output
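As a sanity check on the two einsum patterns above, here is a minimal standalone sketch; all shapes and sizes are illustrative and not part of the layer.

import tensorflow as tf

batch, q_len, k_len, heads, key_size, head_size = 2, 5, 7, 4, 16, 16
qw = tf.random.normal([batch, q_len, heads, key_size])
kw = tf.random.normal([batch, k_len, heads, key_size])
vw = tf.random.normal([batch, k_len, heads, head_size])

att = tf.einsum('bjhd,bkhd->bhjk', qw, kw) / key_size ** 0.5  # (2, 4, 5, 7)
att = tf.nn.softmax(att, axis=-1)                             # normalize over the key axis
out = tf.einsum('bhjk,bkhd->bjhd', att, vw)                   # (2, 5, 4, 16)
print(att.shape, out.shape)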
def sparse_accuracy(self, y_true, y_pred):
    """Per-token accuracy shown during training, with masked positions excluded.
    Here y_true must be integer labels (not one-hot).
    """
    # derive the mask and cast to float
    mask = K.all(K.greater(y_pred, -1e6), axis=2)
    mask = K.cast(mask, K.floatx())
    # re-assert the shape and dtype of y_true
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # take the per-token argmax as a rough measure of training progress
    y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
    isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
    return K.sum(isequal * mask) / K.sum(mask)
def get_labels_of_similarity(self, inputs):
    # the batch is assumed to consist of consecutive positive pairs
    # (0, 1), (2, 3), ...; i + 1 - i % 2 * 2 maps even i to i + 1 and
    # odd i to i - 1, i.e. to its partner
    idx = K.arange(0, K.shape(inputs)[0])
    idx_1 = idx[None, :]
    idx_2 = (idx + 1 - idx % 2 * 2)[:, None]
    labels = K.equal(idx_1, idx_2)
    labels = K.cast(labels, K.floatx())
    return labels
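A quick NumPy illustration of the resulting label matrix for a batch of four samples arranged as two positive pairs:

import numpy as np

idx = np.arange(4)
partner = idx + 1 - idx % 2 * 2        # [1, 0, 3, 2]: even i -> i + 1, odd i -> i - 1
labels = (idx[None, :] == partner[:, None]).astype('float32')
print(labels)
# [[0. 1. 0. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 1.]
#  [0. 0. 1. 0.]]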
def sparse_accuracy(y_true, y_pred):
    # re-assert the shape and dtype of y_true
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # compute accuracy
    y_pred = K.cast(K.argmax(y_pred, axis=2), 'int32')
    return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx()))
def get_updates(self, loss, params):
    # cond is 1 only on steps where the accumulated update is applied
    cond = K.equal(self.iterations % self.grad_accum_steps, 0)
    cond = K.cast(cond, K.floatx())
    # fetch gradients
    grads = self.get_gradients(loss, params)
    self.accum_grads = [
        K.zeros(shape=K.int_shape(p), dtype=K.dtype(p), name='accum_grad_{}'.format(i))
        for i, p in enumerate(params)
    ]

    # temporarily monkey-patch K.update so that the base optimizer's
    # parameter updates only take effect when cond == 1
    old_update = K.update

    def new_update(x, new_x):
        new_x = cond * new_x + (1 - cond) * x
        return old_update(x, new_x)

    K.update = new_update
    updates = super(NewOptimizer, self).get_updates(loss, params)
    K.update = old_update

    # accumulate gradients; on application steps the accumulator resets to g
    with K.control_dependencies(updates):
        acc_updates = [
            K.update(ag, g + (1 - cond) * ag)
            for ag, g in zip(self.accum_grads, grads)
        ]

    return acc_updates
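A hedged usage sketch: in the bert4keras/toolkit4nlp style, this logic is exposed through a factory such as extend_with_gradient_accumulation (imported in the pretraining script below); the learning rate and grad_accum_steps values here are illustrative.

AdamAcc = extend_with_gradient_accumulation(Adam)
optimizer = AdamAcc(learning_rate=1e-5, grad_accum_steps=4)
# model.compile(optimizer=optimizer, loss=...)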
def get_labels_of_similarity(self, y_pred):
    idxs = K.arange(0, K.shape(y_pred)[0])
    idxs_1 = idxs[None, :]
    idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
    labels = K.equal(idxs_1, idxs_2)
    labels = K.cast(labels, K.floatx())
    return labels
def parse_func(serialized_record):
    feature_description = {
        'token_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
        'mask_ids': tf.io.FixedLenFeature([seq_length], tf.int64)
    }
    features = tf.io.parse_single_example(serialized_record, feature_description)
    token_ids = features['token_ids']
    mask_ids = features['mask_ids']
    segment_ids = K.zeros_like(token_ids, dtype='int64')
    is_masked = K.not_equal(mask_ids, 0)
    # the stored ids were shifted up by one so that 0 could mean "not masked";
    # subtract 1 here to recover the original token ids
    masked_token_ids = K.switch(mask_ids, mask_ids - 1, token_ids)
    x = {
        'Input-Token': masked_token_ids,
        'Input-Segment': segment_ids,
        'token_ids': token_ids,
        'is_masked': K.cast(is_masked, K.floatx())
    }
    y = {
        'mlm_loss': K.zeros_like([1], tf.float32),
        'mlm_acc': K.zeros_like([1], tf.float32)
    }
    return x, y
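A minimal input-pipeline sketch around parse_func; file_names, seq_length, and batch_size are taken from the pretraining script header below, and the shuffle buffer size is illustrative.

dataset = (tf.data.TFRecordDataset(file_names)
           .map(parse_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
           .shuffle(10000)
           .batch(batch_size)
           .prefetch(tf.data.experimental.AUTOTUNE))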
def compute_position_ids(self, inputs):
    """T5-style relative position bucketing (translated directly from the official T5 source).
    i-j:    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...
    f(i-j): 0 1 2 3 4 5 6 7 8 8 8  8  9  9  9  ...
    """
    q, v = inputs
    # position differences
    q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
    q_idxs = K.expand_dims(q_idxs, 1)
    v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
    v_idxs = K.expand_dims(v_idxs, 0)
    pos_ids = v_idxs - q_idxs
    # bucketing
    num_buckets, max_distance = self.input_dim, self.max_distance
    ret = 0
    n = -pos_ids
    if self.bidirectional:
        num_buckets //= 2
        ret += K.cast(K.less(n, 0), 'int32') * num_buckets
        n = K.abs(n)
    else:
        n = K.maximum(n, 0)
    # now n is in the range [0, inf)
    max_exact = num_buckets // 2
    is_small = K.less(n, max_exact)
    val_if_large = max_exact + K.cast(
        K.log(K.cast(n, K.floatx()) / max_exact) /
        np.log(max_distance / max_exact) * (num_buckets - max_exact),
        'int32',
    )
    val_if_large = K.minimum(val_if_large, num_buckets - 1)
    ret += K.switch(is_small, n, val_if_large)
    return ret
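A NumPy re-implementation sketch of the unidirectional branch, reproducing the docstring mapping; num_buckets=16 and max_distance=128 are illustrative values consistent with that table.

import numpy as np

def t5_bucket(n, num_buckets=16, max_distance=128):
    n = np.maximum(n, 0)
    max_exact = num_buckets // 2
    safe_n = np.maximum(n, 1)  # avoid log(0); those entries use the small branch anyway
    val_if_large = max_exact + (
        np.log(safe_n / max_exact) / np.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).astype('int32')
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    return np.where(n < max_exact, n, val_if_large)

print(t5_bucket(np.arange(15)))
# [0 1 2 3 4 5 6 7 8 8 8 8 9 9 9]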
def get_label_mask(self, y_true):
    """Mask selecting pairs of samples within the batch that share a label."""
    label = K.cast(y_true, 'int32')
    label_2 = K.reshape(label, (1, -1))
    mask = K.equal(label_2, label)
    mask = K.cast(mask, K.floatx())
    # zero out the diagonal, i.e. the i == j pairs
    mask = mask * (1 - K.eye(K.shape(y_true)[0]))
    return mask
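A quick NumPy illustration with labels [0, 1, 0, 2]: only the (0, 2) and (2, 0) pairs survive once the diagonal is removed.

import numpy as np

label = np.array([[0], [1], [0], [2]])          # shape (batch, 1)
mask = (label.T == label).astype('float32')
mask = mask * (1 - np.eye(4, dtype='float32'))  # zero out i == j
print(mask)
# [[0. 0. 1. 0.]
#  [0. 0. 0. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 0.]]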
def mlm_acc(inputs):
    """Accuracy function; needs to be wrapped as a layer.
    """
    y_true, y_pred, mask = inputs
    # _, y_pred = y_pred
    y_true = K.cast(y_true, K.floatx())
    acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
    return acc
def compute_loss(self, inputs, mask=None):
    y_true, y_pred = inputs
    y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
    self.add_metric(accuracy, name='accuracy')
    loss = K.sparse_categorical_crossentropy(y_true, y_pred)
    loss = K.sum(loss * y_mask) / K.sum(y_mask)
    return loss
def call(self, x, mask=None):
    # attention pooling: score each timestep, softmax over the time axis,
    # then take the weighted sum of the inputs
    x0 = x
    x = self.k_dense(x0)
    x = self.o_dense(x)
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
        x = x - (1 - mask) * 1e12  # push padded positions to ~zero weight
    x = K.softmax(x, 1)
    x = K.sum(x0 * x, 1)
    return x
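A standalone sketch of the same pooling idea with stock Keras layers; the hidden width and tanh activation are assumptions standing in for k_dense/o_dense.

import tensorflow as tf
from tensorflow.keras import layers

x0 = tf.random.normal([2, 6, 16])                                 # (batch, time, dim)
scores = layers.Dense(1)(layers.Dense(8, activation='tanh')(x0))  # (2, 6, 1)
weights = tf.nn.softmax(scores, axis=1)                           # normalize over time
pooled = tf.reduce_sum(x0 * weights, axis=1)                      # (2, 16)
print(pooled.shape)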
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    # bias-corrected learning rate
    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))

    ms = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
        for (i, p) in enumerate(params)
    ]
    vs = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
        for (i, p) in enumerate(params)
    ]
    if self.amsgrad:
        vhats = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
            for (i, p) in enumerate(params)
        ]
    else:
        vhats = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        # note: the second moment tracks (g - m_t)^2 rather than g^2,
        # i.e. an AdaBelief-style estimate of the gradient variance
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g - m_t)

        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # apply constraints
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def call(self, x, mask=None):
    x0 = x
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
    # x = x0 * mask if mask is not None else x0
    # drop the Keras mask so it is not propagated into conv1d
    x0 = Lambda(lambda x_: x_, output_shape=lambda s: s)(x0)
    x = self.conv1d(x0)
    # split into content and gate halves
    x, g = x[:, :, :self.o_dim], x[:, :, self.o_dim:]
    if self.dropout_rate is not None:
        g = K.in_train_phase(K.dropout(g, self.dropout_rate), g)
    g = K.sigmoid(g)
    # fall back to an all-ones mask when none is given
    mask = mask if mask is not None else K.ones_like(x)
    if self.skip_connection:
        if K.int_shape(x0)[-1] != self.o_dim:
            x0 = self.conv1d_1x1(x0)
        return (x0 * (1 - g) + x * g) * mask
    return x * g * mask
def dense_loss(self, y_true, y_pred):
    """y_true must be in one-hot form.
    """
    # derive the mask and cast to float
    mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
    mask = K.cast(mask, K.floatx())
    # target path score
    y_true, y_pred = y_true * mask, y_pred * mask
    target_score = self.path_score(y_pred, y_true)
    # compute log Z recursively
    init_states = [y_pred[:, 0]]
    y_pred = K.concatenate([y_pred, mask], axis=2)
    input_length = K.int_shape(y_pred[:, 1:])[1]
    log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states,
                           input_length=input_length)  # log Z vector at the final step
    log_norm = K.logsumexp(log_norm, 1)  # reduce to a scalar log Z per sample
    # loss = -log p = log Z - target score
    return log_norm - target_score
def build_transformer_model_with_mlm():
    """BERT model with an MLM head.
    """
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        # with_nsp=True,
        model='bert',
        return_keras_model=False,
        # keep_tokens=keep_tokens
    )
    proba = bert.model.output

    # auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None,), dtype=K.floatx(), name='is_masked')  # mask flags
    # nsp_label = Input(shape=(None, ), dtype='int64', name='nsp')  # nsp

    def mlm_loss(inputs):
        """Loss function; needs to be wrapped as a layer.
        """
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """NSP loss function; needs to be wrapped as a layer.
        """
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """Accuracy function; needs to be wrapped as a layer.
        """
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """Accuracy function; needs to be wrapped as a layer.
        """
        y_true, y_pred = inputs
        y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    # nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    # nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        # 'nsp_loss': lambda y_true, y_pred: y_pred,
        # 'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }
    return bert, train_model, loss
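A hedged training sketch for the returned triple: each named output needs a target, and the y dict produced by parse_func above supplies matching zero placeholders since the lambda losses ignore y_true. The optimizer settings are illustrative.

bert, train_model, loss = build_transformer_model_with_mlm()
train_model.compile(loss=loss, optimizer=Adam(learning_rate=5e-5))
# train_model.fit(dataset, steps_per_epoch=1000, epochs=10)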
def build_transformer_model_with_mlm(version='pre'):
    """BERT model with an MLM head; version selects pre-LN, post-LN, or ReZero.
    """
    assert version in ['pre', 'post', 'rezero']
    if version == 'rezero':
        attention_name = 'Transformer-%d-MultiHeadSelfAttention'
        feed_forward_name = 'Transformer-%d-FeedForward'
        # the Norm/ReWeight weights of every layer are excluded when loading
        # from the checkpoint
        skip_weights = []
        for i in range(12):
            skip_weights.append(feed_forward_name % i + '-Norm')
            skip_weights.append(feed_forward_name % i + '-ReWeight')
            skip_weights.append(attention_name % i + '-Norm')
            skip_weights.append(attention_name % i + '-ReWeight')

        bert = build_transformer_model(
            config_path,
            with_mlm='linear',
            model='rezero',
            return_keras_model=False,
            skip_weights_from_checkpoints=skip_weights,
            use_layernorm=None,
            reweight_trainable=True,
            init_reweight=0.,
        )
    else:
        bert = build_transformer_model(
            config_path,
            with_mlm='linear',
            model='rezero',
            return_keras_model=False,
            # skip_weights_from_checkpoints=skip_weights,
            use_layernorm=version,
            reweight_trainable=False,
            init_reweight=1.,
        )
    proba = bert.model.output

    # auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None,), dtype=K.floatx(), name='is_masked')  # mask flags
    # nsp_label = Input(shape=(None, ), dtype='int64', name='nsp')  # nsp

    def mlm_loss(inputs):
        """Loss function; needs to be wrapped as a layer.
        """
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """NSP loss function; needs to be wrapped as a layer.
        """
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """Accuracy function; needs to be wrapped as a layer.
        """
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """Accuracy function; needs to be wrapped as a layer.
        """
        y_true, y_pred = inputs
        y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    # nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    # nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        # 'nsp_loss': lambda y_true, y_pred: y_pred,
        # 'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }
    return bert, train_model, loss
def normal_shannon_entropy(p, labels_num=num_classes):
    # normalized entropy: sum(p * log p) divided by log(1 / labels_num),
    # giving a value in [0, 1]
    p = K.cast(p, K.floatx())
    norm = K.log(1. / labels_num)
    s = K.sum(p * K.log(p), axis=-1, keepdims=True)
    return s / norm
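A quick NumPy check (num_classes=4 is illustrative): the uniform distribution has normalized entropy 1 while a peaked one is near 0. The epsilon guards log(0) for one-hot inputs, which the Keras version above does not.

import numpy as np

def normal_shannon_entropy_np(p, labels_num=4, eps=1e-12):
    norm = np.log(1. / labels_num)
    s = np.sum(p * np.log(p + eps), axis=-1, keepdims=True)
    return s / norm

print(normal_shannon_entropy_np(np.array([0.25, 0.25, 0.25, 0.25])))  # ~[1.]
print(normal_shannon_entropy_np(np.array([0.97, 0.01, 0.01, 0.01])))  # ~[0.12]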
import os
os.environ['TF_KERAS'] = '1'  # tf.keras is required

import glob
import tensorflow as tf
from toolkit4nlp.backend import K, keras
from toolkit4nlp.models import build_transformer_model
from toolkit4nlp.optimizers import Adam, extend_with_gradient_accumulation, extend_with_wight_decay
from toolkit4nlp.optimizers import extend_with_piecewise_linear_lr
from keras.models import Model
from toolkit4nlp.layers import Input, Lambda
from preprocess import TrainingDataSetRoBERTa

floatx = K.floatx()

# pretrained checkpoint and corpus paths
config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'
model_save_path = '../saved_model/bert_model.ckpt'
file_names = glob.glob('../corpus_record/*')

# training hyperparameters
seq_length = 512
batch_size = 8
learning_rate = 0.00176
weight_decay_rate = 0.01
num_warmup_steps = 3125
num_train_steps = 125000
def compute_classification_acc(self, inputs, mask=None):
    _, _, y_pred, _, y_true = inputs
    equal = K.equal(K.cast(K.argmax(y_pred, axis=-1), 'int32'),
                    K.cast(y_true, 'int32'))
    # per-sample correctness divided by the batch size; summing over the
    # batch therefore yields the batch accuracy
    return K.cast(equal, K.floatx()) / K.cast(K.shape(y_true)[0], K.floatx())
def call(self, inputs, mask=None):
    # this layer only computes a loss; the inputs pass through unchanged
    if mask is not None:
        mask = K.cast(mask, K.floatx())
    return sequence_masking(inputs, mask, 1, 1)
def call(self, inputs, mask=None):
    # masked softmax over the sequence axis: padded positions get ~zero weight
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
        inputs = inputs - (1.0 - mask) * 1e12
    return K.softmax(inputs, 1)
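A tiny NumPy check of the masked softmax over axis 1: the padded position receives (numerically) zero weight.

import numpy as np

x = np.array([[[1.0], [2.0], [3.0]]])          # (batch=1, time=3, features=1)
mask = np.array([[1.0, 1.0, 0.0]])[..., None]  # the last step is padding
z = x - (1.0 - mask) * 1e12
w = np.exp(z - z.max(axis=1, keepdims=True))
w = w / w.sum(axis=1, keepdims=True)
print(w[..., 0])  # ~[[0.269, 0.731, 0.0]]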