def call(self, inputs, mask=None, **kwargs):
    """Run multi-head attention over ``inputs``.

    The first three entries of ``inputs`` are the query, key and value
    tensors; any remaining entries are handed on unchanged to
    ``self.pay_attention_to``. ``mask`` is indexed like ``inputs``:
    ``mask[0]`` is the query mask, applied to the final output, and
    ``mask[2]`` is the value mask, passed to the attention step (the
    original notes say it keeps attention from reading padding).
    """
    query, key, value = inputs[:3]

    # Masks are optional as a whole and per entry; cast the present ones.
    q_mask = v_mask = None
    if mask is not None:
        q_mask = None if mask[0] is None else K.cast(mask[0], K.floatx())
        v_mask = None if mask[2] is None else K.cast(mask[2], K.floatx())

    # Linear projections.
    qw = self.q_dense(query)
    kw = self.k_dense(key)
    vw = self.v_dense(value)

    # Split the projected features into heads.
    def split_heads(projected, reference, width):
        target = (-1, K.shape(reference)[1], self.heads, width)
        return K.reshape(projected, target)

    qw = split_heads(qw, query, self.key_size)
    kw = split_heads(kw, key, self.key_size)
    vw = split_heads(vw, value, self.head_size)

    # Attention itself is delegated.
    attended = self.pay_attention_to(
        [qw, kw, vw] + inputs[3:], [q_mask, v_mask], **kwargs
    )

    # Merge heads, project, and mask the query-side output.
    merged_dim = self.head_size * self.heads
    attended = K.reshape(attended, (-1, K.shape(attended)[1], merged_dim))
    attended = self.o_dense(attended)
    return sequence_masking(attended, q_mask, 0)
def basic_accuracy(self, y_true, y_pred, go_backwards=False):
    """Per-step accuracy shown during training, ignoring masked steps.

    ``y_true`` must hold integer labels (not one-hot). A step counts as
    valid when all of its ``y_pred`` scores are greater than -1e6.
    """
    # Derive the mask from the predictions and make it float.
    valid = K.cast(K.all(K.greater(y_pred, -1e6), axis=2), K.floatx())

    # Re-state the shape and dtype of the labels.
    y_true = K.cast(K.reshape(y_true, K.shape(y_pred)[:-1]), 'int32')

    # Optionally reverse the sequences (done for both parametrisations).
    if go_backwards:
        y_true, y_pred = self.reverse_sequence([y_true, y_pred], valid)

    if self.hidden_dim is None:
        # Full transition matrix; transposed when running backwards.
        trans = K.transpose(self.trans) if go_backwards else self.trans
        history = K.gather(trans, y_true)
    else:
        # Factorised transitions; the two factors swap roles backwards.
        if go_backwards:
            left, right = self.r_trans, self.l_trans
        else:
            left, right = self.l_trans, self.r_trans
        history = tf.einsum('bnd,kd->bnk', K.gather(left, y_true), right)

    # Shift the transition scores by one step; step 0 reuses its own scores.
    history = K.concatenate([y_pred[:, :1], history[:, :-1]], 1)
    blended = (y_pred + history) / 2
    predicted = K.cast(K.argmax(blended, 2), 'int32')
    hits = K.cast(K.equal(y_true, predicted), K.floatx())
    return K.sum(hits * valid) / K.sum(valid)
def call(self, inputs):
    """Build sinusoidal position embeddings and merge them with ``inputs``.

    When ``self.custom_position_ids`` is set, ``inputs`` is the pair
    ``[inputs, position_ids]`` and the second element supplies the
    position ids; otherwise ids ``0 .. seq_len - 1`` are generated.

    The result depends on ``self.merge_mode``: ``'add'`` returns
    ``inputs + embeddings``, ``'mul'`` returns ``inputs * (embeddings + 1)``,
    ``'zero'`` returns the embeddings alone, and any other value
    concatenates the embeddings onto ``inputs``.
    """
    if self.custom_position_ids:
        # Unpack before asking for the shape: K.shape must see the input
        # tensor, not the [inputs, position_ids] list.
        inputs, position_ids = inputs
        seq_len = K.shape(inputs)[1]
        if 'float' not in K.dtype(position_ids):
            position_ids = K.cast(position_ids, K.floatx())
    else:
        input_shape = K.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        position_ids = K.arange(0, seq_len, dtype=K.floatx())[None]

    # Frequencies 10000^(-2i / output_dim) for i in [0, output_dim // 2).
    indices = K.arange(0, self.output_dim // 2, dtype=K.floatx())
    indices = K.pow(10000.0, -2 * indices / self.output_dim)
    embeddings = tf.einsum('bn,d->bnd', position_ids, indices)
    # Interleave sin and cos along the feature axis.
    embeddings = K.stack([K.sin(embeddings), K.cos(embeddings)], axis=-1)
    embeddings = K.reshape(embeddings, (-1, seq_len, self.output_dim))

    if self.merge_mode == 'add':
        return inputs + embeddings
    elif self.merge_mode == 'mul':
        return inputs * (embeddings + 1.0)
    elif self.merge_mode == 'zero':
        return embeddings
    else:
        if not self.custom_position_ids:
            # Generated ids have a batch axis of 1; tile it so the
            # concatenation shapes agree.
            embeddings = K.tile(embeddings, [batch_size, 1, 1])
        return K.concatenate([inputs, embeddings])
def mlm_acc(inputs):
    """Accuracy restricted to masked positions; meant to be wrapped in a layer.

    ``inputs`` is ``(y_true, y_pred, is_masked)``. ``K.epsilon()`` is added
    to the denominator.
    """
    labels, predictions, is_masked = inputs
    weights = K.cast(is_masked, K.floatx())
    labels = K.cast(labels, K.floatx())
    hits = keras.metrics.sparse_categorical_accuracy(labels, predictions)
    total_hits = K.sum(hits * weights)
    return total_hits / (K.sum(weights) + K.epsilon())
def call(self, inputs, mask=None, a_mask=None, p_bias=None):
    """Multi-head attention with optional attention mask and position bias.

    Args:
        inputs: list whose first three entries are query, key and value.
            When ``a_mask`` is truthy the next entry is the attention-mask
            tensor; when ``p_bias`` names a relative-position scheme the
            entry after that holds the position embeddings.
        mask: optional per-input masks. ``mask[0]`` is the query mask,
            applied to the output; ``mask[2]`` is the value mask, applied
            to the attention scores.
        a_mask: truthy flag saying that an attention-mask tensor is present
            in ``inputs``. Any falsy value means "no attention mask".
        p_bias: ``'typical_relative'``, ``'t5_relative'`` or ``None``.

    Returns:
        The attention output after the output projection and query masking.
    """
    q, k, v = inputs[:3]
    q_mask, v_mask, n = None, None, 3
    if mask is not None:
        if mask[0] is not None:
            q_mask = K.cast(mask[0], K.floatx())
        if mask[2] is not None:
            v_mask = K.cast(mask[2], K.floatx())
    if a_mask:
        a_mask = inputs[n]
        n += 1
    else:
        # Normalise falsy flags (e.g. False) to None. The masking step
        # below tests `is not None`, so a leftover False would subtract
        # 1e12 from every score.
        a_mask = None
    # Linear projections.
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Split into heads.
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Raw attention scores.
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Position bias.
    if p_bias == 'typical_relative':
        pos_embeddings = inputs[n]
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    elif p_bias == 't5_relative':
        pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
        a = a + K.expand_dims(pos_embeddings, 0)
    # Scale, mask and normalise.
    if self.attention_scale:
        a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        # Push disallowed positions to a very negative score before softmax.
        a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Weighted sum of values (plus relative-position values if used).
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if p_bias == 'typical_relative':
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.head_size * self.heads))
    o = self.o_dense(o)
    # Mask the query-side output.
    o = sequence_masking(o, q_mask, 0)
    return o
def sparse_accuracy(self, y_true, y_pred):
    """Per-step accuracy shown during training, ignoring masked steps.

    ``y_true`` must hold integer labels (not one-hot). A step counts as
    valid when all of its ``y_pred`` scores are greater than -1e6.
    """
    # Derive the mask from the predictions and make it float.
    valid = K.cast(K.all(K.greater(y_pred, -1e6), axis=2), K.floatx())
    # Re-state the shape and dtype of the labels.
    labels = K.cast(K.reshape(y_true, K.shape(y_pred)[:-1]), 'int32')
    # Rough estimate: take the arg-max label at each step independently.
    predicted = K.cast(K.argmax(y_pred, 2), 'int32')
    hits = K.cast(K.equal(labels, predicted), K.floatx())
    return K.sum(hits * valid) / K.sum(valid)
def parse_function(serialized):
    """Parse one serialized example into model inputs and placeholder targets.

    ``mask_ids`` is 0 where a token is left alone; elsewhere it stores the
    replacement token id plus one.
    """
    schema = {
        name: tf.io.FixedLenFeature([sequence_length], tf.int64)
        for name in ('token_ids', 'mask_ids')
    }
    parsed = tf.io.parse_single_example(serialized, schema)
    token_ids, mask_ids = parsed['token_ids'], parsed['mask_ids']

    segment_ids = K.zeros_like(token_ids, dtype='int64')
    is_masked = K.not_equal(mask_ids, 0)
    # Masked positions take `mask_ids - 1`; others keep the original id.
    masked_token_ids = K.switch(is_masked, mask_ids - 1, token_ids)

    # Input-Token: fed directly to the BERT model.
    # Input-Segment: fed to the BERT model.
    x = {
        'Input-Token': masked_token_ids,
        'Input-Segment': segment_ids,
        'token_ids': token_ids,
        'is_masked': K.cast(is_masked, K.floatx()),
    }
    # Placeholder targets: a single zero per output name.
    y = {
        'mlm_loss': K.zeros([1]),
        'mlm_acc': K.zeros([1]),
    }
    return x, y
def get_labels_of_similarity(self, y_pred):
    """Build a (batch, batch) 0/1 matrix pairing adjacent samples.

    Row ``i`` has its single 1 in column ``i + 1`` for even ``i`` and in
    column ``i - 1`` for odd ``i``.
    """
    positions = K.arange(0, K.shape(y_pred)[0])
    columns = positions[None, :]
    # +1 for even indices, -1 for odd ones.
    partners = (positions + 1 - positions % 2 * 2)[:, None]
    return K.cast(K.equal(columns, partners), K.floatx())
def get_updates(self, loss, params):
    """Parent optimizer updates gated for gradient accumulation.

    ``cond`` is 1.0 when ``self.iterations % self.grad_accum_steps == 0``
    and 0.0 otherwise. While the parent builds its updates, ``K.update``
    is temporarily replaced so every assignment becomes
    ``cond * new + (1 - cond) * old``, i.e. it only takes effect when
    ``cond`` is 1. The accumulators are then set to
    ``g + (1 - cond) * ag``: restarted from the current gradient when
    ``cond`` is 1, incremented otherwise.

    Returns:
        The list of accumulator update ops, which run after the parent's
        updates.
    """
    # Update criterion.
    cond = K.equal(self.iterations % self.grad_accum_steps, 0)
    cond = K.cast(cond, K.floatx())

    old_update = K.update

    def new_update(x, new_x):
        new_x = cond * new_x + (1 - cond) * x
        return old_update(x, new_x)

    # K.update is global backend state: always restore it, even when the
    # parent raises, so a failure here cannot leave the patch installed.
    K.update = new_update
    try:
        updates = super(NewOptimizer, self).get_updates(loss, params)
    finally:
        K.update = old_update

    # Gradients and their per-parameter accumulators.
    grads = super(NewOptimizer, self).get_gradients(loss, params)
    accum_grads = [self.accum_grads[p] for p in params]

    # Accumulate only after the parent's updates have run.
    with tf.control_dependencies(updates):
        accum_updates = [
            K.update(ag, g + (1 - cond) * ag)
            for g, ag in zip(grads, accum_grads)
        ]

    return accum_updates
def basic_loss(self, y_true, y_pred, go_backwards=False):
    """Masked cross-entropy over scores blended with transition scores.

    ``y_true`` must hold integer labels (not one-hot). A step counts as
    valid when all of its ``y_pred`` scores are greater than -1e6.
    """
    # Derive the mask from the predictions and make it float.
    valid = K.cast(K.all(K.greater(y_pred, -1e6), axis=2), K.floatx())

    # Re-state the shape and dtype of the labels.
    y_true = K.cast(K.reshape(y_true, K.shape(y_pred)[:-1]), 'int32')

    # Optionally reverse the sequences (done for both parametrisations).
    if go_backwards:
        y_true, y_pred = self.reverse_sequence([y_true, y_pred], valid)

    if self.hidden_dim is None:
        # Full transition matrix; transposed when running backwards.
        trans = K.transpose(self.trans) if go_backwards else self.trans
        history = K.gather(trans, y_true)
    else:
        # Factorised transitions; the two factors swap roles backwards.
        if go_backwards:
            left, right = self.r_trans, self.l_trans
        else:
            left, right = self.l_trans, self.r_trans
        history = tf.einsum('bnd,kd->bnk', K.gather(left, y_true), right)

    # Shift the transition scores by one step; step 0 reuses its own scores.
    history = K.concatenate([y_pred[:, :1], history[:, :-1]], 1)
    blended = (y_pred + history) / 2
    per_step = K.sparse_categorical_crossentropy(
        y_true, blended, from_logits=True
    )
    return K.sum(per_step * valid) / K.sum(valid)
def get_updates(self, loss, params):
    """Parent optimizer updates gated for gradient accumulation.

    ``cond`` is 1.0 when ``self.iterations % self.grad_accum_steps == 0``
    and 0.0 otherwise. Gradients are fetched through
    ``self.get_gradients`` before ``self.accum_grads`` is (re)created as
    one zero-initialised variable per parameter. While the parent builds
    its updates, ``K.update`` is temporarily replaced so every assignment
    becomes ``cond * new + (1 - cond) * old``. The accumulators are then
    set to ``g + (1 - cond) * ag``.

    Returns:
        The list of accumulator update ops, which run after the parent's
        updates.
    """
    # Update criterion.
    cond = K.equal(self.iterations % self.grad_accum_steps, 0)
    cond = K.cast(cond, K.floatx())
    # Gradients (fetched before the accumulators are created).
    grads = self.get_gradients(loss, params)
    self.accum_grads = [
        K.zeros(
            K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i
        ) for i, p in enumerate(params)
    ]

    old_update = K.update

    def new_update(x, new_x):
        new_x = cond * new_x + (1 - cond) * x
        return old_update(x, new_x)

    # K.update is global backend state: always restore it, even when the
    # parent raises, so a failure here cannot leave the patch installed.
    K.update = new_update
    try:
        updates = super(NewOptimizer, self).get_updates(loss, params)
    finally:
        K.update = old_update

    # Accumulate only after the parent's updates have run.
    with tf.control_dependencies(updates):
        accum_updates = [
            K.update(ag, g + (1 - cond) * ag)
            for g, ag in zip(grads, self.accum_grads)
        ]

    return accum_updates
def sparse_accuracy(y_true, y_pred):
    """Mean accuracy of the arg-max prediction over every position."""
    # Re-state the shape and dtype of the labels.
    labels = K.reshape(y_true, K.shape(y_pred)[:-1])
    labels = K.cast(labels, 'int32')
    # Compare against the arg-max over axis 2.
    predicted = K.cast(K.argmax(y_pred, axis=2), 'int32')
    hits = K.cast(K.equal(labels, predicted), K.floatx())
    return K.mean(hits)
def compute_position_ids(self, inputs):
    """T5 relative-position bucketing (the original notes say it is
    translated from the official T5 source).

    ``self.input_dim`` buckets are available. Small distances each get
    their own bucket; larger ones share buckets whose width grows
    logarithmically up to ``self.max_distance``, and the bucket index is
    capped at the last bucket. The original notes mention 32 buckets and
    a maximum offset of 128 as the T5 setting.
    """
    q, v = inputs
    # Position difference: rows follow q, columns follow v.
    q_pos = K.expand_dims(K.arange(0, K.shape(q)[1], dtype='int32'), 1)
    v_pos = K.expand_dims(K.arange(0, K.shape(v)[1], dtype='int32'), 0)
    pos_ids = v_pos - q_pos

    num_buckets, max_distance = self.input_dim, self.max_distance
    bucket = 0
    n = -pos_ids
    if self.bidirectional:
        # Half of the buckets per sign; negative n uses the upper half.
        num_buckets = num_buckets // 2
        bucket = bucket + K.cast(K.less(n, 0), 'int32') * num_buckets
        n = K.abs(n)
    else:
        n = K.maximum(n, 0)

    # From here on n is non-negative.
    max_exact = num_buckets // 2
    is_small = K.less(n, max_exact)
    log_ratio = K.log(K.cast(n, K.floatx()) / max_exact)
    scaled = (
        log_ratio / np.log(max_distance / max_exact) *
        (num_buckets - max_exact)
    )
    val_if_large = max_exact + K.cast(scaled, 'int32')
    val_if_large = K.minimum(val_if_large, num_buckets - 1)
    bucket = bucket + K.switch(is_small, n, val_if_large)
    return bucket
def compute_position_ids(self, inputs):
    """T5 relative-position bucketing (the original notes say it is
    translated from the official T5 source).

    Maps the difference between each value position and each query
    position to a bucket index below ``self.input_dim``.
    """
    q, v = inputs
    # Position difference: rows follow q, columns follow v.
    q_idxs = K.expand_dims(K.arange(0, K.shape(q)[1], dtype='int32'), 1)
    v_idxs = K.expand_dims(K.arange(0, K.shape(v)[1], dtype='int32'), 0)
    pos_ids = v_idxs - q_idxs

    # Post-processing.
    num_buckets, max_distance = self.input_dim, self.max_distance
    result = 0
    distance = -pos_ids
    if self.bidirectional:
        # Half of the buckets per sign; negative distances use the upper half.
        num_buckets = num_buckets // 2
        sign_offset = K.cast(K.less(distance, 0), 'int32') * num_buckets
        result = result + sign_offset
        distance = K.abs(distance)
    else:
        distance = K.maximum(distance, 0)

    # distance is now non-negative.
    max_exact = num_buckets // 2
    near = K.less(distance, max_exact)
    far_bucket = max_exact + K.cast(
        K.log(K.cast(distance, K.floatx()) / max_exact) /
        np.log(max_distance / max_exact) * (num_buckets - max_exact),
        'int32',
    )
    # Everything beyond the range falls into the last bucket.
    far_bucket = K.minimum(far_bucket, num_buckets - 1)
    result = result + K.switch(near, distance, far_bucket)
    return result
def masked_cross_entropy(self, y_true, y_pred):
    """Sparse cross-entropy averaged over positions where ``y_true != 0``."""
    batch = K.shape(y_true)[0]
    labels = K.reshape(y_true, [batch, -1])
    # Label 0 is treated as "ignore".
    keep = K.cast(K.not_equal(labels, 0), K.floatx())
    per_token = K.sparse_categorical_crossentropy(labels, y_pred)
    return K.sum(per_token * keep) / K.sum(keep)
def globalpointer_crossentropy(y_true, y_pred):
    """Cross-entropy designed for GlobalPointer outputs.

    The last axis of ``y_true`` holds an index pair, which is folded into
    one flat index (``first * K.shape(y_pred)[2] + second``). ``y_pred``
    is flattened to match before the sparse multilabel loss is applied.
    """
    pred_shape = K.shape(y_pred)
    first, second = y_true[..., 0], y_true[..., 1]
    flat_true = first * K.cast(pred_shape[2], K.floatx()) + second
    flat_pred = K.reshape(
        y_pred, (pred_shape[0], -1, K.prod(pred_shape[2:]))
    )
    per_entry = sparse_multilabel_categorical_crossentropy(
        flat_true, flat_pred, True
    )
    # Sum over axis 1, then average over what remains.
    return K.mean(K.sum(per_entry, axis=1))
def compute_loss(self, inputs, mask=None):
    """Next-token cross-entropy, averaged over unmasked target positions.

    Args:
        inputs: ``(y_true, y_pred)``. Targets are ``y_true[:, 1:]`` and
            predictions are ``y_pred[:, :-1]``, i.e. shifted by one step.
        mask: optional list of masks; ``mask[1]`` weights the positions.
            When ``mask`` or ``mask[1]`` is ``None`` every position counts
            equally.

    Returns:
        The scalar loss.
    """
    y_true, y_pred = inputs
    y_true = y_true[:, 1:]  # target token ids
    y_pred = y_pred[:, :-1]  # predictions, shifted by one step
    loss = K.sparse_categorical_crossentropy(y_true, y_pred)

    # The signature defaults to mask=None; guard against that and against
    # a missing entry instead of failing on `mask[1]`.
    target_mask = None if mask is None else mask[1]
    if target_mask is None:
        return K.mean(loss)

    y_mask = K.cast(target_mask, K.floatx())[:, 1:]
    return K.sum(loss * y_mask) / K.sum(y_mask)
def masked_cross_entropy(y_true, y_pred):
    """Cross-entropy loss that ignores positions whose label is 0.

    The original notes describe those positions as padding.
    """
    labels = K.reshape(y_true, [K.shape(y_true)[0], -1])
    not_ignored = K.not_equal(labels, 0)
    weights = K.cast(not_ignored, K.floatx())
    token_loss = K.sparse_categorical_crossentropy(labels, y_pred)
    weighted_total = K.sum(token_loss * weights)
    return weighted_total / K.sum(weights)
def mlm_loss(inputs):
    """Loss restricted to masked positions; meant to be wrapped in a layer.

    ``inputs`` is ``(y_true, y_pred, is_masked)``. ``y_pred`` is treated
    as logits, and ``K.epsilon()`` is added to the denominator.
    """
    labels, logits, is_masked = inputs
    weights = K.cast(is_masked, K.floatx())
    per_token = K.sparse_categorical_crossentropy(
        labels, logits, from_logits=True
    )
    weighted_total = K.sum(per_token * weights)
    return weighted_total / (K.sum(weights) + K.epsilon())
def call(self, inputs, mask=None):
    """Average ``inputs`` over one axis, weighting by ``mask`` when given.

    The reduced axis is 1 for ``'channels_last'`` data and 2 otherwise.
    """
    channels_last = self.data_format == 'channels_last'
    axis = 1 if channels_last else 2

    if mask is None:
        return K.mean(inputs, axis=axis)

    weights = K.cast(mask, K.floatx())
    # Add a singleton axis so the mask broadcasts against the inputs.
    if axis == 1:
        weights = weights[..., None]
    else:
        weights = weights[:, None]
    return K.sum(inputs * weights, axis=axis) / K.sum(weights, axis=axis)
def get_labels_of_similarity(self, y_pred):
    """Build a (batch, batch) 0/1 matrix pairing adjacent samples.

    For a batch of 2 the result is ``[[0, 1], [1, 0]]``.
    """
    batch = K.shape(y_pred)[0]
    idxs = K.arange(0, batch)  # values 0 .. batch - 1
    row_vector = idxs[None, :]  # shape (1, batch)
    # Partner index: i + 1 for even i, i - 1 for odd i; shape (batch, 1).
    partner_column = (idxs + 1 - idxs % 2 * 2)[:, None]
    matches = K.equal(row_vector, partner_column)
    return K.cast(matches, K.floatx())
def masked_crossentropy(y_true, y_pred):
    """Cross-entropy over positions whose label is greater than 0.5.

    The original notes describe the excluded positions as the
    non-predicted part. The scalar result is returned with two leading
    singleton axes.
    """
    leading_dims = K.shape(y_true)[:2]
    labels = K.reshape(y_true, leading_dims)
    weights = K.cast(K.greater(labels, 0.5), K.floatx())
    per_token = K.sparse_categorical_crossentropy(labels, y_pred)
    averaged = K.sum(per_token * weights) / K.sum(weights)
    return averaged[None, None]
def compute_loss(self, inputs, mask=None):
    """Masked cross-entropy that also records a masked accuracy metric.

    Positions where ``y_true`` equals 0 are excluded from both averages.
    The accuracy is registered under the name ``'accuracy'``.
    """
    y_true, y_pred = inputs
    weights = K.cast(K.not_equal(y_true, 0), K.floatx())

    # Metric first, then the loss.
    hits = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    self.add_metric(K.sum(hits * weights) / K.sum(weights), name='accuracy')

    per_token = K.sparse_categorical_crossentropy(y_true, y_pred)
    return K.sum(per_token * weights) / K.sum(weights)
def build_model(self):
    """Configure the TF session and build the subject, object and training models.

    Sets ``self.subject_model``, ``self.object_model``, ``self.model``
    (with the combined loss attached) and ``self.optimizer``.
    """
    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session

    # --- Session / GPU memory configuration ---
    session_config = tf.ConfigProto()
    # A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc.
    session_config.gpu_options.allocator_type = 'BFC'
    if self.memory_fraction:
        # Fixed share of GPU memory, no on-demand growth.
        session_config.gpu_options.per_process_gpu_memory_fraction = self.memory_fraction
        session_config.gpu_options.allow_growth = False
    else:
        session_config.gpu_options.allow_growth = True
    set_session(tf.Session(config=session_config))

    # --- Extra inputs used for training ---
    subject_labels = Input(shape=(None, 2), name='Subject-Labels')
    subject_ids = Input(shape=(2, ), name='Subject-Ids')
    object_labels = Input(
        shape=(None, self.num_classes, 2), name='Object-Labels'
    )

    # --- Pretrained encoder ---
    bert = build_transformer_model(
        config_path=self.bert_config_path,
        checkpoint_path=self.bert_checkpoint_path,
        return_keras_model=False,
    )
    encoder_inputs = bert.model.inputs

    # --- Subject prediction: two sigmoid scores per position, squared ---
    subject_scores = Dense(
        units=2,
        activation='sigmoid',
        kernel_initializer=bert.initializer
    )(bert.model.output)
    subject_preds = Lambda(lambda x: x**2)(subject_scores)
    self.subject_model = Model(encoder_inputs, subject_preds)

    # --- Object prediction, conditioned on the given subject ---
    # The subject is injected through Conditional Layer Normalization.
    hidden = bert.model.layers[-2].get_output_at(-1)
    subject = Lambda(self.extrac_subject)([hidden, subject_ids])
    conditioned = LayerNormalization(conditional=True)([hidden, subject])
    object_scores = Dense(
        units=self.num_classes * 2,
        activation='sigmoid',
        kernel_initializer=bert.initializer
    )(conditioned)
    object_scores = Lambda(lambda x: x**4)(object_scores)
    object_preds = Reshape((-1, self.num_classes, 2))(object_scores)
    self.object_model = Model(encoder_inputs + [subject_ids], object_preds)

    # --- Training model ---
    self.model = Model(
        encoder_inputs + [subject_labels, subject_ids, object_labels],
        [subject_preds, object_preds]
    )

    # Positions are weighted by the token-embedding layer's output mask.
    token_mask = bert.model.get_layer('Embedding-Token').output_mask
    token_mask = K.cast(token_mask, K.floatx())

    subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
    subject_loss = K.mean(subject_loss, 2)
    subject_loss = K.sum(subject_loss * token_mask) / K.sum(token_mask)

    object_loss = K.binary_crossentropy(object_labels, object_preds)
    object_loss = K.sum(K.mean(object_loss, 3), 2)
    object_loss = K.sum(object_loss * token_mask) / K.sum(token_mask)

    self.model.add_loss(subject_loss + object_loss)

    # --- Optimizer: Adam extended with an exponential moving average ---
    AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
    self.optimizer = AdamEMA(lr=1e-4)
def compute_copy_loss(self, inputs, mask=None):
    """Masked cross-entropy for the copy-label head.

    Of the five entries in ``inputs`` the 2nd is the mask source, the 3rd
    the labels and the 5th the predictions. ``y_true`` is used as given,
    without being shifted.
    """
    _, raw_mask, y_true, _, y_pred = inputs

    # Reverse cumulative sum: a position is kept when it or any later
    # position has a non-zero mask value.
    suffix_total = K.cumsum(raw_mask[:, ::-1], axis=1)[:, ::-1]
    weights = K.cast(K.greater(suffix_total, 0.5), K.floatx())

    weights = weights[:, 1:]  # mask, one position shorter
    y_pred = y_pred[:, :-1]  # predictions, shifted by one step

    per_token = K.sparse_categorical_crossentropy(y_true, y_pred)
    return K.sum(per_token * weights) / K.sum(weights)
def compute_loss(self, inputs, mask=None):
    """Masked cross-entropy that also records a masked accuracy metric.

    ``y_true`` is passed to the sparse accuracy and cross-entropy
    functions, which take class indices rather than one-hot vectors.
    Positions where ``y_true`` equals 0 get weight 0.0, all others 1.0.
    """
    y_true, y_pred = inputs
    # Same shape as y_true: 1.0 where the label is non-zero, else 0.0.
    y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    mask_total = K.sum(y_mask)

    # Accuracy over the kept positions, registered as 'accuracy'.
    correct = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    accuracy = K.sum(correct * y_mask) / mask_total
    self.add_metric(accuracy, name='accuracy')

    # Cross-entropy over the same positions.
    token_loss = K.sparse_categorical_crossentropy(y_true, y_pred)
    return K.sum(token_loss * y_mask) / mask_total
def sparse_accuracy(self, y_true, y_pred):
    """Per-step accuracy shown during training, ignoring masked steps.

    ``y_true`` must hold integer labels (not one-hot). The weights come
    from ``self.input_mask``; when it is ``None`` the plain mean is used.
    """
    # Fetch the mask and make it float when present.
    weights = self.input_mask
    if weights is not None:
        weights = K.cast(weights, K.floatx())

    # Re-state the shape and dtype of the labels.
    labels = K.cast(K.reshape(y_true, K.shape(y_pred)[:-1]), 'int32')

    # Rough estimate: take the arg-max label at each step independently.
    predicted = K.cast(K.argmax(y_pred, 2), 'int32')
    hits = K.cast(K.equal(labels, predicted), K.floatx())

    if weights is None:
        return K.mean(hits)
    return K.sum(hits * weights) / K.sum(weights)
def compute_loss(self, inputs, mask=None):
    """Next-token cross-entropy, averaged over the counted positions.

    Args:
        inputs: ``(y_true, y_pred)``. Targets are ``y_true[:, 1:]`` and
            predictions are ``y_pred[:, :-1]``, i.e. shifted by one step.
        mask: optional list of masks; ``mask[1]`` weights the positions
            (its first step is dropped to line up with the targets). When
            ``mask`` or ``mask[1]`` is ``None`` every position counts
            equally.

    Returns:
        The scalar mean loss.
    """
    y_true, y_pred = inputs
    y_true = y_true[:, 1:]  # target token ids, starting from the second token
    y_pred = y_pred[:, :-1]  # predictions, shifted by one step
    # Takes class indices directly; no one-hot conversion is needed.
    loss = K.sparse_categorical_crossentropy(y_true, y_pred)

    # The signature defaults to mask=None, so guard before indexing.
    target_mask = None if mask is None else mask[1]
    if target_mask is None:
        # Without a mask, take a true mean. A scalar 1.0 stand-in would
        # make the denominator 1 and return the sum instead.
        return K.mean(loss)

    y_mask = K.cast(target_mask, K.floatx())[:, 1:]
    # Mean over the positions the mask keeps.
    return K.sum(loss * y_mask) / K.sum(y_mask)
def lm_acc(inputs):
    """Next-token accuracy over masked-in positions; meant to be wrapped in a layer.

    ``inputs`` is ``(y_true, y_pred, mask)``. Targets and mask drop their
    first step and predictions drop their last, so step ``t`` of the
    prediction is compared with token ``t + 1``. ``K.epsilon()`` is added
    to the denominator.
    """
    y_true, y_pred, mask = inputs
    y_true = K.cast(y_true, K.floatx())
    # Cast the mask as well, so a bool/int mask can be multiplied with the
    # float accuracy below.
    mask = K.cast(mask, K.floatx())
    y_true = y_true[:, 1:]
    y_pred = y_pred[:, :-1]
    mask = mask[:, 1:]
    acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
    return acc
def compute_loss(self, inputs, mask=None):
    """Masked cross-entropy that also records a masked accuracy metric.

    ``y_true`` is passed to the sparse accuracy and cross-entropy
    functions, which take class indices. Positions where ``y_true``
    equals 0 are excluded from both averages. The accuracy is registered
    under the name ``'accuracy'``.
    """
    y_true, y_pred = inputs
    # Keep only positions with a non-zero label.
    y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
    self.add_metric(accuracy, name='accuracy')
    loss = K.sparse_categorical_crossentropy(y_true, y_pred)
    loss = K.sum(loss * y_mask) / K.sum(y_mask)
    return loss