def compute_position_ids(self, inputs):
    """T5 relative position bucketing (translated directly from the official T5 source).
    i-j:    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...
    f(i-j): 0 1 2 3 4 5 6 7 8 8 8  8  9  9  9  ...
    """
    q, v = inputs
    # Compute pairwise position differences
    q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
    q_idxs = K.expand_dims(q_idxs, 1)
    v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
    v_idxs = K.expand_dims(v_idxs, 0)
    pos_ids = v_idxs - q_idxs
    # Bucketize the differences
    num_buckets, max_distance = self.input_dim, self.max_distance
    ret = 0
    n = -pos_ids
    if self.bidirectional:
        num_buckets //= 2
        ret += K.cast(K.less(n, 0), 'int32') * num_buckets
        n = K.abs(n)
    else:
        n = K.maximum(n, 0)
    # now n is in the range [0, inf)
    max_exact = num_buckets // 2
    is_small = K.less(n, max_exact)
    val_if_large = max_exact + K.cast(
        K.log(K.cast(n, K.floatx()) / max_exact) /
        np.log(max_distance / max_exact) * (num_buckets - max_exact),
        'int32',
    )
    val_if_large = K.minimum(val_if_large, num_buckets - 1)
    ret += K.switch(is_small, n, val_if_large)
    return ret
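# For intuition, a minimal NumPy mirror of the bucketing above (the name
# `t5_bucket` and the defaults num_buckets=32, max_distance=128 are
# illustrative assumptions, not from the original code). Since
# pos_ids = v_idxs - q_idxs = j - i, feeding -(i-j) reproduces the
# f(i-j) table in the docstring.
import numpy as np

def t5_bucket(rel_pos, num_buckets=32, max_distance=128, bidirectional=True):
    ret = 0
    n = -rel_pos
    if bidirectional:
        num_buckets //= 2
        ret += (n < 0).astype('int32') * num_buckets
        n = np.abs(n)
    else:
        n = np.maximum(n, 0)
    max_exact = num_buckets // 2
    is_small = n < max_exact
    # np.maximum(n, 1) only avoids log(0); n < max_exact is handled by is_small
    val_if_large = max_exact + (
        np.log(np.maximum(n, 1) / max_exact) /
        np.log(max_distance / max_exact) * (num_buckets - max_exact)
    ).astype('int32')
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    return ret + np.where(is_small, n, val_if_large)

print(t5_bucket(-np.arange(15)))  # [0 1 2 3 4 5 6 7 8 8 8 8 9 9 9]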
def _resource_apply(self, grad, var, indices=None):
    # Prepare variables
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    epsilon_t = K.cast(self.epsilon, var_dtype)
    local_step = K.cast(self.iterations + 1, var_dtype)
    beta_1_t_power = K.pow(beta_1_t, local_step)
    beta_2_t_power = K.pow(beta_2_t, local_step)
    # Moment updates; note the second moment tracks (grad - m_t)**2 rather
    # than grad**2, i.e. the gradient's deviation from its running mean
    m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
    v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * (grad - m_t)**2)
    # Return the update op
    with tf.control_dependencies([m_t, v_t]):
        if self.bias_correct:
            m_t = m_t / (1.0 - beta_1_t_power)
            v_t = v_t / (1.0 - beta_2_t_power)
        var_t = var - lr_t * m_t / (K.sqrt(v_t) + epsilon_t)
        return K.update(var, var_t)
def sparse_accuracy(y_true, y_pred):
    # Re-assert y_true's shape and dtype explicitly
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # Compute accuracy
    y_pred = K.cast(K.argmax(y_pred, axis=2), 'int32')
    return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx()))
def get_label_mask(self, y_true):
    """Mask selecting pairs of samples within the batch that share a label."""
    label = K.cast(y_true, 'int32')
    label_2 = K.reshape(label, (1, -1))
    mask = K.equal(label_2, label)
    mask = K.cast(mask, K.floatx())
    mask = mask * (1 - K.eye(K.shape(y_true)[0]))  # zero the diagonal, i.e. exclude i == j
    return mask
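# A small NumPy sketch of the same-label mask (illustrative values only):
# for labels [0, 1, 0, 2], positions (0, 2) are mutual positives and the
# diagonal is zeroed out.
import numpy as np

label = np.array([0, 1, 0, 2]).reshape(-1, 1)            # (batch, 1)
mask = (label.reshape(1, -1) == label).astype('float32')  # (batch, batch)
mask = mask * (1 - np.eye(len(label), dtype='float32'))
print(mask)  # 1.0 only at (0, 2) and (2, 0)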
def call(self, inputs, mask=None, a_mask=None, position_bias=None):
    """Multi-head attention.
    :param inputs: [q, k, v, a_mask, position_bias]
    :param mask: [q_mask, k_mask, v_mask]; q_mask masks the query sequence
        (padding positions), v_mask masks the value sequence so that certain
        positions (e.g. padding) cannot be attended to
    :param a_mask: Boolean, whether to apply a mask to the attention matrix
    :param position_bias: type of position bias; shifts the attention logits
        with the specified kind of positional encoding
    :return: attention output
    """
    q, k, v = inputs[:3]
    q_mask, v_mask, idx = None, None, 3
    if mask is not None:
        if mask[0] is not None:
            q_mask = K.cast(mask[0], K.floatx())
        if mask[2] is not None:
            v_mask = K.cast(mask[2], K.floatx())
    if a_mask is not None:
        a_mask = inputs[idx]
        idx += 1
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to multi-head form
    qw = K.reshape(qw, [-1, K.shape(q)[1], self.head_nums, self.key_size])
    kw = K.reshape(kw, [-1, K.shape(k)[1], self.head_nums, self.key_size])
    vw = K.reshape(vw, [-1, K.shape(v)[1], self.head_nums, self.head_size])
    # Attention logits
    att = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position bias added to the logits
    if position_bias == 'relative':
        position_embeddings = inputs[idx]
        att = att + tf.einsum('bjhd,jkd->bhjk', qw, position_embeddings)
    if self.attention_scale:
        att = att / self.key_size**0.5
    # Value mask
    att = sequence_masking(att, v_mask, 'add', -1)
    # Attention mask
    if a_mask is not None:
        att = att - (1 - a_mask) * 1e12
    att = K.softmax(att)
    output = tf.einsum('bhjk,bkhd->bjhd', att, vw)
    # Relative position contribution on the value side
    if position_bias == 'relative':
        output = output + tf.einsum('bhjk,jkd->bjhd', att, position_embeddings)
    output = K.reshape(output, (-1, K.shape(output)[1], self.output_dim))
    output = self.combine_dense(output)
    # Query mask
    output = sequence_masking(output, q_mask, 'mul')
    return output
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))

    ms = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
        for (i, p) in enumerate(params)
    ]
    vs = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
        for (i, p) in enumerate(params)
    ]
    if self.amsgrad:
        vhats = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
            for (i, p) in enumerate(params)
        ]
    else:
        vhats = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        # Second moment of (g - m_t), i.e. the gradient's deviation from its mean
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g - m_t)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t
        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def sparse_accuracy(self, y_true, y_pred):
    """Per-token accuracy shown during training, with masked positions excluded.
    y_true must be integer labels (not one-hot).
    """
    # Derive the mask and convert its dtype
    mask = K.all(K.greater(y_pred, -1e6), axis=2)
    mask = K.cast(mask, K.floatx())
    # Re-assert y_true's shape and dtype explicitly
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # Take the per-position argmax as a rough measure of training progress
    y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
    isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
    return K.sum(isequal * mask) / K.sum(mask)
def get_updates(self, loss, params):
    # Whether this iteration actually applies the update
    cond = K.equal(self.iterations % self.grad_accum_steps, 0)
    cond = K.cast(cond, K.floatx())
    # Fetch gradients
    grads = self.get_gradients(loss, params)
    self.accum_grads = [
        K.zeros(shape=K.int_shape(p), dtype=K.dtype(p), name='accum_grad_{}'.format(i))
        for i, p in enumerate(params)
    ]

    # Monkey-patch K.update so the inner optimizer's updates only take
    # effect on accumulation boundaries (cond == 1)
    old_update = K.update

    def new_update(x, new_x):
        new_x = cond * new_x + (1 - cond) * x
        return old_update(x, new_x)

    K.update = new_update
    updates = super(NewOptimizer, self).get_updates(loss, params)
    K.update = old_update

    # Accumulate gradients; reset to the fresh gradient right after an update step
    with tf.control_dependencies(updates):
        acc_updates = [
            K.update(ag, g + (1 - cond) * ag)
            for ag, g in zip(self.accum_grads, grads)
        ]

    return acc_updates
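# A scalar simulation of the gating logic (illustrative; it assumes the
# optimizer's gradient getter reads accum_grads on update steps, which this
# snippet does not show): with grad_accum_steps = 3, the accumulator is
# reset to the fresh gradient right after every real update.
grad_accum_steps, accum, applied = 3, 0.0, []
for iteration, g in enumerate([1., 2., 3., 4., 5., 6.]):
    cond = 1.0 if iteration % grad_accum_steps == 0 else 0.0
    if cond:
        applied.append(accum)           # the gated K.update fires here
    accum = g + (1 - cond) * accum      # reset on update steps, accumulate otherwise
print(applied)                          # [0.0, 6.0]; 6.0 = 1 + 2 + 3 from the last cycle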
def _resource_apply(self, grad, var, indices=None):
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    local_step = K.cast(self.iterations + 1, var_dtype)
    beta_1_power = K.pow(beta_1_t, local_step)
    beta_2_power = K.pow(beta_2_t, local_step)

    # Moment updates; the sparse branch scatters only the touched indices
    if indices is None:
        m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
        v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2)
    else:
        mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)]
        with tf.control_dependencies(mv_ops):
            m_t = self._resource_scatter_add(m, indices, (1 - beta_1_t) * grad)
            v_t = self._resource_scatter_add(v, indices, (1 - beta_2_t) * grad**2)

    # Apply the variable update only after both moments are written
    with tf.control_dependencies([m_t, v_t]):
        if self.bias_correct:
            m_t = m_t / (1.0 - beta_1_power)
            v_t = v_t / (1.0 - beta_2_power)
        var_t = var - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
        return K.update(var, var_t)
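# The bias correction divides by (1 - beta**t), not (1 + beta**t) as the
# original snippet had it. A scalar sketch of the first Adam step with the
# usual defaults (illustrative values only) shows why: it restores the
# scale of the zero-initialized moments.
import numpy as np

beta_1, beta_2, lr, eps = 0.9, 0.999, 1e-3, 1e-7
grad, t = 0.5, 1

m = (1 - beta_1) * grad        # 0.05: heavily biased toward 0 at t = 1
v = (1 - beta_2) * grad**2     # 0.00025
m_hat = m / (1 - beta_1**t)    # 0.5: bias correction recovers the gradient scale
v_hat = v / (1 - beta_2**t)    # 0.25
print(lr * m_hat / (np.sqrt(v_hat) + eps))  # ~1e-3, i.e. roughly lr on step one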
def get_labels_of_similarity(self, y_pred):
    idxs = K.arange(0, K.shape(y_pred)[0])
    idxs_1 = idxs[None, :]
    # Each even index pairs with the following odd index and vice versa: 0<->1, 2<->3, ...
    idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
    labels = K.equal(idxs_1, idxs_2)
    labels = K.cast(labels, K.floatx())
    return labels
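# The index arithmetic assumes samples arrive as adjacent positive pairs
# (0 with 1, 2 with 3, ...). A small NumPy check of the resulting label matrix:
import numpy as np

idxs = np.arange(6)
partner = idxs + 1 - idxs % 2 * 2   # [1, 0, 3, 2, 5, 4]
labels = (idxs[None, :] == partner[:, None]).astype('float32')
print(labels)  # 1.0 exactly where row i's partner sits: (0, 1), (1, 0), (2, 3), ...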
def parse_func(serialized_record):
    feature_description = {
        'token_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
        'mask_ids': tf.io.FixedLenFeature([seq_length], tf.int64)
    }
    features = tf.io.parse_single_example(serialized_record, feature_description)
    token_ids = features['token_ids']
    mask_ids = features['mask_ids']
    segment_ids = K.zeros_like(token_ids, dtype='int64')
    is_masked = K.not_equal(mask_ids, 0)
    # mask_ids were stored shifted up by 1 (0 is reserved for "not masked"),
    # so subtract 1 to recover the real token id at masked positions
    masked_token_ids = K.switch(is_masked, mask_ids - 1, token_ids)
    x = {
        'Input-Token': masked_token_ids,
        'Input-Segment': segment_ids,
        'token_ids': token_ids,
        'is_masked': K.cast(is_masked, K.floatx())
    }
    y = {
        'mlm_loss': K.zeros_like([1], dtype='float32'),
        'mlm_acc': K.zeros_like([1], dtype='float32')
    }
    return x, y
def call(self, inputs):
    # PE_2i(p)   = sin(p / 10000^(2i/d_pos))
    # PE_2i+1(p) = cos(p / 10000^(2i/d_pos))
    batch_size, seq_len, word_emb_dim = (
        K.shape(inputs)[0], K.shape(inputs)[1], K.shape(inputs)[2]
    )
    if not self.embedding_dim or self.method == 'add':
        self.embedding_dim = word_emb_dim
    t = 2 * K.arange(self.embedding_dim / 2, dtype='float32') / K.cast(
        self.embedding_dim, dtype='float32')
    # 1 / 10000^(2i/d_pos), shape = (p_dim/2,)
    embedding_wise_pos = 1. / K.pow(10000., t)
    embedding_wise_pos = K.expand_dims(embedding_wise_pos, 0)  # (1, p_dim/2)
    # Position index of each token, shape = (batch_size, seq_len)
    word_wise_pos = K.cumsum(K.ones_like(inputs[:, :, 0]), axis=1)
    word_wise_pos = K.expand_dims(word_wise_pos, 2)  # (batch_size, seq_len, 1)
    # (batch_size, seq_len, p_dim/2)
    position_embedding = K.dot(word_wise_pos, embedding_wise_pos)
    position_embedding = K.expand_dims(position_embedding, 3)
    # Interleave sin/cos along the last axis, then flatten back to p_dim
    position_embedding = K.reshape(
        K.concatenate([K.sin(position_embedding), K.cos(position_embedding)], axis=-1),
        shape=(batch_size, seq_len, -1))
    if self.method == 'add':
        return inputs + position_embedding
    return K.concatenate([inputs, position_embedding], axis=-1)
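# A minimal standalone NumPy version of the same construction (the name
# `sinusoidal_pe` is illustrative) makes the sin/cos interleaving explicit:
import numpy as np

def sinusoidal_pe(seq_len, dim):
    # positions start at 1, matching the cumsum of ones above
    pos = np.arange(1, seq_len + 1, dtype='float32')[:, None]  # (seq_len, 1)
    inv_freq = 1. / 10000. ** (2 * np.arange(dim // 2) / dim)  # (dim/2,)
    angles = pos * inv_freq[None, :]                           # (seq_len, dim/2)
    # stack sin/cos on a new last axis, then flatten -> interleaved per frequency
    return np.stack([np.sin(angles), np.cos(angles)], axis=-1).reshape(seq_len, dim)

pe = sinusoidal_pe(seq_len=50, dim=8)
print(pe.shape)  # (50, 8): columns alternate sin, cos for each frequency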
def sparse_categorical_crossentropy(y_true, y_pred):
    # Re-assert y_true's shape and dtype explicitly
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    y_true = K.one_hot(y_true, K.shape(y_pred)[2])
    # Cross-entropy
    return K.mean(K.categorical_crossentropy(y_true, y_pred))
def get_labels_of_similarity(self, inputs):
    idx = K.arange(0, K.shape(inputs)[0])
    idx_1 = idx[None, :]
    idx_2 = (idx + 1 - idx % 2 * 2)[:, None]
    labels = K.equal(idx_1, idx_2)
    labels = K.cast(labels, K.floatx())
    return labels
def mlm_acc(inputs):
    """Accuracy computation; needs to be wrapped as a layer."""
    y_true, y_pred, mask = inputs
    y_true = K.cast(y_true, K.floatx())
    acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
    return acc
def nsp_acc(inputs):
    """Accuracy computation; needs to be wrapped as a layer."""
    y_true, y_pred = inputs
    y_pred, _ = y_pred
    y_true = K.cast(y_true, K.floatx())
    acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    acc = K.mean(acc)
    return acc
def sparse_loss(self, y_true, y_pred):
    """y_true must be integer labels (not one-hot)."""
    # Re-assert y_true's shape and dtype explicitly
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # Convert to one-hot
    y_true = K.one_hot(y_true, K.shape(self.trans)[0])
    return self.dense_loss(y_true, y_pred)
def compute_loss(self, inputs, mask=None):
    y_true, y_pred = inputs
    # Exclude padding (token id 0) from both the metric and the loss
    y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
    self.add_metric(accuracy, name='accuracy')
    loss = K.sparse_categorical_crossentropy(y_true, y_pred)
    loss = K.sum(loss * y_mask) / K.sum(y_mask)
    return loss
def call(self, x, mask=None):
    x0 = x
    x = self.k_dense(x0)
    x = self.o_dense(x)
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
        x = x - (1 - mask) * 1e12  # suppress padded positions before the softmax
    x = K.softmax(x, 1)            # attention weights over the sequence axis
    x = K.sum(x0 * x, 1)           # weighted-sum pooling
    return x
def compute_loss(self, inputs, mask=None):
    pred, ytrue = inputs
    acc = keras.metrics.sparse_categorical_accuracy(ytrue, pred)
    self.add_metric(acc, name='clf_acc')
    ytrue = K.cast(ytrue, 'int32')
    ytrue = K.one_hot(ytrue, num_classes=num_classes)  # num_classes is a module-level constant
    ytrue = K.reshape(ytrue, (-1, num_classes))
    # Binary cross-entropy over the one-hot targets
    loss = ytrue * K.log(pred + K.epsilon()) + \
        (1 - ytrue) * K.log(1 - pred + K.epsilon())
    loss = -K.mean(loss)
    loss = loss * self.alpha
    self.add_metric(loss, name='clf_loss')
    return loss
def call(self, x, mask=None):
    x0 = x
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
    # x = x0 * mask if mask is not None else x0
    # Drop the mask so it is not passed on to the Conv1D layer
    x0 = Lambda(lambda x_: x_, output_shape=lambda s: s)(x0)
    x = self.conv1d(x0)
    x, g = x[:, :, :self.o_dim], x[:, :, self.o_dim:]
    if self.dropout_rate is not None:
        g = K.in_train_phase(K.dropout(g, self.dropout_rate), g)
    g = K.sigmoid(g)
    # Fall back to an all-ones mask when none is given
    mask = mask if mask is not None else K.ones_like(x)
    if self.skip_connection:
        if K.int_shape(x0)[-1] != self.o_dim:
            x0 = self.conv1d_1x1(x0)
        return (x0 * (1 - g) + x * g) * mask
    return x * g * mask
def dense_loss(self, y_true, y_pred):
    """y_true must be in one-hot form."""
    # Derive the mask and convert its dtype
    mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
    mask = K.cast(mask, K.floatx())
    # Target path score
    y_true, y_pred = y_true * mask, y_pred * mask
    target_score = self.path_score(y_pred, y_true)
    # Compute log Z recursively
    init_states = [y_pred[:, 0]]
    y_pred = K.concatenate([y_pred, mask], axis=2)
    input_length = K.int_shape(y_pred[:, 1:])[1]
    log_norm, _, _ = K.rnn(self.log_norm_step,
                           y_pred[:, 1:],
                           init_states,
                           input_length=input_length)  # log Z vector at the final step
    log_norm = K.logsumexp(log_norm, 1)  # logsumexp down to a scalar per sample
    # Loss: -log p
    return log_norm - target_score
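# In formula form this is the standard CRF negative log-likelihood, with
# s(x, y) the path score of label sequence y:
#
#     loss = -log p(y|x) = log sum_{y'} exp(s(x, y')) - s(x, y)
#
# log_norm above is the first term (computed by dynamic programming via
# K.rnn) and target_score the second.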
def call(self, inputs, **kwargs):
    logits, token_seq = inputs[:2]
    seq_shape = K.shape(token_seq)
    batch_size, seq_length = seq_shape[0], seq_shape[1]
    if self.pad_token_id is None:
        # No pad id to go by: take the last position for every sample
        sequence_lengths = tf.fill([batch_size], seq_length - 1)
    else:
        # Index of the last non-pad token in each sequence
        sequence_lengths = K.sum(
            K.cast(K.not_equal(token_seq, self.pad_token_id), dtype='int32'),
            -1,
            keepdims=False,
        ) - 1
    # tf2 only:
    # return tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
    indices = K.expand_dims(sequence_lengths, -1)
    return tf.gather_nd(logits, indices, batch_dims=1)
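# A NumPy illustration of the gather (hypothetical values): pick each
# sequence's last non-pad position from the logits, mirroring
# tf.gather_nd(..., batch_dims=1).
import numpy as np

token_seq = np.array([[5, 6, 7, 0],
                      [8, 9, 0, 0]])            # pad_token_id = 0
seq_lens = (token_seq != 0).sum(-1) - 1          # [2, 1]: last real positions
logits = np.arange(2 * 4 * 3).reshape(2, 4, 3)   # (batch, seq, num_labels)
print(logits[np.arange(2), seq_lens])            # rows [6, 7, 8] and [15, 16, 17]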
def call(self, inputs, mask=None):
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
        inputs = inputs - (1.0 - mask) * 1e12
    return K.softmax(inputs, 1)
def normal_shannon_entropy(p, labels_num=num_classes):
    """Shannon entropy normalized by its maximum, log(labels_num)."""
    p = K.cast(p, K.floatx())
    norm = K.log(1. / labels_num)
    s = K.sum(p * K.log(p), axis=-1, keepdims=True)
    return s / norm
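# A quick standalone NumPy check of the normalization: a uniform
# distribution gives 1 (maximal uncertainty), a confident one is near 0.
import numpy as np

def normal_shannon_entropy_np(p):
    n = p.shape[-1]
    return np.sum(p * np.log(p), axis=-1) / np.log(1. / n)

print(normal_shannon_entropy_np(np.full(4, 0.25)))                     # 1.0
print(normal_shannon_entropy_np(np.array([0.97, 0.01, 0.01, 0.01])))   # ~0.12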
def compute_classification_acc(self, inputs, mask=None):
    _, _, y_pred, _, y_true = inputs
    equal = K.equal(K.cast(K.argmax(y_pred, axis=-1), 'int32'),
                    K.cast(y_true, 'int32'))
    # Per-sample correctness scaled by batch size, so that summing yields the accuracy
    return K.cast(equal, K.floatx()) / K.cast(K.shape(y_true)[0], K.floatx())
def _decayed_lr(self, var_dtype):
    """Override the decayed-learning-rate getter."""
    lr_t = super(NewOptimizer, self)._decayed_lr(var_dtype)
    lr_rate = piecewise_linear(self.iterations, self.lr_schedule)
    return lr_t * K.cast(lr_rate, var_dtype)
def call(self, inputs, mask=None):
    # Only computes the loss; the inputs pass through unchanged
    if mask is not None:
        mask = K.cast(mask, K.floatx())
    return sequence_masking(inputs, mask, 1, 1)