def new_update(x, new_x): if is_one_of(x, params) and self._do_layer_adaptation(x): dx = new_x - x lr_t = K.clip(self.learning_rate, K.epsilon(), 1e10) x_norm = tf.norm(x) g_norm = tf.norm(dx / lr_t) ratio = K.switch( x_norm > 0.0, K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.0), 1.0) new_x = x + dx * ratio return old_update(x, new_x)
def compute_position_ids(self, inputs): q, v = inputs # 计算位置差 q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') q_idxs = K.expand_dims(q_idxs, 1) v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') v_idxs = K.expand_dims(v_idxs, 0) pos_ids = v_idxs - q_idxs # 后处理操作 max_position = (self.input_dim - 1) // 2 pos_ids = K.clip(pos_ids, -max_position, max_position) pos_ids = pos_ids + max_position return pos_ids
def new_update(x, new_x): if x is var and self._do_layer_adaptation(x): dx = new_x - x lr_t = self._decayed_lr(x.dtype.base_dtype) lr_t = K.clip(lr_t, K.epsilon(), 1e10) x_norm = tf.norm(x) g_norm = tf.norm(dx / lr_t) ratio = K.switch( x_norm > 0.0, K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.0), 1.0) new_x = x + dx * ratio return old_update(x, new_x)