def new_update(x, new_x):
    """Forward an update to ``old_update``, rescaling the step when adapting.

    If ``x`` is one of ``params`` and ``self._do_layer_adaptation(x)`` is
    truthy, the step ``new_x - x`` is multiplied by the ratio
    ``norm(x) / norm(step / lr)`` before being passed on. The ratio falls
    back to 1 when ``norm(x)`` is not positive or when the step norm does
    not exceed ``K.epsilon()``. Otherwise ``new_x`` is forwarded untouched.

    Returns whatever ``old_update`` returns.
    """
    should_adapt = is_one_of(x, params) and self._do_layer_adaptation(x)
    if not should_adapt:
        return old_update(x, new_x)

    step = new_x - x
    # Clip the learning rate into [epsilon, 1e10] so dividing by it is safe.
    safe_lr = K.clip(self.learning_rate, K.epsilon(), 1e10)
    weight_norm = tf.norm(x)
    step_norm = tf.norm(step / safe_lr)

    has_weight = weight_norm > 0.
    # Only trust the norm ratio when the step norm is non-negligible.
    guarded_ratio = K.switch(
        step_norm > K.epsilon(), weight_norm / step_norm, 1.
    )
    scale = K.switch(has_weight, guarded_ratio, 1.)

    return old_update(x, x + step * scale)
def new_update(x, new_x):
    """Forward an update to ``old_update``, rescaling the step when adapting.

    Adaptation applies only when ``x`` is the very object ``var`` and
    ``self._do_layer_adaptation(x)`` is truthy. In that case the step
    ``new_x - x`` is multiplied by ``norm(x) / norm(step / lr)``, where
    ``lr`` is ``self._decayed_lr(...)`` for the base dtype of ``x``, clipped
    into ``[K.epsilon(), 1e10]``. The ratio falls back to 1 when ``norm(x)``
    is not positive or when the step norm does not exceed ``K.epsilon()``.

    Returns whatever ``old_update`` returns.
    """
    should_adapt = x is var and self._do_layer_adaptation(x)
    if not should_adapt:
        return old_update(x, new_x)

    step = new_x - x
    decayed_lr = self._decayed_lr(x.dtype.base_dtype)
    # Clip the learning rate so dividing by it below is safe.
    safe_lr = K.clip(decayed_lr, K.epsilon(), 1e10)
    weight_norm = tf.norm(x)
    step_norm = tf.norm(step / safe_lr)

    has_weight = weight_norm > 0.
    # Only trust the norm ratio when the step norm is non-negligible.
    guarded_ratio = K.switch(
        step_norm > K.epsilon(), weight_norm / step_norm, 1.
    )
    scale = K.switch(has_weight, guarded_ratio, 1.)

    return old_update(x, x + step * scale)
def __init__(
    self,
    center=True,
    scale=True,
    conditional=False,
    hidden_units=None,
    hidden_activation='linear',
    hidden_initializer='glorot_uniform',
    **kwargs
):
    """Record the layer configuration on the instance.

    Args:
        center: Stored as ``self.center``.
        scale: Stored as ``self.scale``.
        conditional: Stored as ``self.conditional``.
        hidden_units: Stored as ``self.hidden_units``.
        hidden_activation: Activation identifier, resolved through
            ``activations.get`` and stored as ``self.hidden_activation``.
        hidden_initializer: Initializer identifier, resolved through
            ``initializers.get`` and stored as ``self.hidden_initializer``.
        **kwargs: Forwarded unchanged to the parent class constructor.

    NOTE(review): how these flags are consumed is not visible from this
    method; check the rest of the class for their exact effect.
    """
    super(LayerNormalization, self).__init__(**kwargs)

    # Plain configuration values, kept exactly as supplied.
    self.center = center
    self.scale = scale
    self.conditional = conditional
    self.hidden_units = hidden_units

    # Identifiers are resolved into objects by the Keras lookup helpers.
    self.hidden_activation = activations.get(hidden_activation)
    self.hidden_initializer = initializers.get(hidden_initializer)

    # The layer's epsilon is the square of the backend fuzz factor.
    backend_eps = K.epsilon()
    self.epsilon = backend_eps * backend_eps