def _AdafactorDecayRateAdam(beta2):
  """Second-moment decay rate like Adam, subsuming the correction factor.

  Args:
    beta2: a float between 0 and 1

  Returns:
    a scalar
  """
  t = _StepNum() + 1.0
  decay = beta2 * (1.0 - tf.pow(beta2, t - 1.0)) / (1.0 - tf.pow(beta2, t))
  # decay = tf.cond(tf.equal(t, 1.0), lambda: beta2, lambda: decay)
  return decay
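# Not from the source: a minimal NumPy sketch checking the identity the
# docstring above relies on. Running an EMA with the per-step rate
# beta2 * (1 - beta2^(t-1)) / (1 - beta2^t) reproduces Adam's bias-corrected
# second moment v_t / (1 - beta2^t), so no separate correction factor is needed.
import numpy as np

beta2 = 0.999
grads = np.random.RandomState(0).randn(100)
v = 0.0  # plain Adam second-moment accumulator
u = 0.0  # accumulator driven by the subsumed decay rate
for t, g in enumerate(grads, start=1):
  v = beta2 * v + (1 - beta2) * g**2
  decay = beta2 * (1 - beta2**(t - 1)) / (1 - beta2**t)
  u = decay * u + (1 - decay) * g**2
  assert abs(u - v / (1 - beta2**t)) < 1e-9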
def Value(self):
  p = self.params
  num_decays = tf.floor(
      tf.div(
          tf.cast(py_utils.GetGlobalStep(), tf.float32),
          float(p.num_steps_per_decay)))
  return tf.pow(p.decay, num_decays)
def Value(self, step=None):
  p = self.params
  num_decays = tf.floor(
      tf.div(
          tf.cast(self.GetStep(step), tf.float32),
          float(p.num_steps_per_decay)))
  return tf.pow(p.decay, num_decays)
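# Not from the source: a small sketch of the schedule the Value() variants
# above implement, value = decay ** floor(step / num_steps_per_decay), i.e. a
# staircase exponential decay. Parameter values here are illustrative.
import numpy as np

decay, num_steps_per_decay = 0.5, 1000
for step in [0, 999, 1000, 2500]:
  value = decay ** np.floor(step / num_steps_per_decay)
  print(step, value)  # -> 1.0, 1.0, 0.5, 0.25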
def _AdafactorDecayRatePow(exponent):
  """Second moment decay rate where memory-length grows as step_num^exponent.

  Args:
    exponent: a float between 0 and 1

  Returns:
    a scalar
  """
  return 1.0 - tf.pow((_StepNum() + 1.0), -exponent)
def _AdafactorDecayRatePow(exponent, offset=0):
  """Second moment decay rate where memory-length grows as step_num^exponent.

  Args:
    exponent: a float between 0 and 1
    offset: an optional integer

  Returns:
    a scalar
  """
  return 1.0 - tf.pow((_StepNum() - offset + 1.0), -exponent)
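# Not from the source: a quick numeric illustration of the docstring's claim.
# With decay_t = 1 - (t - offset + 1)^(-exponent), the effective memory length
# of the running average, roughly 1 / (1 - decay_t), equals
# (t - offset + 1)^exponent, so it grows polynomially with the step number.
import numpy as np

exponent, offset = 0.8, 0
for t in [0, 10, 1000, 100000]:
  decay = 1.0 - (t - offset + 1.0) ** (-exponent)
  print(t, decay, 1.0 / (1.0 - decay))  # memory length == (t + 1)^0.8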
def _generalized_inverse_pth_root(self, input_t, exponent, epsilon=1e-12):
  input_t_f64 = tf.cast(input_t, tf.float64)
  s, u, v = tf.linalg.svd(
      input_t_f64 +
      tf.eye(tf.shape(input_t_f64)[0], dtype=tf.float64) * epsilon,
      full_matrices=True)
  inv_s = tf.reshape(
      tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64)), [1, -1])
  val = tf.matmul(u * inv_s, v, adjoint_b=True)
  return tf.cast(val, tf.float32), tf.reduce_max(tf.abs(u - v))
def inverse_pth_root(self, input_t, exponent, epsilon=1e-12):
  input_t_f64 = tf.cast(input_t, tf.float64)
  s, u, v = tf.linalg.svd(
      input_t_f64 +
      tf.eye(tf.shape(input_t_f64)[0], dtype=tf.float64) * epsilon,
      full_matrices=True)
  val = tf.matmul(
      tf.matmul(
          u,
          tf.linalg.tensor_diag(
              tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64)))),
      tf.transpose(v))
  return tf.cast(val, tf.float32), tf.reduce_max(tf.abs(u - v))
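# Not from the source: an equivalent NumPy form of the matrix power both
# SVD-based methods above construct, U @ diag(max(s, eps)^exponent) @ V^T of
# (input + eps * I). Handy as a sanity check on a small symmetric PSD matrix;
# the function name below is illustrative, not part of the library.
import numpy as np

def svd_pth_power(mat, exponent, epsilon=1e-12):
  mat = mat.astype(np.float64) + epsilon * np.eye(mat.shape[0])
  u, s, vt = np.linalg.svd(mat)
  return (u * np.maximum(s, epsilon) ** exponent) @ vt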
def _setup_sparsity(self):
  begin_step = self._spec.sparsity_function_begin_step
  end_step = self._spec.sparsity_function_end_step
  initial_sparsity = self._spec.initial_sparsity
  target_sparsity = self._spec.target_sparsity
  exponent = self._spec.sparsity_function_exponent

  with tf.name_scope(self._spec.name):
    p = tf.minimum(
        1.0,
        tf.maximum(
            0.0,
            tf.div(
                tf.cast(self._global_step - begin_step, tf.float32),
                end_step - begin_step)))
    sparsity = tf.add(
        tf.multiply(initial_sparsity - target_sparsity, tf.pow(1 - p,
                                                               exponent)),
        target_sparsity,
        name='sparsity')

  return sparsity
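# Not from the source: the polynomial sparsity schedule that _setup_sparsity
# builds, written out in NumPy. Progress p is clipped to [0, 1] and sparsity
# ramps from initial_sparsity to target_sparsity as (1 - p)^exponent decays.
# The helper name and parameter values below are illustrative only.
import numpy as np

def sparsity_at(step, begin_step=0, end_step=10000,
                initial_sparsity=0.0, target_sparsity=0.9, exponent=3.0):
  p = np.clip((step - begin_step) / (end_step - begin_step), 0.0, 1.0)
  return (initial_sparsity - target_sparsity) * (1 - p) ** exponent + target_sparsity

# Sparsity rises from 0.0 toward 0.9 and then holds at 0.9 after end_step.
print([sparsity_at(s) for s in [0, 5000, 10000, 20000]])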
def inlined_matrix_inverse_pth_root(mat_g,
                                    mat_g_size,
                                    alpha,
                                    iter_count=100,
                                    error_tolerance=1e-6,
                                    ridge_epsilon=1e-6):
  """Computes mat_g^alpha, where alpha = -1/p, p is one of 2, 4, or 8.

  We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

  A Schur-Newton Method for the Matrix p-th Root and its Inverse
  by Chun-Hua Guo and Nicholas J. Higham
  SIAM Journal on Matrix Analysis and Applications,
  2006, Vol. 28, No. 3 : pp. 788-804
  https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

  Args:
    mat_g: the symmetric PSD matrix whose power is to be computed
    mat_g_size: size of mat_g.
    alpha: exponent, must be -1/p for p a positive integer.
    iter_count: Maximum number of iterations.
    error_tolerance: Error indicator, useful for early termination.
    ridge_epsilon: Ridge epsilon added to make the matrix positive definite.

  Returns:
    mat_g^alpha
  """
  alpha = tf.cast(alpha, tf.float64)
  neg_alpha = -1.0 * alpha
  exponent = 1.0 / neg_alpha
  identity = tf.eye(tf.cast(mat_g_size, tf.int32), dtype=tf.float64)

  def _unrolled_mat_pow_2(mat_m):
    """Computes mat_m^2."""
    return tf.matmul(mat_m, mat_m)

  def _unrolled_mat_pow_4(mat_m):
    """Computes mat_m^4."""
    mat_pow_2 = _unrolled_mat_pow_2(mat_m)
    return tf.matmul(mat_pow_2, mat_pow_2)

  def _unrolled_mat_pow_8(mat_m):
    """Computes mat_m^8."""
    mat_pow_4 = _unrolled_mat_pow_4(mat_m)
    return tf.matmul(mat_pow_4, mat_pow_4)

  def mat_power(mat_m, p):
    """Computes mat_m^p, for p == 2 or 4 or 8.

    Args:
      mat_m: a square matrix
      p: a positive integer

    Returns:
      mat_m^p
    """
    branch_index = tf.cast(p / 2 - 1, tf.int32)
    return tf.switch_case(
        branch_index, {
            0: functools.partial(_unrolled_mat_pow_2, mat_m),
            1: functools.partial(_unrolled_mat_pow_4, mat_m),
            2: functools.partial(_unrolled_mat_pow_8, mat_m),
        })

  def _iter_condition(i, unused_mat_m, unused_mat_h, unused_old_mat_h, error,
                      run_step):
    return tf.math.logical_and(
        tf.math.logical_and(i < iter_count, error > error_tolerance), run_step)

  def _iter_body(i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step):
    mat_m_i = (1 - alpha) * identity + alpha * mat_m
    new_mat_m = tf.matmul(mat_power(mat_m_i, exponent), mat_m)
    new_mat_h = tf.matmul(mat_h, mat_m_i)
    new_error = tf.reduce_max(tf.abs(new_mat_m - identity))
    return (i + 1, new_mat_m, new_mat_h, mat_h, new_error, new_error < error)

  if mat_g_size == 1:
    # Scalar case: the power is exact, so report zero error.
    resultant_mat_h = tf.pow(mat_g + ridge_epsilon, alpha)
    error = tf.constant(0.0, dtype=tf.float64)
  else:
    damped_mat_g = mat_g + ridge_epsilon * identity
    z = (1 - 1 / alpha) / (2 * tf.norm(damped_mat_g))
    # The best value for z is
    # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
    #                 (c_max^{1-alpha} - c_min^{1-alpha})
    # where c_max and c_min are the largest and smallest singular values of
    # damped_mat_g.
    # The above estimate assumes that c_max > c_min * 2^p. (p = -1/alpha)
    # Can replace above line by the one below, but it is less accurate,
    # hence needs more iterations to converge.
    # z = (1 - 1/alpha) / tf.trace(damped_mat_g)
    # If we want the method to always converge, use z = 1 / norm(damped_mat_g)
    # or z = 1 / tf.trace(damped_mat_g), but these can result in many
    # extra iterations.
    new_mat_m_0 = damped_mat_g * z
    new_error = tf.reduce_max(tf.abs(new_mat_m_0 - identity))
    new_mat_h_0 = identity * tf.pow(z, neg_alpha)
    _, mat_m, mat_h, old_mat_h, error, convergence = tf.while_loop(
        _iter_condition, _iter_body,
        [0, new_mat_m_0, new_mat_h_0, new_mat_h_0, new_error, True])
    error = tf.reduce_max(tf.abs(mat_m - identity))
    is_converged = tf.cast(convergence, old_mat_h.dtype)
    resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
  return resultant_mat_h, error
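# Not from the source: a hedged usage sketch for inlined_matrix_inverse_pth_root,
# assuming the function above is in scope and TF2 eager execution. It compares
# the iterative result for alpha = -1/4 against a direct eigendecomposition of
# the same ridged PSD matrix.
import numpy as np
import tensorflow as tf

rng = np.random.RandomState(0)
a = rng.randn(8, 8)
gram = a @ a.T                                   # symmetric PSD test matrix
mat_g = tf.constant(gram, dtype=tf.float64)

approx, err = inlined_matrix_inverse_pth_root(mat_g, 8, alpha=-0.25)

w, v = np.linalg.eigh(gram + 1e-6 * np.eye(8))   # (G + ridge * I)^(-1/4)
exact = (v * w ** -0.25) @ v.T
print(float(err), np.max(np.abs(approx.numpy() - exact)))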
def FProp(self, theta, current_step):
  p = self.params
  num_decays = tf.floor(
      tf.div(
          tf.cast(current_step, tf.float32), float(p.num_steps_per_decay)))
  return tf.pow(p.decay, num_decays)
def Fn(x):
  return tf.math.maximum(tf.pow(p.factor, x), p.lower_bound)
def Value(self):
  p = self.params
  x = tf.cast(py_utils.GetGlobalStep(), dtype=p.dtype)
  return tf.math.maximum(tf.pow(p.factor, x), p.lower_bound)
        tf.nn.relu,
    'RELU6': tf.nn.relu6,
    'LEAKY_RELU': tf.nn.leaky_relu,
    'SIGMOID': tf.sigmoid,
    'TANH': tf.tanh,
    'GELU': tf.nn.gelu,
    'GELU_APPROXIMATE': lambda x: tf.nn.gelu(x, approximate=True),
    'GELU_RAW': lambda x: 0.5 * x * (  # pylint: disable=g-long-lambda
        1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))),
    'SWISH': tf.nn.swish,
    'SOFTPLUS': tf.nn.softplus,
    # Squared ReLU from the Primer paper: https://arxiv.org/abs/2109.08668
    'SQUARED_RELU': lambda x: tf.math.square(tf.nn.relu(x)),
    'SILU': tf.nn.silu,
    'NONE': tf.identity,
}

_ACTIVATIONS_FLOPS = {
    'NONE': 0,
def Value(self, step=None):
  p = self.params
  x = tf.cast(self.GetStep(step), dtype=p.dtype)
  return tf.math.maximum(tf.pow(p.factor, x), p.lower_bound)