Example #1
def _AdafactorDecayRateAdam(beta2):
    """Second-moment decay rate like Adam, subsuming the correction factor.

    Args:
      beta2: a float between 0 and 1

    Returns:
      a scalar
    """
    t = _StepNum() + 1.0
    decay = beta2 * (1.0 - tf.pow(beta2, t - 1.0)) / (1.0 - tf.pow(beta2, t))
    # decay = tf.cond(tf.equal(t, 1.0), lambda: beta2, lambda: decay)
    return decay
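Example #1 folds Adam's bias-correction factor into the per-step decay: at step t the effective rate is beta2 * (1 - beta2^(t-1)) / (1 - beta2^t), which is 0 at the first step and approaches beta2 as t grows. A minimal standalone NumPy sketch of the same formula (the function name here is mine, not the library's):

import numpy as np

def adafactor_decay_rate_adam(beta2, step):
  # beta2 * (1 - beta2^(t-1)) / (1 - beta2^t) with t = step + 1:
  # 0 at the first step, approaching beta2 as training proceeds.
  t = step + 1.0
  return beta2 * (1.0 - beta2 ** (t - 1.0)) / (1.0 - beta2 ** t)

print(adafactor_decay_rate_adam(0.999, np.arange(5.0)))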
Example #2
 def Value(self):
   p = self.params
   num_decays = tf.floor(
       tf.div(
           tf.cast(py_utils.GetGlobalStep(), tf.float32),
           float(p.num_steps_per_decay)))
   return tf.pow(p.decay, num_decays)
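Examples #2, #3, and #10 implement the same piecewise-constant schedule: the value is decay ** floor(step / num_steps_per_decay). A standalone NumPy sketch (the parameter values here are illustrative, not the library defaults):

import numpy as np

def staircase_decay(step, decay=0.5, num_steps_per_decay=1000):
  # Halves the value (for decay=0.5) every num_steps_per_decay steps.
  num_decays = np.floor(step / float(num_steps_per_decay))
  return decay ** num_decays

print(staircase_decay(np.array([0.0, 999.0, 1000.0, 2500.0])))  # [1.0, 1.0, 0.5, 0.25]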
Example #3
 def Value(self, step=None):
   p = self.params
   num_decays = tf.floor(
       tf.div(
           tf.cast(self.GetStep(step), tf.float32),
           float(p.num_steps_per_decay)))
   return tf.pow(p.decay, num_decays)
Example #4
def _AdafactorDecayRatePow(exponent):
    """Second moment decay rate where memory-length grows as step_num^exponent.

    Args:
      exponent: a float between 0 and 1

    Returns:
      a scalar
    """
    return 1.0 - tf.pow((_StepNum() + 1.0), -exponent)
Example #5
def _AdafactorDecayRatePow(exponent, offset=0):
    """Second moment decay rate where memory-length grows as step_num^exponent.

    Args:
      exponent: a float between 0 and 1
      offset: an optional integer

    Returns:
      a scalar
    """
    return 1.0 - tf.pow((_StepNum() - offset + 1.0), -exponent)
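Examples #4 and #5 implement the power-law decay 1 - (step - offset + 1) ** (-exponent); the decay tends to 1 over time, so the effective averaging window grows roughly as step ** exponent. A NumPy sketch of that formula (the standalone function name is an assumption):

import numpy as np

def adafactor_decay_rate_pow(exponent, step, offset=0):
  # 0 at the first step, approaching 1.0 as the step count grows.
  return 1.0 - (step - offset + 1.0) ** (-exponent)

print(adafactor_decay_rate_pow(0.8, np.array([0.0, 10.0, 100.0, 1000.0])))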
Example #6
 def _generalized_inverse_pth_root(self, input_t, exponent, epsilon=1e-12):
     input_t_f64 = tf.cast(input_t, tf.float64)
     s, u, v = tf.linalg.svd(
         input_t_f64 +
         tf.eye(tf.shape(input_t_f64)[0], dtype=tf.float64) * epsilon,
         full_matrices=True)
     inv_s = tf.reshape(
         tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64)),
         [1, -1])
     val = tf.matmul(u * inv_s, v, adjoint_b=True)
     return tf.cast(val, tf.float32), tf.reduce_max(tf.abs(u - v))
Example #7
 def inverse_pth_root(self, input_t, exponent, epsilon=1e-12):
   input_t_f64 = tf.cast(input_t, tf.float64)
   s, u, v = tf.linalg.svd(
       input_t_f64 +
       tf.eye(tf.shape(input_t_f64)[0], dtype=tf.float64) * epsilon,
       full_matrices=True)
   val = tf.matmul(
       tf.matmul(
           u,
           tf.linalg.tensor_diag(
               tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64)))),
       tf.transpose(v))
   return tf.cast(val, tf.float32), tf.reduce_max(tf.abs(u - v))
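Examples #6 and #7 compute the same quantity two ways: take the SVD of the slightly ridged symmetric PSD matrix, raise the singular values to the requested exponent, and reassemble U diag(s**exponent) V^T; the returned max |U - V| serves as a cheap consistency diagnostic. A self-contained TF2 sketch of that construction (function and variable names are mine):

import tensorflow as tf

def svd_pth_power(mat, exponent, epsilon=1e-12):
  mat64 = tf.cast(mat, tf.float64)
  ridged = mat64 + tf.eye(tf.shape(mat64)[0], dtype=tf.float64) * epsilon
  s, u, v = tf.linalg.svd(ridged)
  # Raise singular values to the exponent and rebuild u @ diag(s**e) @ v^T.
  powered = tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64))
  return tf.matmul(u * powered, v, adjoint_b=True)

mat = tf.constant([[4.0, 0.0], [0.0, 9.0]])
print(svd_pth_power(mat, -0.5))  # ~diag(0.5, 1/3), the inverse square root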
Example #8
    def _setup_sparsity(self):
        begin_step = self._spec.sparsity_function_begin_step
        end_step = self._spec.sparsity_function_end_step
        initial_sparsity = self._spec.initial_sparsity
        target_sparsity = self._spec.target_sparsity
        exponent = self._spec.sparsity_function_exponent

        with tf.name_scope(self._spec.name):
            p = tf.minimum(
                1.0,
                tf.maximum(
                    0.0,
                    tf.div(tf.cast(self._global_step - begin_step, tf.float32),
                           end_step - begin_step)))
            sparsity = tf.add(tf.multiply(initial_sparsity - target_sparsity,
                                          tf.pow(1 - p, exponent)),
                              target_sparsity,
                              name='sparsity')

        return sparsity
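Example #8 is the polynomial sparsity ramp used for gradual pruning: sparsity moves from initial_sparsity to target_sparsity as target + (initial - target) * (1 - progress) ** exponent, with progress clipped to [0, 1]. A NumPy sketch with illustrative parameter values:

import numpy as np

def gradual_sparsity(step, begin_step=0, end_step=10000,
                     initial_sparsity=0.0, target_sparsity=0.9, exponent=3.0):
  # Fraction of the ramp completed, clipped to [0, 1].
  progress = np.clip((step - begin_step) / float(end_step - begin_step), 0.0, 1.0)
  return (initial_sparsity - target_sparsity) * (1.0 - progress) ** exponent + target_sparsity

print(gradual_sparsity(np.array([0.0, 5000.0, 10000.0, 20000.0])))  # [0., 0.7875, 0.9, 0.9]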
Example #9
def inlined_matrix_inverse_pth_root(mat_g,
                                    mat_g_size,
                                    alpha,
                                    iter_count=100,
                                    error_tolerance=1e-6,
                                    ridge_epsilon=1e-6):
    """Computes mat_g^alpha, where alpha = -1/p, p is one of 2, 4, or 8.

  We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

  A Schur-Newton Method for the Matrix p-th Root and its Inverse
  by Chun-Hua Guo and Nicholas J. Higham
  SIAM Journal on Matrix Analysis and Applications,
  2006, Vol. 28, No. 3 : pp. 788-804
  https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

  Args:
    mat_g: the symmetric PSD matrix whose power it to be computed
    mat_g_size: size of mat_g.
    alpha: exponent, must be -1/p for p a positive integer.
    iter_count: Maximum number of iterations.
    error_tolerance: Error indicator, useful for early termination.
    ridge_epsilon: Ridge epsilon added to make the matrix positive definite.

  Returns:
    mat_g^alpha
  """
    alpha = tf.cast(alpha, tf.float64)
    neg_alpha = -1.0 * alpha
    exponent = 1.0 / neg_alpha
    identity = tf.eye(tf.cast(mat_g_size, tf.int32), dtype=tf.float64)

    def _unrolled_mat_pow_2(mat_m):
        """Computes mat_m^2."""
        return tf.matmul(mat_m, mat_m)

    def _unrolled_mat_pow_4(mat_m):
        """Computes mat_m^4."""
        mat_pow_2 = _unrolled_mat_pow_2(mat_m)
        return tf.matmul(mat_pow_2, mat_pow_2)

    def _unrolled_mat_pow_8(mat_m):
        """Computes mat_m^8."""
        mat_pow_4 = _unrolled_mat_pow_4(mat_m)
        return tf.matmul(mat_pow_4, mat_pow_4)

    def mat_power(mat_m, p):
        """Computes mat_m^p, for p == 2 or 4 or 8.

        Args:
          mat_m: a square matrix
          p: a positive integer

        Returns:
          mat_m^p
        """
        branch_index = tf.cast(p / 2 - 1, tf.int32)
        return tf.switch_case(
            branch_index, {
                0: functools.partial(_unrolled_mat_pow_2, mat_m),
                1: functools.partial(_unrolled_mat_pow_4, mat_m),
                2: functools.partial(_unrolled_mat_pow_8, mat_m),
            })

    def _iter_condition(i, unused_mat_m, unused_mat_h, unused_old_mat_h, error,
                        run_step):
        return tf.math.logical_and(
            tf.math.logical_and(i < iter_count, error > error_tolerance),
            run_step)

    def _iter_body(i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step):
        mat_m_i = (1 - alpha) * identity + alpha * mat_m
        new_mat_m = tf.matmul(mat_power(mat_m_i, exponent), mat_m)
        new_mat_h = tf.matmul(mat_h, mat_m_i)
        new_error = tf.reduce_max(tf.abs(new_mat_m - identity))
        return (i + 1, new_mat_m, new_mat_h, mat_h, new_error,
                new_error < error)

    if mat_g_size == 1:
        # Scalar case: the inverse p-th root is just an elementwise power.
        resultant_mat_h = tf.pow(mat_g + ridge_epsilon, alpha)
        error = tf.constant(0.0, dtype=tf.float64)
    else:
        damped_mat_g = mat_g + ridge_epsilon * identity
        z = (1 - 1 / alpha) / (2 * tf.norm(damped_mat_g))
        # The best value for z is
        # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
        #                 (c_max^{1-alpha} - c_min^{1-alpha})
        # where c_max and c_min are the largest and smallest singular values of
        # damped_mat_g.
        # The above estimate assumes that c_max > c_min * 2^p. (p = -1/alpha)
        # Can replace above line by the one below, but it is less accurate,
        # hence needs more iterations to converge.
        # z = (1 - 1/alpha) / tf.trace(damped_mat_g)
        # If we want the method to always converge, use z = 1 / norm(damped_mat_g)
        # or z = 1 / tf.trace(damped_mat_g), but these can result in many
        # extra iterations.
        new_mat_m_0 = damped_mat_g * z
        new_error = tf.reduce_max(tf.abs(new_mat_m_0 - identity))
        new_mat_h_0 = identity * tf.pow(z, neg_alpha)
        _, mat_m, mat_h, old_mat_h, error, convergence = tf.while_loop(
            _iter_condition, _iter_body,
            [0, new_mat_m_0, new_mat_h_0, new_mat_h_0, new_error, True])
        error = tf.reduce_max(tf.abs(mat_m - identity))
        is_converged = tf.cast(convergence, old_mat_h.dtype)
        resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
    return resultant_mat_h, error
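As a sanity check on the graph code above, here is the same coupled Schur-Newton iteration (equation 3.2 of Guo & Higham) written directly in NumPy; the names and the convergence check are mine, and the initial scaling z mirrors the one used above:

import numpy as np

def inverse_pth_root_newton(mat, p, iter_count=100, error_tolerance=1e-6,
                            ridge_epsilon=1e-6):
  identity = np.eye(mat.shape[0])
  damped = mat + ridge_epsilon * identity
  alpha = -1.0 / p
  z = (1.0 - 1.0 / alpha) / (2.0 * np.linalg.norm(damped))
  mat_m = damped * z                  # converges to the identity
  mat_h = identity * z ** (1.0 / p)   # converges to mat^(-1/p)
  for _ in range(iter_count):
    mat_m_i = (1.0 - alpha) * identity + alpha * mat_m
    mat_m = np.linalg.matrix_power(mat_m_i, p) @ mat_m
    mat_h = mat_h @ mat_m_i
    if np.max(np.abs(mat_m - identity)) < error_tolerance:
      break
  return mat_h

print(inverse_pth_root_newton(np.diag([4.0, 9.0, 16.0]), p=2))  # ~diag(1/2, 1/3, 1/4)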
Example #10
 def FProp(self, theta, current_step):
   p = self.params
   num_decays = tf.floor(
       tf.div(tf.cast(current_step, tf.float32), float(p.num_steps_per_decay)))
   return tf.pow(p.decay, num_decays)
Example #11
 def Fn(x):
   return tf.math.maximum(tf.pow(p.factor, x), p.lower_bound)
Example #12
 def Value(self):
     p = self.params
     x = tf.cast(py_utils.GetGlobalStep(), dtype=p.dtype)
     return tf.math.maximum(tf.pow(p.factor, x), p.lower_bound)
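Examples #11, #12, and #14 all evaluate the same schedule: factor ** step, clipped from below at lower_bound. A NumPy sketch with illustrative values:

import numpy as np

def bounded_exponential_decay(step, factor=0.99, lower_bound=0.01):
  # Exponential decay that never drops below lower_bound.
  return np.maximum(factor ** step, lower_bound)

print(bounded_exponential_decay(np.array([0.0, 100.0, 1000.0])))  # [1.0, ~0.366, 0.01]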
Example #13
    'RELU':
    tf.nn.relu,
    'RELU6':
    tf.nn.relu6,
    'LEAKY_RELU':
    tf.nn.leaky_relu,
    'SIGMOID':
    tf.sigmoid,
    'TANH':
    tf.tanh,
    'GELU':
    tf.nn.gelu,
    'GELU_APPROXIMATE':
    lambda x: tf.nn.gelu(x, approximate=True),
    'GELU_RAW':
    lambda x: 0.5 * x * (  # pylint: disable=g-long-lambda
        1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))),
    'SWISH':
    tf.nn.swish,
    'SOFTPLUS':
    tf.nn.softplus,
    # Squared ReLU from the Primer paper: https://arxiv.org/abs/2109.08668
    'SQUARED_RELU':
    lambda x: tf.math.square(tf.nn.relu(x)),
    'SILU':
    tf.nn.silu,
    'NONE':
    tf.identity,
}

_ACTIVATIONS_FLOPS = {
    'NONE': 0,
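The 'GELU_RAW' entry above is the tanh approximation of GELU, 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), which is the same formula tf.nn.gelu applies when approximate=True. A quick TF2 check of that equivalence:

import numpy as np
import tensorflow as tf

x = tf.constant([-2.0, -0.5, 0.0, 0.5, 2.0])
gelu_raw = 0.5 * x * (1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
print(tf.reduce_max(tf.abs(gelu_raw - tf.nn.gelu(x, approximate=True))))  # ~0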
Example #14
 def Value(self, step=None):
   p = self.params
   x = tf.cast(self.GetStep(step), dtype=p.dtype)
   return tf.math.maximum(tf.pow(p.factor, x), p.lower_bound)