Example #1
    def decayed_lr():
      """Helper to recompute learning rate; most helpful in eager-mode."""
      global_step_recomp = math_ops.cast(global_step, dtype)
      completed_fraction = global_step_recomp / first_decay_steps

      def compute_step(completed_fraction, geometric=False):
        """Helper for `cond` operation."""
        if geometric:
          i_restart = math_ops.floor(
              math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
              math_ops.log(t_mul))

          sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
          completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart

        else:
          i_restart = math_ops.floor(completed_fraction)
          completed_fraction -= i_restart

        return i_restart, completed_fraction

      i_restart, completed_fraction = control_flow_ops.cond(
          math_ops.equal(t_mul, 1.0),
          lambda: compute_step(completed_fraction, geometric=False),
          lambda: compute_step(completed_fraction, geometric=True))

      m_fac = m_mul**i_restart
      cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
          constant_op.constant(math.pi) * completed_fraction))
      decayed = (1 - alpha) * cosine_decayed + alpha

      return math_ops.multiply(learning_rate, decayed, name=name)
Example #2
def _SinGrad(op, grad):
  """Returns grad * cos(x)."""
  x = op.inputs[0]
  with ops.control_dependencies([grad.op]):
    if x.dtype.is_complex:
      x = math_ops.conj(x)
    return grad * math_ops.cos(x)
Example #3
  def __call__(self, step):
    with ops.name_scope(self.name, "NoisyLinearCosineDecay",
                        [self.initial_learning_rate, step]) as name:
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      decay_steps = math_ops.cast(self.decay_steps, dtype)
      initial_variance = math_ops.cast(self.initial_variance, dtype)
      variance_decay = math_ops.cast(self.variance_decay, dtype)
      num_periods = math_ops.cast(self.num_periods, dtype)
      alpha = math_ops.cast(self.alpha, dtype)
      beta = math_ops.cast(self.beta, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      variance = initial_variance / (
          math_ops.pow(1.0 + global_step_recomp, variance_decay))
      std = math_ops.sqrt(variance)
      noisy_linear_decayed = (
          linear_decayed + random_ops.random_normal(
              linear_decayed.shape, stddev=std))

      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
      noisy_linear_cosine_decayed = (
          (alpha + noisy_linear_decayed) * cosine_decayed + beta)

      return math_ops.multiply(
          initial_learning_rate, noisy_linear_cosine_decayed, name=name)
Example #4
def _TanGrad(op, grad):
  """Returns grad * 1/sec^2(x)."""
  x = op.inputs[0]
  with ops.control_dependencies([grad.op]):
    secx = math_ops.inv(math_ops.cos(x))
    secx2 = math_ops.square(secx)
    return grad * secx2
Example #5
  def decayed_lr(learning_rate, global_step, decay_steps, initial_variance,
                 variance_decay, num_periods, alpha, beta, name):
    """Helper to recompute learning rate; most helpful in eager-mode."""
    with ops.name_scope(name, "NoisyLinearCosineDecay",
                        [learning_rate, global_step]) as name:
      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
      dtype = learning_rate.dtype
      decay_steps = math_ops.cast(decay_steps, dtype)
      initial_variance = math_ops.cast(initial_variance, dtype)
      variance_decay = math_ops.cast(variance_decay, dtype)
      num_periods = math_ops.cast(num_periods, dtype)
      alpha = math_ops.cast(alpha, dtype)
      beta = math_ops.cast(beta, dtype)

      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      variance = initial_variance / (
          math_ops.pow(1.0 + global_step_recomp, variance_decay))
      std = math_ops.sqrt(variance)
      noisy_linear_decayed = (
          linear_decayed + random_ops.random_normal(
              linear_decayed.shape, stddev=std))

      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
      noisy_linear_cosine_decayed = (
          (alpha + noisy_linear_decayed) * cosine_decayed + beta)

      return math_ops.multiply(
          learning_rate, noisy_linear_cosine_decayed, name=name)
Example #6
def _raised_cosine_window(name, default_name, window_length, periodic,
                          dtype, a, b):
  """Helper function for computing a raised cosine window.

  Args:
    name: Name to use for the scope.
    default_name: Default name to use for the scope.
    window_length: A scalar `Tensor` or integer indicating the window length.
    periodic: A bool `Tensor` indicating whether to generate a periodic or
      symmetric window.
    dtype: A floating point `DType`.
    a: The alpha parameter to the raised cosine window.
    b: The beta parameter to the raised cosine window.

  Returns:
    A `Tensor` of shape `[window_length]` of type `dtype`.

  Raises:
    ValueError: If `dtype` is not a floating point type or `window_length` is
      not scalar or `periodic` is not scalar.
  """
  if not dtype.is_floating:
    raise ValueError('dtype must be a floating point type. Found %s' % dtype)

  with ops.name_scope(name, default_name, [window_length, periodic]):
    window_length = ops.convert_to_tensor(window_length, dtype=dtypes.int32,
                                          name='window_length')
    window_length.shape.assert_has_rank(0)
    window_length_const = tensor_util.constant_value(window_length)
    if window_length_const == 1:
      return array_ops.ones([1], dtype=dtype)
    periodic = math_ops.cast(
        ops.convert_to_tensor(periodic, dtype=dtypes.bool, name='periodic'),
        dtypes.int32)
    periodic.shape.assert_has_rank(0)
    even = 1 - math_ops.mod(window_length, 2)

    n = math_ops.cast(window_length + periodic * even - 1, dtype=dtype)
    count = math_ops.cast(math_ops.range(window_length), dtype)
    cos_arg = constant_op.constant(2 * np.pi, dtype=dtype) * count / n

    if window_length_const is not None:
      return math_ops.cast(a - b * math_ops.cos(cos_arg), dtype=dtype)
    return control_flow_ops.cond(
        math_ops.equal(window_length, 1),
        lambda: array_ops.ones([1], dtype=dtype),
        lambda: math_ops.cast(a - b * math_ops.cos(cos_arg), dtype=dtype))
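For reference, the two standard raised cosine windows fall out of the `a` and `b` parameters: `a = b = 0.5` gives a Hann window and `a = 0.54, b = 0.46` gives a Hamming window. A minimal NumPy sketch of the same formula (hypothetical helper, not part of the module above):

import numpy as np

def raised_cosine_window_np(window_length, a, b, periodic=True):
  # Plain NumPy version of the formula above (window_length > 1 assumed;
  # the helper above special-cases a length-1 window to [1.0]).
  even = 1 - (window_length % 2)
  n = window_length + (even if periodic else 0) - 1
  count = np.arange(window_length)
  return a - b * np.cos(2.0 * np.pi * count / n)

hann = raised_cosine_window_np(8, 0.5, 0.5)       # Hann window
hamming = raised_cosine_window_np(8, 0.54, 0.46)  # Hamming window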
Example #7
def angles_to_projective_transforms(angles,
                                    image_height,
                                    image_width,
                                    name=None):
    """Returns projective transform(s) for the given angle(s).

  Args:
    angles: A scalar angle to rotate all images by, or (for batches of images)
        a vector with an angle to rotate each image in the batch. The rank must
        be statically known (the shape is not `TensorShape(None)`).
    image_height: Height of the image(s) to be transformed.
    image_width: Width of the image(s) to be transformed.

  Returns:
    A tensor of shape (num_images, 8). Projective transforms which can be given
      to `tf.contrib.image.transform`.
  """
    with ops.name_scope(name, "angles_to_projective_transforms"):
        angle_or_angles = ops.convert_to_tensor(angles,
                                                name="angles",
                                                dtype=dtypes.float32)
        if len(angle_or_angles.get_shape()) == 0:  # pylint: disable=g-explicit-length-test
            angles = angle_or_angles[None]
        elif len(angle_or_angles.get_shape()) == 1:
            angles = angle_or_angles
        else:
            raise TypeError("Angles should have rank 0 or 1.")
        x_offset = ((image_width - 1) -
                    (math_ops.cos(angles) *
                     (image_width - 1) - math_ops.sin(angles) *
                     (image_height - 1))) / 2.0
        y_offset = ((image_height - 1) -
                    (math_ops.sin(angles) *
                     (image_width - 1) + math_ops.cos(angles) *
                     (image_height - 1))) / 2.0
        num_angles = array_ops.shape(angles)[0]
        return array_ops.concat(values=[
            math_ops.cos(angles)[:, None],
            -math_ops.sin(angles)[:, None],
            x_offset[:, None],
            math_ops.sin(angles)[:, None],
            math_ops.cos(angles)[:, None],
            y_offset[:, None],
            array_ops.zeros((num_angles, 2), dtypes.float32),
        ],
                                axis=1)
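As a sanity check on the transform layout built above, a rotation by angle 0 should yield the identity transform `[1, 0, 0, 0, 1, 0, 0, 0]`. A small NumPy sketch of the same construction for a single angle (hypothetical helper, mirroring the offsets computed above):

import numpy as np

def rotation_transform_np(angle, image_height, image_width):
  # Mirrors the x_offset / y_offset math above for one scalar angle.
  cos_a, sin_a = np.cos(angle), np.sin(angle)
  x_offset = ((image_width - 1) - (cos_a * (image_width - 1) -
                                   sin_a * (image_height - 1))) / 2.0
  y_offset = ((image_height - 1) - (sin_a * (image_width - 1) +
                                    cos_a * (image_height - 1))) / 2.0
  return np.array([cos_a, -sin_a, x_offset, sin_a, cos_a, y_offset, 0.0, 0.0])

print(rotation_transform_np(0.0, 32, 32))  # -> [1. 0. 0. 0. 1. 0. 0. 0.]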
Example #8
 def testBackwardOverForward(self, forward_prop_first):
     c = constant_op.constant(1.)
     # Watching depends on nesting, not creation order
     if forward_prop_first:
         forward_accumulator = forwardprop.ForwardAccumulator(c, .1)
         gradient_tape = backprop.GradientTape()
     else:
         gradient_tape = backprop.GradientTape()
         forward_accumulator = forwardprop.ForwardAccumulator(c, .1)
     with gradient_tape as tape:
         with forward_accumulator as acc:
             tape.watch(c)
             d = math_ops.cos(c)
             self.assertTrue(tape_lib.should_record_backprop(
                 (acc.jvp(d), )))
         self.assertAllClose(-.1 * math_ops.cos(1.),
                             tape.gradient(acc.jvp(d), c))
Example #9
def _TanGrad(op, grad):
    """Returns grad * 1/sec^2(x)."""
    x = op.inputs[0]
    with ops.control_dependencies([grad.op]):
        x = math_ops.conj(x)
        secx = math_ops.reciprocal(math_ops.cos(x))
        secx2 = math_ops.square(secx)
        return grad * secx2
Example #10
    def RFF_map(self, input_tensor, seed, stddev, input_shape, output_dim):
        #input_tensor = tf.concat([input_tensor_1, input_tensor_2], axis=1)
        #print("Information that the adversary can get: {}".format(input_tensor))


        #random_state = check_random_state(seed)
        gamma = stddev
        omega_matrix_shape = [input_shape, output_dim]
        bias_shape = [output_dim]

        """
        This is the TensorFlow version of the RFF mapping, but I refer to the scikit-learn version.
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

        np.random.seed(9)
        self._stddev = stddev
        omega_matrix_shape = [self.arg.dim*2, output_dim]
        bias_shape = [output_dim]

        omega_matrix = constant_op.constant(
            np.random.normal(
            scale=1.0 / self._stddev, size=omega_matrix_shape),
            dtype=dtypes.float32)

        bias = constant_op.constant(
            np.random.uniform(
            low=0.0, high=2 * np.pi, size=bias_shape),
            dtype=dtypes.float32)

        x_omega_plus_bias = math_ops.add(
            math_ops.matmul(input_tensor, omega_matrix), bias)

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        """

        omega_matrix = constant_op.constant(np.sqrt(2 * gamma) *
           np.random.normal(size=omega_matrix_shape), dtype=dtypes.float32)

        bias = constant_op.constant(
            np.random.uniform(
            0.0, 2 * np.pi, size=bias_shape), dtype=dtypes.float32)

        x_omega_plus_bias = math_ops.add(
            math_ops.matmul(input_tensor, omega_matrix), bias)

        '''
        omega_matrix = constant_op.constant(np.sqrt(2 * gamma) *
           random_state.normal(size=omega_matrix_shape),dtype=dtypes.float32)

        bias = constant_op.constant(
            random_state.uniform(
            0.0, 2 * np.pi, size=bias_shape), dtype=dtypes.float32)

        x_omega_plus_bias = math_ops.add(
            math_ops.matmul(input_tensor, omega_matrix), bias)
        '''

        return math.sqrt(2.0 / output_dim) * math_ops.cos(x_omega_plus_bias)
Example #11
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
                 name=None):
  """Applies cosine decay to the learning rate.

  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
  with Warm Restarts. https://arxiv.org/abs/1608.03983

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies a cosine decay function
  to a provided initial learning rate.  It requires a `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:
  ```python
  global_step = min(global_step, decay_steps)
  cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
  decayed = (1 - alpha) * cosine_decay + alpha
  decayed_learning_rate = learning_rate * decayed
  ```

  Example usage:
  ```python
  decay_steps = 1000
  lr_decayed = cosine_decay(learning_rate, global_step, decay_steps)
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
      The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Number of steps to decay over.
    alpha: A scalar `float32` or `float64` Tensor or a Python number.
      Minimum learning rate value as a fraction of learning_rate.
    name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("cosine decay requires global_step")
  with ops.name_scope(name, "CosineDecay",
                      [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    global_step = math_ops.minimum(global_step, decay_steps)
    completed_fraction = global_step / decay_steps
    cosine_decayed = 0.5 * (
        1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))

    decayed = (1 - alpha) * cosine_decayed + alpha
    return math_ops.multiply(learning_rate, decayed)
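As a quick numeric check of the formula quoted in the docstring (a plain-Python sketch, not library code): with `alpha = 0` the decayed rate equals the initial rate at step 0, half of it halfway through `decay_steps`, and 0 at the end.

import math

def cosine_decayed_lr(learning_rate, global_step, decay_steps, alpha=0.0):
  # Pure-Python version of the formula above, for eyeballing values.
  global_step = min(global_step, decay_steps)
  completed_fraction = global_step / decay_steps
  cosine_decay = 0.5 * (1 + math.cos(math.pi * completed_fraction))
  return learning_rate * ((1 - alpha) * cosine_decay + alpha)

print(cosine_decayed_lr(0.1, 0, 1000))     # 0.1
print(cosine_decayed_lr(0.1, 500, 1000))   # ~0.05
print(cosine_decayed_lr(0.1, 1000, 1000))  # ~0.0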
Example #13
 def testBatchBackwardOverForward(self, forward_prop_first):
   x = constant_op.constant(1.)
   tangents = random_ops.random_normal(shape=[10], seed=1)
   expected = [-t * math_ops.cos(1.) for t in tangents]
   if forward_prop_first:
     batch_acc = forwardprop.ForwardAccumulator._batch_accumulator(x, tangents)
     gradient_tape = backprop.GradientTape(persistent=True)
   else:
     gradient_tape = backprop.GradientTape(persistent=True)
     batch_acc = forwardprop.ForwardAccumulator._batch_accumulator(x, tangents)
   with gradient_tape as tape:
     with batch_acc as acc:
       tape.watch(x)
       y = math_ops.cos(x)
       self.assertTrue(tape_lib.should_record_backprop((acc.jvp(y),)))
       jvps = acc.jvp(y)
     d2y_dx2 = [tape.gradient(dy_dx, x) for dy_dx in jvps]
   self.assertAllClose(expected, d2y_dx2)
Example #14
def angles_to_projective_transforms(angles,
                                    image_height,
                                    image_width,
                                    name=None):
  """Returns projective transform(s) for the given angle(s).

  Args:
    angles: A scalar angle to rotate all images by, or (for batches of images)
        a vector with an angle to rotate each image in the batch. The rank must
        be statically known (the shape is not `TensorShape(None)`).
    image_height: Height of the image(s) to be transformed.
    image_width: Width of the image(s) to be transformed.

  Returns:
    A tensor of shape (num_images, 8). Projective transforms which can be given
      to `tf.contrib.image.transform`.
  """
  with ops.name_scope(name, "angles_to_projective_transforms"):
    angle_or_angles = ops.convert_to_tensor(
        angles, name="angles", dtype=dtypes.float32)
    if len(angle_or_angles.get_shape()) == 0:  # pylint: disable=g-explicit-length-test
      angles = angle_or_angles[None]
    elif len(angle_or_angles.get_shape()) == 1:
      angles = angle_or_angles
    else:
      raise TypeError("Angles should have rank 0 or 1.")
    x_offset = ((image_width - 1) - (math_ops.cos(angles) *
                                     (image_width - 1) - math_ops.sin(angles) *
                                     (image_height - 1))) / 2.0
    y_offset = ((image_height - 1) - (math_ops.sin(angles) *
                                      (image_width - 1) + math_ops.cos(angles) *
                                      (image_height - 1))) / 2.0
    num_angles = array_ops.shape(angles)[0]
    return array_ops.concat(
        values=[
            math_ops.cos(angles)[:, None],
            -math_ops.sin(angles)[:, None],
            x_offset[:, None],
            math_ops.sin(angles)[:, None],
            math_ops.cos(angles)[:, None],
            y_offset[:, None],
            array_ops.zeros((num_angles, 2), dtypes.float32),
        ],
        axis=1)
Example #15
 def Test(self):
     np.random.seed(1)
     n = shape_[-1]
     batch_shape = shape_[:-2]
     np_dtype = dtype_.as_numpy_dtype
     a = np.random.uniform(low=-1.0, high=1.0,
                           size=n * n).reshape([n, n]).astype(np_dtype)
     if dtype_.is_complex:
         a += 1j * np.random.uniform(low=-1.0, high=1.0, size=n *
                                     n).reshape([n, n]).astype(np_dtype)
     a += np.conj(a.T)
     a = np.tile(a, batch_shape + (1, 1))
     # Optimal stepsize for central difference is O(epsilon^{1/3}).
     epsilon = np.finfo(np_dtype).eps
     delta = 0.1 * epsilon**(1.0 / 3.0)
     # tolerance obtained by looking at actual differences using
     # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
     if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64):
         tol = 1e-2
     else:
         tol = 1e-7
     with self.session(use_gpu=True):
         tf_a = constant_op.constant(a)
         if compute_v_:
             tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
             # (complex) Eigenvectors are only unique up to an arbitrary phase
             # We normalize the vectors such that the first component has phase 0.
             top_rows = tf_v[..., 0:1, :]
             if tf_a.dtype.is_complex:
                 angle = -math_ops.angle(top_rows)
                 phase = math_ops.complex(math_ops.cos(angle),
                                          math_ops.sin(angle))
             else:
                 phase = math_ops.sign(top_rows)
             tf_v *= phase
             outputs = [tf_e, tf_v]
         else:
             tf_e = linalg_ops.self_adjoint_eigvals(tf_a)
             outputs = [tf_e]
         for b in outputs:
             x_init = np.random.uniform(low=-1.0, high=1.0, size=n *
                                        n).reshape([n, n]).astype(np_dtype)
             if dtype_.is_complex:
                 x_init += 1j * np.random.uniform(
                     low=-1.0, high=1.0, size=n * n).reshape(
                         [n, n]).astype(np_dtype)
             x_init += np.conj(x_init.T)
             x_init = np.tile(x_init, batch_shape + (1, 1))
             theoretical, numerical = gradient_checker.compute_gradient(
                 tf_a,
                 tf_a.get_shape().as_list(),
                 b,
                 b.get_shape().as_list(),
                 x_init_value=x_init,
                 delta=delta)
             self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
Example #16
    def __call__(self, step):
      with ops.name_scope(self.name, "SGDRLearningRate", [self.learning_rate, step, self.t_0, self.t_mul, self.m_mul]) as name:
        learning_rate = ops.convert_to_tensor(self.learning_rate, name="learning_rate")
        dtype = learning_rate.dtype
        step = math_ops.cast(step, dtype)
        t_0 = math_ops.cast(self.t_0, dtype)
        t_mul = math_ops.cast(self.t_mul, dtype)
        m_mul = math_ops.cast(self.m_mul, dtype)

        c_one = math_ops.cast(constant_op.constant(1.0), dtype)
        c_half = math_ops.cast(constant_op.constant(0.5), dtype)
        c_pi = math_ops.cast(constant_op.constant(math.pi), dtype)

        # Find normalized value of the current step
        x_val = math_ops.div(step, t_0)

        def compute_step(x_val, geometric=False):
          if geometric:
            # Consider geometric series where t_mul != 1
            # 1 + t_mul + t_mul^2 ... = (1 - t_mul^i_restart) / (1 - t_mul)

            # First find how many restarts were performed for a given x_val
            # Find maximal integer i_restart value for which this equation holds
            # x_val >= (1 - t_mul^i_restart) / (1 - t_mul)
            # x_val * (1 - t_mul) <= (1 - t_mul^i_restart)
            # t_mul^i_restart <= (1 - x_val * (1 - t_mul))

            # TensorFlow only provides the natural logarithm, so:
            # i_restart <= log(1 - x_val * (1 - t_mul)) / log(t_mul)
            # Find how many restarts were performed

            i_restart = math_ops.floor(math_ops.log(c_one - x_val * (c_one - t_mul)) / math_ops.log(t_mul))
            # Compute the sum of all restarts before the current one
            sum_r = (c_one - t_mul ** i_restart) / (c_one - t_mul)
            # Compute our position within the current restart
            x_val = (x_val - sum_r) / t_mul ** i_restart

          else:
            # Find how many restarts were performed
            i_restart = math_ops.floor(x_val)
            # Compute our position within the current restart
            x_val = x_val - i_restart
            
          return i_restart, x_val

        i_restart, x_val = control_flow_ops.cond(
            math_ops.equal(t_mul, c_one),
            lambda: compute_step(x_val, geometric=False),
            lambda: compute_step(x_val, geometric=True)
        )

        # If m_mul < 1, the initial learning rate of every new restart is smaller,
        # i.e., scaled by a factor of m_mul ** i_restart at the i_restart-th restart.
        m_fac = learning_rate * (m_mul ** i_restart)

        return math_ops.multiply(c_half * m_fac, (math_ops.cos(x_val * c_pi) + c_one), name=name)
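The geometric-series inversion inside `compute_step` can be verified with plain Python. A hedged sketch: with `t_0 = 10` and `t_mul = 2` the restart boundaries fall at steps 10 and 30, and the helper below (hypothetical, mirroring the math above) recovers the restart index and the position within the current restart.

import math

def compute_step_py(x_val, t_mul):
  # x_val is step / t_0; returns (i_restart, position within the restart).
  if t_mul == 1.0:
    i_restart = math.floor(x_val)
    return i_restart, x_val - i_restart
  i_restart = math.floor(math.log(1.0 - x_val * (1.0 - t_mul)) / math.log(t_mul))
  sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul)
  return i_restart, (x_val - sum_r) / t_mul ** i_restart

# t_0 = 10, t_mul = 2: first restart at step 10, second at step 30.
print(compute_step_py(9.9 / 10.0, 2.0))   # (0, ~0.99)
print(compute_step_py(10.0 / 10.0, 2.0))  # (1, ~0.0)
print(compute_step_py(30.0 / 10.0, 2.0))  # (2, ~0.0)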
Example #17
    def _add_sinusoids_signal(x, time, min_timescale=1.0, max_timescale=1.0e4):
        """Adds a bunch of sinusoids of different frequencies to a Tensor.

        Each channel of the input Tensor is incremented by a sinusoid of a different
        frequency and phase.

        This allows attention to learn to use absolute and relative positions.
        Timing signals should be added to some precursors of both the query and the
        memory inputs to attention.

        The use of relative position is possible because sin(x+y) and cos(x+y) can be
        expressed in terms of y, sin(x) and cos(x).

        In particular, we use a geometric sequence of timescales starting with
        min_timescale and ending with max_timescale.  The number of different
        timescales is equal to channels / 2. For each timescale, we
        generate the two sinusoidal signals sin(timestep/timescale) and
        cos(timestep/timescale).  All of these sinusoids are concatenated in
        the channels dimension.

        Args:
          x: a Tensor with shape [batch, length, channels]
          time: a scalar timestep used to position the signal when `x` has rank 2
          min_timescale: a float
          max_timescale: a float

        Returns:
          a Tensor the same shape as x.
        """
        channels = x.get_shape().as_list()[-1]
        if x.get_shape().ndims == 3:  # [batch_size, timesteps, dim]
            length = array_ops.shape(x)[1]
            position = math_ops.to_float(math_ops.range(length))
        elif x.get_shape().ndims == 2:  # [batch_size, dim]
            length = 1
            position = math_ops.to_float(math_ops.range(time, time + 1))
        else:
            raise ValueError("need a Tensor with rank 2 or 3")
        num_timescales = channels // 2
        log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            (math_ops.to_float(num_timescales) - 1))
        inv_timescales = min_timescale * math_ops.exp(
            math_ops.to_float(math_ops.range(num_timescales)) *
            -log_timescale_increment)
        scaled_time = array_ops.expand_dims(
            position, 1) * array_ops.expand_dims(inv_timescales, 0)
        signal = array_ops.concat(
            [math_ops.sin(scaled_time),
             math_ops.cos(scaled_time)], axis=1)
        signal = array_ops.pad(signal,
                               [[0, 0], [0, math_ops.mod(channels, 2)]])
        if x.get_shape().ndims == 3:
            signal = array_ops.reshape(signal, [1, length, channels])
        else:
            signal = array_ops.reshape(signal, [1, channels])
        return x + signal
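The timescale construction described in the docstring can be reproduced with NumPy alone. A minimal sketch (hypothetical helper, assuming a rank-3 input and an even number of channels, so no padding is needed):

import numpy as np

def sinusoid_signal_np(length, channels, min_timescale=1.0, max_timescale=1.0e4):
  # Geometric sequence of channels // 2 timescales, then sin/cos concatenated.
  num_timescales = channels // 2
  log_timescale_increment = (np.log(max_timescale / min_timescale) /
                             (num_timescales - 1))
  inv_timescales = min_timescale * np.exp(
      np.arange(num_timescales) * -log_timescale_increment)
  scaled_time = np.arange(length)[:, None] * inv_timescales[None, :]
  return np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)

signal = sinusoid_signal_np(length=50, channels=8)  # shape (50, 8)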
Example #18
    def decayed_lr():
      """Helper to recompute learning rate; most helpful in eager-mode."""
      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      completed_fraction = global_step_recomp / decay_steps
      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
          constant_op.constant(math.pi) * completed_fraction))

      decayed = (1 - alpha) * cosine_decayed + alpha
      return math_ops.multiply(learning_rate, decayed)
Example #19
    def map(self, input_tensor):
        """Maps each row of input_tensor using random Fourier features.

    Args:
      input_tensor: a `Tensor` containing input features. Its shape is
      [batch_size, self._input_dim].

    Returns:
      A `Tensor` of shape [batch_size, self._output_dim] containing RFFM-mapped
      features.

    Raises:
      InvalidShapeError: if the shape of the `input_tensor` is inconsistent with
        expected input dimension.
    """
        input_tensor_shape = input_tensor.get_shape()
        if len(input_tensor_shape) != 2:
            raise dkm.InvalidShapeError(
                'The shape of the tensor should be 2. Got %d instead.' %
                len(input_tensor_shape))

        features_dim = input_tensor_shape[1]
        if features_dim != self._input_dim:
            raise dkm.InvalidShapeError(
                'Invalid dimension: expected %d input features, got %d instead.'
                % (self._input_dim, features_dim))

        # Add ops that compute (deterministically) omega_matrix and bias based on
        # the provided seed.
        # TODO (sibyl-vie3Poto): Storing the mapper's parameters (omega_matrix and bias) as id:626 gh:627
        # constants incurs no RPC calls to the parameter server during distributed
        # training. However, if the parameters grow too large (for instance if they
        # don't fit into memory or if they blow up the size of the GraphDef proto),
        # storing them as constants is no longer an option. In this case, we should
        # have a heuristic to choose one of the following alternatives:
        # a) store them as variables (in the parameter server)
        # b) store them as worker local variables
        # c) generate the omega matrix on the fly at each step
        np.random.seed(self._seed)
        omega_matrix_shape = [self._input_dim, self._output_dim]
        bias_shape = [self._output_dim]

        omega_matrix = constant_op.constant(np.random.normal(
            scale=1.0 / self._stddev, size=omega_matrix_shape),
                                            dtype=dtypes.float32)
        bias = constant_op.constant(np.random.uniform(low=0.0,
                                                      high=2 * np.pi,
                                                      size=bias_shape),
                                    dtype=dtypes.float32)

        x_omega_plus_bias = math_ops.add(
            math_ops.matmul(input_tensor, omega_matrix), bias)
        return math.sqrt(
            2.0 / self._output_dim) * math_ops.cos(x_omega_plus_bias)
Example #20
 def compute_step(warming_up=False):
     if warming_up:
         completed_fraction = global_step_recomp / warmup_steps
         gain = w_fac + (1 - w_fac) * completed_fraction
     else:
         completed_fraction = (global_step_recomp - warmup_steps
                               ) / (decay_steps - warmup_steps)
         cosine_decayed = 0.5 * (1.0 + math_ops.cos(
             constant_op.constant(math.pi) * completed_fraction))
         gain = (1 - self.alpha) * cosine_decayed + self.alpha
     return gain
Example #21
 def Test(self):
   np.random.seed(1)
   n = shape_[-1]
   batch_shape = shape_[:-2]
   np_dtype = dtype_.as_numpy_dtype
   a = np.random.uniform(
       low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
   if dtype_.is_complex:
     a += 1j * np.random.uniform(
         low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
   a += np.conj(a.T)
   a = np.tile(a, batch_shape + (1, 1))
   # Optimal stepsize for central difference is O(epsilon^{1/3}).
   epsilon = np.finfo(np_dtype).eps
   delta = 0.1 * epsilon**(1.0 / 3.0)
   # tolerance obtained by looking at actual differences using
   # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
   if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64):
     tol = 1e-2
   else:
     tol = 1e-7
   with self.session(use_gpu=True):
     tf_a = constant_op.constant(a)
     if compute_v_:
       tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
       # (complex) Eigenvectors are only unique up to an arbitrary phase
       # We normalize the vectors such that the first component has phase 0.
       top_rows = tf_v[..., 0:1, :]
       if tf_a.dtype.is_complex:
         angle = -math_ops.angle(top_rows)
         phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
       else:
         phase = math_ops.sign(top_rows)
       tf_v *= phase
       outputs = [tf_e, tf_v]
     else:
       tf_e = linalg_ops.self_adjoint_eigvals(tf_a)
       outputs = [tf_e]
     for b in outputs:
       x_init = np.random.uniform(
           low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
       if dtype_.is_complex:
         x_init += 1j * np.random.uniform(
             low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
       x_init += np.conj(x_init.T)
       x_init = np.tile(x_init, batch_shape + (1, 1))
       theoretical, numerical = gradient_checker.compute_gradient(
           tf_a,
           tf_a.get_shape().as_list(),
           b,
           b.get_shape().as_list(),
           x_init_value=x_init,
           delta=delta)
       self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
Example #22
  def map(self, input_tensor):
    """Maps each row of input_tensor using random Fourier features.

    Args:
      input_tensor: a `Tensor` containing input features. Its shape is
      [batch_size, self._input_dim].

    Returns:
      A `Tensor` of shape [batch_size, self._output_dim] containing RFFM-mapped
      features.

    Raises:
      InvalidShapeError: if the shape of the `input_tensor` is inconsistent with
        expected input dimension.
    """
    input_tensor_shape = input_tensor.get_shape()
    if len(input_tensor_shape) != 2:
      raise dkm.InvalidShapeError(
          'The shape of the tensor should be 2. Got %d instead.' %
          len(input_tensor_shape))

    features_dim = input_tensor_shape[1]
    if features_dim != self._input_dim:
      raise dkm.InvalidShapeError(
          'Invalid dimension: expected %d input features, got %d instead.' %
          (self._input_dim, features_dim))

    # Add ops that compute (deterministically) omega_matrix and bias based on
    # the provided seed.
    # TODO(sibyl-vie3Poto): Storing the mapper's parameters (omega_matrix and bias) as
    # constants incurs no RPC calls to the parameter server during distributed
    # training. However, if the parameters grow too large (for instance if they
    # don't fit into memory or if they blow up the size of the GraphDef proto),
    # storing them as constants is no longer an option. In this case, we should
    # have a heuristic to choose one of the following alternatives:
    # a) store them as variables (in the parameter server)
    # b) store them as worker local variables
    # c) generate the omega matrix on the fly at each step
    np.random.seed(self._seed)
    omega_matrix_shape = [self._input_dim, self._output_dim]
    bias_shape = [self._output_dim]

    omega_matrix = constant_op.constant(
        np.random.normal(
            scale=1.0 / self._stddev, size=omega_matrix_shape),
        dtype=dtypes.float32)
    bias = constant_op.constant(
        np.random.uniform(
            low=0.0, high=2 * np.pi, size=bias_shape),
        dtype=dtypes.float32)

    x_omega_plus_bias = math_ops.add(
        math_ops.matmul(input_tensor, omega_matrix), bias)
    return math.sqrt(2.0 / self._output_dim) * math_ops.cos(x_omega_plus_bias)
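Random Fourier features of this form approximate the Gaussian (RBF) kernel: for two inputs x and y, the dot product of their mapped rows tends to `exp(-||x - y||^2 / (2 * stddev^2))` as the output dimension grows. A NumPy sketch of that check (hypothetical helper, reusing the omega/bias construction above):

import numpy as np

def rff_map_np(x, input_dim, output_dim, stddev, seed=42):
  # Same construction as above: omega ~ N(0, 1/stddev), bias ~ U[0, 2*pi).
  rng = np.random.RandomState(seed)
  omega = rng.normal(scale=1.0 / stddev, size=[input_dim, output_dim])
  bias = rng.uniform(0.0, 2 * np.pi, size=[output_dim])
  return np.sqrt(2.0 / output_dim) * np.cos(x.dot(omega) + bias)

x = np.array([[1.0, 0.0]])
y = np.array([[0.0, 1.0]])
stddev = 1.0
mapped = rff_map_np(np.vstack([x, y]), 2, 20000, stddev)
approx = mapped[0].dot(mapped[1])
exact = np.exp(-np.sum((x - y) ** 2) / (2 * stddev ** 2))
print(approx, exact)  # both close to exp(-1) ~ 0.368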
Example #23
    def decayed_lr():
      """Helper to recompute learning rate; most helpful in eager-mode."""
      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))

      linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
      return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
Example #24
 def Compute(x):
   e, v = linalg_ops.self_adjoint_eig(x)
   # (complex) Eigenvectors are only unique up to an arbitrary phase
   # We normalize the vectors such that the first component has phase 0.
   top_rows = v[..., 0:1, :]
   if dtype_.is_complex:
     angle = -math_ops.angle(top_rows)
     phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
   else:
     phase = math_ops.sign(top_rows)
   v *= phase
   return e, v
Example #25
 def cosine_decay_fn(global_step):
   if global_step is None:
     raise ValueError("global_step is required for cosine_decay.")
   global_step = math_ops.minimum(global_step, decay_steps)
   completed_fraction = math_ops.to_float(global_step) / math_ops.to_float(
       decay_steps)
   fraction = 2.0 * num_periods * completed_fraction
   decayed = 0.5 * (
       1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
   if zero_after is not None:
     decayed = array_ops.where(
         math_ops.greater_equal(fraction, 2 * zero_after), 0.0, decayed)
   return decayed
Example #26
 def cosine_decay_fn(global_step):
     if global_step is None:
         raise ValueError("global_step is required for cosine_decay.")
     global_step = math_ops.minimum(global_step, decay_steps)
     completed_fraction = math_ops.to_float(
         global_step) / math_ops.to_float(decay_steps)
     fraction = 2.0 * num_periods * completed_fraction
     decayed = 0.5 * (
         1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
     if zero_after is not None:
         decayed = array_ops.where(
             math_ops.greater_equal(fraction, 2 * zero_after), 0.0, decayed)
     return decayed
Example #27
def get_rotation_matrix(angles, image_height, image_width, name=None):
    """Returns projective transform(s) for the given angle(s).

  Args:
    angles: A scalar angle to rotate all images by, or (for batches of images) a
      vector with an angle to rotate each image in the batch. The rank must be
      statically known (the shape is not `TensorShape(None)`).
    image_height: Height of the image(s) to be transformed.
    image_width: Width of the image(s) to be transformed.
    name: The name of the op.

  Returns:
    A tensor of shape (num_images, 8). Projective transforms which can be given
      to operation `image_projective_transform_v2`. If one row of transforms is
       [a0, a1, a2, b0, b1, b2, c0, c1], then it maps the *output* point
       `(x, y)` to a transformed *input* point
       `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
       where `k = c0 x + c1 y + 1`.
  """
    with ops.name_scope(name, 'rotation_matrix'):
        x_offset = ((image_width - 1) -
                    (math_ops.cos(angles) *
                     (image_width - 1) - math_ops.sin(angles) *
                     (image_height - 1))) / 2.0
        y_offset = ((image_height - 1) -
                    (math_ops.sin(angles) *
                     (image_width - 1) + math_ops.cos(angles) *
                     (image_height - 1))) / 2.0
        num_angles = array_ops.shape(angles)[0]
        return array_ops.concat(values=[
            math_ops.cos(angles)[:, None],
            -math_ops.sin(angles)[:, None],
            x_offset[:, None],
            math_ops.sin(angles)[:, None],
            math_ops.cos(angles)[:, None],
            y_offset[:, None],
            array_ops.zeros((num_angles, 2), dtypes.float32),
        ],
                                axis=1)
Example #28
    def _add_sinusoids_signal(x, time, min_timescale=1.0, max_timescale=1.0e4):
        """Adds a bunch of sinusoids of different frequencies to a Tensor.

        Each channel of the input Tensor is incremented by a sinusoid of a different
        frequency and phase.

        This allows attention to learn to use absolute and relative positions.
        Timing signals should be added to some precursors of both the query and the
        memory inputs to attention.

        The use of relative position is possible because sin(x+y) and cos(x+y) can be
        expressed in terms of y, sin(x) and cos(x).

        In particular, we use a geometric sequence of timescales starting with
        min_timescale and ending with max_timescale.  The number of different
        timescales is equal to channels / 2. For each timescale, we
        generate the two sinusoidal signals sin(timestep/timescale) and
        cos(timestep/timescale).  All of these sinusoids are concatenated in
        the channels dimension.

        Args:
          x: a Tensor with shape [batch, length, channels]
          time: a scalar timestep used to position the signal when `x` has rank 2
          min_timescale: a float
          max_timescale: a float

        Returns:
          a Tensor the same shape as x.
        """
        channels = x.get_shape().as_list()[-1]
        if x.get_shape().ndims == 3:  # [batch_size, timesteps, dim]
            length = array_ops.shape(x)[1]
            position = math_ops.to_float(math_ops.range(length))
        elif x.get_shape().ndims == 2:  # [batch_size, dim]
            length = 1
            position = math_ops.to_float(math_ops.range(time, time + 1))
        else:
            raise ValueError("need a Tensor with rank 2 or 3")
        num_timescales = channels // 2
        log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            (math_ops.to_float(num_timescales) - 1))
        inv_timescales = min_timescale * math_ops.exp(
            math_ops.to_float(math_ops.range(num_timescales)) * -log_timescale_increment)
        scaled_time = array_ops.expand_dims(position, 1) * array_ops.expand_dims(inv_timescales, 0)
        signal = array_ops.concat([math_ops.sin(scaled_time), math_ops.cos(scaled_time)], axis=1)
        signal = array_ops.pad(signal, [[0, 0], [0, math_ops.mod(channels, 2)]])
        if x.get_shape().ndims == 3:
            signal = array_ops.reshape(signal, [1, length, channels])
        else:
            signal = array_ops.reshape(signal, [1, channels])
        return x + signal
Example #29
  def __call__(self, step):
    with ops.name_scope_v2(self.name or "CosineDecay"):
      initial_learning_rate = ops.convert_to_tensor_v2_with_dispatch(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      decay_steps = math_ops.cast(self.decay_steps, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      completed_fraction = global_step_recomp / decay_steps
      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
          constant_op.constant(math.pi) * completed_fraction))

      decayed = (1 - self.alpha) * cosine_decayed + self.alpha
      return math_ops.multiply(initial_learning_rate, decayed)
Example #30
  def __call__(self, step):
    with ops.name_scope_v2(self.name or "CosineDecay"):
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      decay_steps = math_ops.cast(self.decay_steps, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      completed_fraction = global_step_recomp / decay_steps
      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
          constant_op.constant(math.pi) * completed_fraction))

      decayed = (1 - self.alpha) * cosine_decayed + self.alpha
      return math_ops.multiply(initial_learning_rate, decayed)
Example #31
 def _sample_n(self, n, seed=None):
     # We use 2 uniform random floats to generate polar random variates.
     # http://dl.acm.org/citation.cfm?id=179631
     # Theorem 2. Let G, H be iid variates, uniformly distributed on [0,1].
     # Let theta = 2*pi*H, let R = sqrt(df*(G^(-2/df) - 1)) for df > 0.
     # Let X = R*cos(theta), and let Y = R*sin(theta).
     # Then X ~ t_df and Y ~ t_df.
     # The variates X and Y are not independent.
     shape = array_ops.concat(0, ([2, n], self.batch_shape()))
     uniform = random_ops.random_uniform(shape=shape, dtype=self.dtype, seed=seed)
     samples_g, samples_h = array_ops.unpack(uniform, num=2)
     theta = (2.0 * math.pi) * samples_h
     r = math_ops.sqrt(self.df * (math_ops.pow(samples_g, -2 / self.df) - 1))
     samples = r * math_ops.cos(theta)
     return samples * self.sigma + self.mu
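The polar construction in the comment can be exercised directly with NumPy. A hedged sketch (hypothetical helper) that draws standard Student-t variates and compares the sample variance against the theoretical `df / (df - 2)`:

import numpy as np

def student_t_polar(df, n, seed=0):
  # Polar method from the comment: G, H ~ U(0, 1), theta = 2*pi*H,
  # R = sqrt(df * (G**(-2/df) - 1)), X = R * cos(theta) ~ t_df.
  rng = np.random.RandomState(seed)
  g = rng.uniform(size=n)
  h = rng.uniform(size=n)
  r = np.sqrt(df * (g ** (-2.0 / df) - 1.0))
  return r * np.cos(2.0 * np.pi * h)

df = 5.0
samples = student_t_polar(df, 200000)
print(samples.var(), df / (df - 2.0))  # both ~1.67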
Example #32
 def restart_decay_fn(global_step):
     if global_step is None:
         raise ValueError("global_step is required for cosine_decay.")
     global_step = math_ops.minimum(global_step, decay_steps)
     num = math_ops.mod(num_periods * math_ops.to_float(global_step),
                        decay_steps)
     fraction = num / math_ops.to_float(decay_steps)
     decayed = 0.5 * (
         1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
     if zero_after is not None:
         tmp = math_ops.to_float(
             num_periods * global_step) / math_ops.to_float(decay_steps)
         decayed = array_ops.where(math_ops.greater_equal(tmp, zero_after),
                                   0.0, decayed)
     return decayed
Example #33
 def restart_decay_fn(global_step):
   if global_step is None:
     raise ValueError("global_step is required for cosine_decay.")
   global_step = math_ops.minimum(global_step, decay_steps)
   num = math_ops.mod(num_periods * math_ops.to_float(global_step),
                      decay_steps)
   fraction = num / math_ops.to_float(decay_steps)
   decayed = 0.5 * (
       1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
   if zero_after is not None:
     tmp = math_ops.to_float(
         num_periods * global_step) / math_ops.to_float(decay_steps)
     decayed = array_ops.where(
         math_ops.greater_equal(tmp, zero_after), 0.0, decayed)
   return decayed
Example #34
def getRotatePoint(map_shape, rotate_center, rotate_theta, origin_point):
    """
    实现功能,得到绕旋转中心旋转theta角度后的坐标
    :param map_shape:原始地图的尺寸,因为Image中的坐标原点在图片左上角,需要改变坐标系    Tensor-[height,width,channel]
    :param rotate_center:旋转中心   Tensor-[loc_x,loc_y]
    :param rotate_theta:旋转角度   Tensor-[theta]
    :param origin_point:需要进行旋转操作的点集 Tensor-[loc_x,loc_y]
    :return: rotate_point_list: Tensor-[loc_x,loc_y]
    """
    row = map_shape[0]
    center_x = rotate_center[0]
    center_y = row - rotate_center[1]
    point_x = origin_point[0]
    point_y = row - origin_point[1]

    after_rotate_x = math_ops.round(
        (point_x - center_x) * math_ops.cos(rotate_theta) -
        (point_y - center_y) * math_ops.sin(rotate_theta) + center_x)
    after_rotate_y = row - math_ops.round(
        (point_x - center_x) * math_ops.sin(rotate_theta) +
        (point_y - center_y) * math_ops.cos(rotate_theta) + center_y)
    rotate_point = [after_rotate_x, after_rotate_y]
    rotate_point = tf.reshape(rotate_point, [2])
    return rotate_point
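A quick NumPy check of the same rotation (hypothetical helper): with a 100x100 map, rotating the point one pixel to the right of the center by 90 degrees lands one pixel above it once the flipped image y-axis is taken into account.

import numpy as np

def rotate_point_np(map_shape, rotate_center, rotate_theta, origin_point):
  # Same math as above: flip y (image origin is top-left), rotate, flip back.
  row = map_shape[0]
  cx, cy = rotate_center[0], row - rotate_center[1]
  px, py = origin_point[0], row - origin_point[1]
  x = np.round((px - cx) * np.cos(rotate_theta) -
               (py - cy) * np.sin(rotate_theta) + cx)
  y = row - np.round((px - cx) * np.sin(rotate_theta) +
                     (py - cy) * np.cos(rotate_theta) + cy)
  return np.array([x, y])

# 100x100 map, center (50, 50), point (51, 50), rotate by 90 degrees.
print(rotate_point_np([100, 100, 3], [50, 50], np.pi / 2, [51, 50]))  # [50., 49.]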
Example #35
  def decayed_lr(learning_rate, global_step, decay_steps, alpha, name):
    """Helper to recompute learning rate; most helpful in eager-mode."""
    with ops.name_scope(name, "CosineDecay",
                        [learning_rate, global_step]) as name:
      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
      dtype = learning_rate.dtype
      decay_steps = math_ops.cast(decay_steps, dtype)

      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      completed_fraction = global_step_recomp / decay_steps
      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
          constant_op.constant(math.pi) * completed_fraction))

      decayed = (1 - alpha) * cosine_decayed + alpha
      return math_ops.multiply(learning_rate, decayed)
Example #37
 def f(x, order):
     with backprop.GradientTape(persistent=persistent) as tape:
         tape.watch(x)
         # Note that having a tape active, even if we don't use it, forces us
         # down a different function call path. Symbolic gradients should work
         # here too; correctness of tape gradients are tested elsewhere.
         y = def_function.function(lambda: math_ops.cos(x))()
     tape_dy = tape.gradient(y, x)
     for _ in range(order):
         y, = gradients_impl.gradients(y, [x])
     if order > 0:
         y1 = tape_dy
         for _ in range(order - 1):
             y1, = gradients_impl.gradients(y1, [x])
     else:
         y1 = y
     return y, y1
Example #38
      def Compute(x):
        e, v = linalg_ops.eig(x)

        # We sort eigenvalues by e.real+e.imag to have consistent
        # order between runs
        b_dims = len(e.shape) - 1
        idx = sort_ops.argsort(math_ops.real(e) + math_ops.imag(e), axis=-1)
        e = array_ops.gather(e, idx, batch_dims=b_dims)
        v = array_ops.gather(v, idx, batch_dims=b_dims)

        # (complex) Eigenvectors are only unique up to an arbitrary phase
        # We normalize the vectors such that the first component has phase 0.
        top_rows = v[..., 0:1, :]
        angle = -math_ops.angle(top_rows)
        phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
        v *= phase
        return e, v
Example #39
 def _sample_n(self, n, seed=None):
     # We use 2 uniform random floats to generate polar random variates.
     # http://dl.acm.org/citation.cfm?id=179631
     # Theorem 2. Let G, H be iid variates, uniformly distributed on [0,1].
     # Let theta = 2*pi*H, let R = sqrt(df*(G^(-2/df) - 1)) for df > 0.
     # Let X = R*cos(theta), and let Y = R*sin(theta).
     # Then X ~ t_df and Y ~ t_df.
     # The variates X and Y are not independent.
     shape = array_ops.concat(0, ([2, n], self.batch_shape()))
     uniform = random_ops.random_uniform(shape=shape,
                                         dtype=self.dtype,
                                         seed=seed)
     samples_g, samples_h = array_ops.unpack(uniform, num=2)
     theta = (2. * math.pi) * samples_h
     r = math_ops.sqrt(self.df *
                       (math_ops.pow(samples_g, -2 / self.df) - 1))
     samples = r * math_ops.cos(theta)
     return samples * self.sigma + self.mu
Example #40
def _raised_cosine_window(name, default_name, window_length, periodic, dtype,
                          a, b):
    """Helper function for computing a raised cosine window.

  Args:
    name: Name to use for the scope.
    default_name: Default name to use for the scope.
    window_length: A scalar `Tensor` or integer indicating the window length.
    periodic: A bool `Tensor` indicating whether to generate a periodic or
      symmetric window.
    dtype: A floating point `DType`.
    a: The alpha parameter to the raised cosine window.
    b: The beta parameter to the raised cosine window.

  Returns:
    A `Tensor` of shape `[window_length]` of type `dtype`.

  Raises:
    ValueError: If `dtype` is not a floating point type or `window_length` is
      not scalar or `periodic` is not scalar.
  """
    if not dtype.is_floating:
        raise ValueError('dtype must be a floating point type. Found %s' %
                         dtype)

    with ops.name_scope(name, default_name, [window_length, periodic]):
        window_length = ops.convert_to_tensor(window_length,
                                              dtype=dtypes.int32,
                                              name='window_length')
        window_length.shape.assert_has_rank(0)
        periodic = math_ops.cast(
            ops.convert_to_tensor(periodic, dtype=dtypes.bool,
                                  name='periodic'), dtypes.int32)
        periodic.shape.assert_has_rank(0)
        even = 1 - math_ops.mod(window_length, 2)

        n = math_ops.cast(window_length + periodic * even - 1, dtype=dtype)
        count = math_ops.cast(math_ops.range(window_length), dtype)
        cos_arg = constant_op.constant(2 * np.pi, dtype=dtype) * count / n

        return control_flow_ops.cond(
            math_ops.equal(window_length,
                           1), lambda: array_ops.ones([1], dtype=dtype),
            lambda: math_ops.cast(a - b * math_ops.cos(cos_arg), dtype=dtype))
Example #41
    def decayed_lr(learning_rate, global_step, first_decay_steps, t_mul, m_mul,
                   alpha, name):
        """Helper to recompute learning rate; most helpful in eager-mode."""
        with ops.name_scope(name, "SGDRDecay",
                            [learning_rate, global_step]) as name:
            learning_rate = ops.convert_to_tensor(learning_rate,
                                                  name="initial_learning_rate")
            dtype = learning_rate.dtype
            first_decay_steps = math_ops.cast(first_decay_steps, dtype)
            alpha = math_ops.cast(alpha, dtype)
            t_mul = math_ops.cast(t_mul, dtype)
            m_mul = math_ops.cast(m_mul, dtype)

            global_step_recomp = math_ops.cast(global_step, dtype)
            completed_fraction = global_step_recomp / first_decay_steps

            def compute_step(completed_fraction, geometric=False):
                """Helper for `cond` operation."""
                if geometric:
                    i_restart = math_ops.floor(
                        math_ops.log(1.0 - completed_fraction *
                                     (1.0 - t_mul)) / math_ops.log(t_mul))

                    sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
                    completed_fraction = (completed_fraction -
                                          sum_r) / t_mul**i_restart

                else:
                    i_restart = math_ops.floor(completed_fraction)
                    completed_fraction -= i_restart

                return i_restart, completed_fraction

            i_restart, completed_fraction = control_flow_ops.cond(
                math_ops.equal(t_mul, 1.0),
                lambda: compute_step(completed_fraction, geometric=False),
                lambda: compute_step(completed_fraction, geometric=True))

            m_fac = m_mul**i_restart
            cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
                constant_op.constant(math.pi) * completed_fraction))
            decayed = (1 - alpha) * cosine_decayed + alpha

            return math_ops.multiply(learning_rate, decayed, name=name)
Example #42
        def scaled_add_op(x, scale, y):
            cwd = os.getcwd()
            outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [tensor_shape.TensorShape([SIZE])],
            }
            base_dir = os.path.join(cwd, "tensorflow/python/ipu")
            gp_path = os.path.join(base_dir,
                                   "tests/add_scaled_vector_add_codelet.cc")
            lib_path = os.path.join(base_dir,
                                    "libadd_partial_gradients_custom.so")

            return ipu.custom_ops.precompiled_user_op(
                [x, scale, y, math_ops.cos(x),
                 math_ops.cosh(y)],
                lib_path,
                gp_path,
                outs=outputs,
                inputs_with_gradients=[0, 2])
Example No. 43
 def _NormalizingSvd(tf_a):
   tf_s, tf_u, tf_v = linalg_ops.svd(tf_a, compute_uv=True, full_matrices=True)
   # Singular vectors are only unique up to an arbitrary phase. We normalize
   # the vectors such that the first component of u (if m >=n) or v (if n > m)
   # have phase 0.
   m = tf_a.shape[-2]
   n = tf_a.shape[-1]
   if m >= n:
     top_rows = tf_u[..., 0:1, :]
   else:
     top_rows = tf_v[..., 0:1, :]
   if tf_u.dtype.is_complex:
     angle = -math_ops.angle(top_rows)
     phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
   else:
     phase = math_ops.sign(top_rows)
   tf_u *= phase[..., :m]
   tf_v *= phase[..., :n]
   return tf_s, tf_u, tf_v
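A minimal NumPy sketch of the same phase normalization for a single (non-batched) matrix; the helper name and the use of `np.linalg.svd` are illustrative. Multiplying a column of u and the matching column of v by the same unit phase leaves u @ diag(s) @ v^H unchanged:

import numpy as np

def normalizing_svd(a):
    u, s, vh = np.linalg.svd(a, full_matrices=True)
    v = vh.conj().T  # columns of v are the right singular vectors, as in tf_v
    m, n = a.shape
    top_row = u[0:1, :] if m >= n else v[0:1, :]
    if np.iscomplexobj(u):
        # Rotate each singular-vector pair so the chosen top row is real >= 0.
        phase = np.exp(-1j * np.angle(top_row))
    else:
        phase = np.sign(top_row)
    u = u * phase[:, :m]
    v = v * phase[:, :n]
    return s, u, v

s, u, v = normalizing_svd(np.random.randn(4, 3))
print(u[0])  # the first row of u is now non-negative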
Example No. 44
def cosine_decay(learning_rate, global_step, maximum_steps, name=None):
    """
  """
    from tensorflow.python.ops import math_ops
    from tensorflow.python.framework import ops

    if global_step is None:
        raise ValueError("global_step is required for cosine_decay.")
    with ops.name_scope(name, "CosineDecay",
                        [learning_rate, global_step, maximum_steps]) as name:
        learning_rate = ops.convert_to_tensor(learning_rate,
                                              name="learning_rate")
        dtype = learning_rate.dtype
        global_step = math_ops.cast(global_step, dtype)
        maximum_steps = math_ops.cast(maximum_steps, dtype)

        p = math_ops.mod(global_step / maximum_steps, 1)

        return learning_rate * (0.5 + 0.5 * math_ops.cos(p * np.pi))
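A quick NumPy check of the cyclic schedule above: the rate starts at `learning_rate`, decays toward zero within each period of `maximum_steps`, and snaps back at every period boundary. The values below are illustrative:

import numpy as np

def cyclic_cosine(step, learning_rate=0.1, maximum_steps=1000):
    p = np.mod(step / maximum_steps, 1.0)
    return learning_rate * (0.5 + 0.5 * np.cos(p * np.pi))

print(cyclic_cosine(0))     # 0.1
print(cyclic_cosine(500))   # 0.05, halfway through the period
print(cyclic_cosine(1000))  # back to 0.1 at the period boundary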
Example No. 45
  def decayed_lr(learning_rate, global_step, first_decay_steps, t_mul, m_mul,
                 alpha, name):
    """Helper to recompute learning rate; most helpful in eager-mode."""
    with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]
                       ) as name:
      learning_rate = ops.convert_to_tensor(
          learning_rate, name="initial_learning_rate")
      dtype = learning_rate.dtype
      first_decay_steps = math_ops.cast(first_decay_steps, dtype)
      alpha = math_ops.cast(alpha, dtype)
      t_mul = math_ops.cast(t_mul, dtype)
      m_mul = math_ops.cast(m_mul, dtype)

      global_step_recomp = math_ops.cast(global_step, dtype)
      completed_fraction = global_step_recomp / first_decay_steps

      def compute_step(completed_fraction, geometric=False):
        """Helper for `cond` operation."""
        if geometric:
          i_restart = math_ops.floor(
              math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
              math_ops.log(t_mul))

          sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
          completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart

        else:
          i_restart = math_ops.floor(completed_fraction)
          completed_fraction -= i_restart

        return i_restart, completed_fraction

      i_restart, completed_fraction = control_flow_ops.cond(
          math_ops.equal(t_mul, 1.0),
          lambda: compute_step(completed_fraction, geometric=False),
          lambda: compute_step(completed_fraction, geometric=True))

      m_fac = m_mul**i_restart
      cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
          constant_op.constant(math.pi) * completed_fraction))
      decayed = (1 - alpha) * cosine_decayed + alpha

      return math_ops.multiply(learning_rate, decayed, name=name)
Example No. 46
    def __call__(self, step):
        with ops.name_scope_v2(self.name or "SGDRDecay") as name:
            initial_learning_rate = ops.convert_to_tensor_v2(
                self.initial_learning_rate, name="initial_learning_rate")
            dtype = initial_learning_rate.dtype
            first_decay_steps = math_ops.cast(self.first_decay_steps, dtype)
            k_decay = math_ops.cast(self.k_decay, dtype)
            alpha = math_ops.cast(self.alpha, dtype)
            t_mul = math_ops.cast(self._t_mul, dtype)
            m_mul = math_ops.cast(self._m_mul, dtype)

            global_step_recomp = math_ops.cast(step, dtype)
            completed_fraction = global_step_recomp / first_decay_steps

            def compute_step(completed_fraction, geometric=False):
                """Helper for `cond` operation."""
                if geometric:
                    i_restart = math_ops.floor(
                        math_ops.log(1.0 - completed_fraction *
                                     (1.0 - t_mul)) / math_ops.log(t_mul))

                    sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
                    completed_fraction = (completed_fraction -
                                          sum_r) / t_mul**i_restart

                else:
                    i_restart = math_ops.floor(completed_fraction)
                    completed_fraction -= i_restart

                return i_restart, completed_fraction

            i_restart, completed_fraction = control_flow_ops.cond(
                math_ops.equal(t_mul, 1.0),
                lambda: compute_step(completed_fraction, geometric=False),
                lambda: compute_step(completed_fraction, geometric=True))

            m_fac = m_mul**i_restart
            cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
                constant_op.constant(math.pi) * math_ops.pow(completed_fraction, k_decay)))
            decayed = (1 - alpha) * cosine_decayed + alpha

            return math_ops.multiply(initial_learning_rate, decayed, name=name)
Example No. 47
    def sample(self, n, seed=None, name="sample"):
        """Sample `n` observations from the Student t Distributions.

    Args:
      n: `Scalar`, type int32, the number of observations to sample.
      seed: Python integer, the random seed.
      name: The name to give this op.

    Returns:
      samples: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape`
          with values of type `self.dtype`.
    """
        with ops.op_scope([self._df, self._mu, self._sigma, n], self.name):
            with ops.name_scope(name):
                n = ops.convert_to_tensor(n, name="n")
                n_val = tensor_util.constant_value(n)

                # We use 2 uniform random floats to generate polar random variates.
                # http://dl.acm.org/citation.cfm?id=179631
                # Theorem 2. Let G, H be iid variates, uniformly distributed on [0,1].
                # Let theta = 2*pi*H, let R = sqrt(df*(G^(-2/df) - 1)) for df > 0.
                # Let X = R*cos(theta), and let Y = R*sin(theta).
                # Then X ~ t_df and Y ~ t_df.
                # The variates X and Y are not independent.
                shape = array_ops.concat(
                    0, [array_ops.pack([2, n]),
                        self.batch_shape()])
                uniform = random_ops.random_uniform(shape=shape,
                                                    dtype=self.dtype,
                                                    seed=seed)
                samples_g, samples_h = array_ops.unpack(uniform, num=2)
                theta = (2 * np.pi) * samples_h
                r = math_ops.sqrt(self._df *
                                  (math_ops.pow(samples_g, -2 / self._df) - 1))
                samples = r * math_ops.cos(theta)

                # Provide some hints to shape inference
                inferred_shape = tensor_shape.vector(n_val).concatenate(
                    self.get_batch_shape())
                samples.set_shape(inferred_shape)

                return samples * self._sigma + self._mu
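A minimal NumPy sketch of the same polar construction for scalar `df`, `mu`, and `sigma`; the sample size and seed are illustrative:

import numpy as np

def sample_student_t(n, df=3.0, mu=0.0, sigma=1.0, seed=0):
    rng = np.random.default_rng(seed)
    g = rng.uniform(size=n)
    h = rng.uniform(size=n)
    theta = 2.0 * np.pi * h                     # uniform angle
    r = np.sqrt(df * (g ** (-2.0 / df) - 1.0))  # radial part, requires df > 0
    return r * np.cos(theta) * sigma + mu       # X = R*cos(theta) ~ t_df

samples = sample_student_t(100_000)
print(samples.var())  # should be close to df / (df - 2) = 3 for df = 3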
Example No. 48
    def RFF_map(self, input_tensor, seed, stddev, output_dim):
        """
        Refer to the scikit-learn "RFF sampler" (RBFSampler) and the TensorFlow RFF mapping.
        """

        random_state = check_random_state(seed)
        gamma = stddev
        omega_matrix_shape = [3072, output_dim]
        bias_shape = [output_dim]
        """
        Tensorflow Version is elaborated below:

        np.random.seed(9)
        self._stddev = stddev
        omega_matrix_shape = [self.arg.dim*2, output_dim]
        bias_shape = [output_dim]

        omega_matrix = constant_op.constant(
            np.random.normal(
            scale=1.0 / self._stddev, size=omega_matrix_shape),
            dtype=dtypes.float32)

        bias = constant_op.constant(
            np.random.uniform(
            low=0.0, high=2 * np.pi, size=bias_shape),
            dtype=dtypes.float32)

        x_omega_plus_bias = math_ops.add(
            math_ops.matmul(input_tensor, omega_matrix), bias)
        """

        omega_matrix = constant_op.constant(
            np.sqrt(2 * gamma) * random_state.normal(size=omega_matrix_shape),
            dtype=dtypes.float32)

        bias = constant_op.constant(
            random_state.uniform(0.0, 2 * np.pi, size=bias_shape),
            dtype=dtypes.float32)

        x_omega_plus_bias = math_ops.add(
            math_ops.matmul(input_tensor, omega_matrix), bias)

        return math.sqrt(2.0 / output_dim) * math_ops.cos(x_omega_plus_bias)
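A minimal NumPy sketch of this mapping, with a quick check that the inner product of two mapped vectors approximates the RBF kernel exp(-gamma * ||x - y||^2); the dimensions, gamma, and seed below are illustrative:

import numpy as np

def rff_map(x, gamma=0.5, output_dim=4096, seed=9):
    rng = np.random.RandomState(seed)
    d = x.shape[-1]
    omega = np.sqrt(2.0 * gamma) * rng.normal(size=(d, output_dim))
    bias = rng.uniform(0.0, 2.0 * np.pi, size=output_dim)
    return np.sqrt(2.0 / output_dim) * np.cos(x @ omega + bias)

x = np.array([[0.1, 0.2, 0.3]])
y = np.array([[0.3, 0.1, 0.0]])
approx = (rff_map(x) @ rff_map(y).T).item()
exact = np.exp(-0.5 * np.sum((x - y) ** 2))  # gamma = 0.5
print(approx, exact)  # the two values should be close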
Example No. 49
  def sample(self, n, seed=None, name="sample"):
    """Sample `n` observations from the Student t Distributions.

    Args:
      n: `Scalar`, type int32, the number of observations to sample.
      seed: Python integer, the random seed.
      name: The name to give this op.

    Returns:
      samples: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape`
          with values of type `self.dtype`.
    """
    with ops.name_scope(self.name):
      with ops.op_scope([self._df, self._mu, self._sigma, n], name):
        n = ops.convert_to_tensor(n, name="n")
        n_val = tensor_util.constant_value(n)

        # We use 2 uniform random floats to generate polar random variates.
        # http://dl.acm.org/citation.cfm?id=179631
        # Theorem 2. Let G, H be iid variates, uniformly distributed on [0,1].
        # Let theta = 2*pi*H, let R = sqrt(df*(G^(-2/df) - 1)) for df > 0.
        # Let X = R*cos(theta), and let Y = R*sin(theta).
        # Then X ~ t_df and Y ~ t_df.
        # The variates X and Y are not independent.
        shape = array_ops.concat(0, [array_ops.pack([2, n]),
                                     self.batch_shape()])
        uniform = random_ops.random_uniform(shape=shape,
                                            dtype=self.dtype,
                                            seed=seed)
        samples_g, samples_h = array_ops.unpack(uniform, num=2)
        theta = (2 * np.pi) * samples_h
        r = math_ops.sqrt(self._df *
                          (math_ops.pow(samples_g, -2 / self._df) - 1))
        samples = r * math_ops.cos(theta)

        # Provide some hints to shape inference
        inferred_shape = tensor_shape.vector(n_val).concatenate(
            self.get_batch_shape())
        samples.set_shape(inferred_shape)

        return samples * self._sigma + self._mu
Example No. 50
    def decayed_lr():
      """Helper to recompute learning rate; most helpful in eager-mode."""
      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      variance = initial_variance / (
          math_ops.pow(1.0 + global_step_recomp, variance_decay))
      std = math_ops.sqrt(variance)
      noisy_linear_decayed = (
          linear_decayed + random_ops.random_normal(
              linear_decayed.shape, stddev=std))

      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
      noisy_linear_cosine_decayed = (
          (alpha + noisy_linear_decayed) * cosine_decayed + beta)

      return math_ops.multiply(
          learning_rate, noisy_linear_cosine_decayed, name=name)
Example No. 51
  def __call__(self, step):
    with ops.name_scope_v2(self.name or "LinearCosineDecay") as name:
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      decay_steps = math_ops.cast(self.decay_steps, dtype)
      num_periods = math_ops.cast(self.num_periods, dtype)
      alpha = math_ops.cast(self.alpha, dtype)
      beta = math_ops.cast(self.beta, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))

      linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
      return math_ops.multiply(initial_learning_rate, linear_cosine_decayed,
                               name=name)
Example No. 52
  def __call__(self, step):
    with ops.name_scope_v2(self.name or "SGDRDecay") as name:
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      first_decay_steps = math_ops.cast(self.first_decay_steps, dtype)
      alpha = math_ops.cast(self.alpha, dtype)
      t_mul = math_ops.cast(self._t_mul, dtype)
      m_mul = math_ops.cast(self._m_mul, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      completed_fraction = global_step_recomp / first_decay_steps

      def compute_step(completed_fraction, geometric=False):
        """Helper for `cond` operation."""
        if geometric:
          i_restart = math_ops.floor(
              math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
              math_ops.log(t_mul))

          sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
          completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart

        else:
          i_restart = math_ops.floor(completed_fraction)
          completed_fraction -= i_restart

        return i_restart, completed_fraction

      i_restart, completed_fraction = control_flow_ops.cond(
          math_ops.equal(t_mul, 1.0),
          lambda: compute_step(completed_fraction, geometric=False),
          lambda: compute_step(completed_fraction, geometric=True))

      m_fac = m_mul**i_restart
      cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
          constant_op.constant(math.pi) * completed_fraction))
      decayed = (1 - alpha) * cosine_decayed + alpha

      return math_ops.multiply(initial_learning_rate, decayed, name=name)
Example No. 53
  def decayed_lr(learning_rate, global_step, decay_steps, num_periods, alpha,
                 beta, name):
    """Helper to recompute learning rate; most helpful in eager-mode."""
    with ops.name_scope(name, "LinearCosineDecay",
                        [learning_rate, global_step]) as name:
      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
      dtype = learning_rate.dtype
      decay_steps = math_ops.cast(decay_steps, dtype)
      num_periods = math_ops.cast(num_periods, dtype)
      alpha = math_ops.cast(alpha, dtype)
      beta = math_ops.cast(beta, dtype)

      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))

      linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
      return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
Example No. 54
def get_multi_engine_graph_def(mode="FP32"):
  """Create a simple graph and return its graph_def."""
  dtype = dtypes.float16 if mode.upper() == "FP16" else dtypes.float32

  g = ops.Graph()
  with g.as_default():
    x = aops.placeholder(shape=[None, 3, 7, 5], name="input", dtype=dtype)
    with g.name_scope("Global_scope"):
      with g.name_scope("first_scope"):
        e = cop.constant(
            np.random.randn(3, 2, 3, 4), name="weights", dtype=dtype)
        conv = nn.conv2d(
            input=x,
            filter=e,
            data_format="NCHW",
            strides=[1, 1, 1, 1],
            padding="VALID",
            name="conv")
        b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias1", dtype=dtype)
        t = conv * b

        b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias2", dtype=dtype)
        q = conv / b
      edge = mops.sin(q)
      edge1 = mops.cos(conv)
      with g.name_scope("test_scope"):
        de = edge + edge1
        t -= edge1
        q *= edge
        t += q
        t -= de
    k = aops.squeeze(t, name="output")
  print(k.dtype)
  return g.as_graph_def()
Example No. 55
    def _sine_discontinuity(value):
      """A special case for dealing with discontinuities.

      Decides whether `value` is close to an integer, and if so computes:

        lim x->n |sin(x * pi)| / sin(x * pi) = sign(sin(n * pi))
                                             = cos(n * pi)

      Args:
        value: The floating point Tensor value which may lead to a
            discontinuity.
      Returns:
        A tuple of (is_discontinuous, sign):
          is_discontinuous: A boolean Tensor of the same shape as `value`,
              indicating whether it is near an integer.
          sign: A floating point Tensor indicating the sign of the discontinuity
            (being near 1 or -1 when `is_discontinuous` is True), of the same
            shape and type as `value`.
      """
      normalized = value / num_latent_values_float
      is_discontinuous = self._close_to_integer(normalized)
      sign = math_ops.cos(normalized * numpy.pi)
      return is_discontinuous, sign
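A quick numeric check of the limit quoted in the docstring, approaching the integer from the right; the integers and the offset below are illustrative:

import numpy as np

for n in (2.0, 3.0):
    x = n + 1e-6
    ratio = np.abs(np.sin(x * np.pi)) / np.sin(x * np.pi)
    # ratio matches cos(n*pi); approaching from the left flips the sign.
    print(ratio, np.cos(n * np.pi))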
Example No. 56
def _SinGrad(op, grad):
  """Returns grad * cos(x)."""
  x = op.inputs[0]
  with ops.control_dependencies([grad]):
    x = math_ops.conj(x)
    return grad * math_ops.cos(x)
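A quick finite-difference check of the identity behind this gradient, d/dx sin(x) = cos(x); the test point and step size are illustrative:

import numpy as np

x, eps = 0.7, 1e-6
finite_diff = (np.sin(x + eps) - np.sin(x - eps)) / (2.0 * eps)
print(finite_diff, np.cos(x))  # the two values agree to roughly 1e-10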
Example No. 57
def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
                              initial_variance=1.0, variance_decay=0.55,
                              num_periods=0.5, alpha=0.0, beta=0.001,
                              name=None):
  """Applies noisy linear cosine decay to the learning rate.

  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
  https://arxiv.org/abs/1709.07417

  Note that linear cosine decay is more aggressive than cosine decay and
  larger initial learning rates can typically be used.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies a noisy linear
  cosine decay function to a provided initial learning rate.
  It requires a `global_step` value to compute the decayed learning rate.
  You can just pass a TensorFlow variable that you increment at each
  training step.

  The function returns the decayed learning rate.  It is computed as:
  ```python
  global_step = min(global_step, decay_steps)
  linear_decay = (decay_steps - global_step) / decay_steps
  cosine_decay = 0.5 * (
      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
  decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
  decayed_learning_rate = learning_rate * decayed
  ```
  where eps_t is 0-centered gaussian noise with variance
  initial_variance / (1 + global_step) ** variance_decay

  Example usage:
  ```python
  decay_steps = 1000
  lr_decayed = noisy_linear_cosine_decay(
    learning_rate, global_step, decay_steps)
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
      The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Number of steps to decay over.
    initial_variance: initial variance for the noise. See computation above.
    variance_decay: decay for the noise's variance. See computation above.
    num_periods: Number of periods in the cosine part of the decay.
      See computation above.
    alpha: See computation above.
    beta: See computation above.
    name: String.  Optional name of the operation.  Defaults to
      'NoisyLinearCosineDecay'.
  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("noisy linear cosine decay requires global_step")
  with ops.name_scope(name, "NoisyLinearCosineDecay",
                      [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    global_step = math_ops.minimum(global_step, decay_steps)
    initial_variance = math_ops.cast(initial_variance, dtype)
    variance_decay = math_ops.cast(variance_decay, dtype)
    num_periods = math_ops.cast(num_periods, dtype)
    alpha = math_ops.cast(alpha, dtype)
    beta = math_ops.cast(beta, dtype)

    linear_decayed = (decay_steps - global_step) / decay_steps
    variance = initial_variance / (
        math_ops.pow(1.0 + global_step, variance_decay))
    std = math_ops.sqrt(variance)
    noisy_linear_decayed = (
        linear_decayed + random_ops.random_normal(
            linear_decayed.shape, stddev=std))

    completed_fraction = global_step / decay_steps
    fraction = 2.0 * num_periods * completed_fraction
    cosine_decayed = 0.5 * (
        1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
    noisy_linear_cosine_decayed = (
        (alpha + noisy_linear_decayed) * cosine_decayed + beta)

    return math_ops.multiply(
        learning_rate, noisy_linear_cosine_decayed, name=name)
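A minimal NumPy sketch of the same schedule evaluated for a single step; the parameter values and seed are illustrative:

import numpy as np

def noisy_linear_cosine(step, lr=1.0, decay_steps=1000, initial_variance=1.0,
                        variance_decay=0.55, num_periods=0.5, alpha=0.0,
                        beta=0.001, seed=0):
    rng = np.random.default_rng(seed)
    step = min(step, decay_steps)
    linear_decayed = (decay_steps - step) / decay_steps
    std = np.sqrt(initial_variance / (1.0 + step) ** variance_decay)
    eps_t = rng.normal(scale=std)  # 0-centered noise whose variance decays with step
    fraction = 2.0 * num_periods * step / decay_steps
    cosine_decayed = 0.5 * (1.0 + np.cos(np.pi * fraction))
    return lr * ((alpha + linear_decayed + eps_t) * cosine_decayed + beta)

print(noisy_linear_cosine(0))     # ~1.0 plus unit-variance noise at step 0
print(noisy_linear_cosine(1000))  # ~beta, since the cosine factor reaches 0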