Example no. 1
    def _testTypesForAdam(self, var, m, v, grad, use_gpu):
        self.setUp()
        with self.test_session(use_gpu=use_gpu):
            var_t = variables.Variable(var)
            m_t = variables.Variable(m)
            v_t = variables.Variable(v)

            t = 1
            beta1 = np.array(0.9, dtype=var.dtype)
            beta2 = np.array(0.999, dtype=var.dtype)
            beta1_power = beta1**t
            beta2_power = beta2**t
            lr = np.array(0.001, dtype=var.dtype)
            epsilon = np.array(1e-8, dtype=var.dtype)
            beta1_t = constant_op.constant(beta1, self._toType(var.dtype), [])
            beta2_t = constant_op.constant(beta2, self._toType(var.dtype), [])
            beta1_power_t = variables.Variable(beta1_power)
            beta2_power_t = variables.Variable(beta2_power)
            lr_t = constant_op.constant(lr, self._toType(var.dtype), [])
            epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype),
                                             [])
            variables.initialize_all_variables().run()

            self.assertAllEqual(var, var_t.eval())
            new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr,
                                                  beta1, beta2, epsilon)
            apply_adam = training_ops.apply_adam(var_t, m_t, v_t,
                                                 beta1_power_t, beta2_power_t,
                                                 lr_t, beta1_t, beta2_t,
                                                 epsilon_t, grad)
            out = apply_adam.eval()
            self.assertShapeEqual(out, apply_adam)
            self.assertAllClose(new_var, out)
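The test compares the kernel against a NumPy reference, but _adamUpdateNumpy itself is not shown. A minimal sketch of what such a reference update could look like, assuming the standard Adam formulas (the name and defaults here are illustrative, not taken from the snippet):

import numpy as np

def adam_update_numpy(param, g_t, t, m, v,
                      lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Bias-corrected step size from the Adam paper.
    lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
    m_t = beta1 * m + (1 - beta1) * g_t          # first-moment estimate
    v_t = beta2 * v + (1 - beta2) * g_t * g_t    # second-moment estimate
    param_t = param - lr_t * m_t / (np.sqrt(v_t) + epsilon)
    return param_t, m_t, v_t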
Example no. 2
 def _apply_dense(self, grad, var):
   m = self.get_slot(var, "m")
   v = self.get_slot(var, "v")
   return training_ops.apply_adam(
       var, m, v, self._beta1_power, self._beta2_power,
       self._lr_t, self._beta1_t, self._beta2_t,
       self._epsilon_t, grad, use_locking=self._use_locking).op
Example no. 3
  def _testTypesForAdam(self, var, m, v, grad, use_gpu):
    self.setUp()
    with self.test_session(use_gpu=use_gpu):
      var_t = variables.Variable(var)
      m_t = variables.Variable(m)
      v_t = variables.Variable(v)

      t = 1
      beta1 = np.array(0.9, dtype=var.dtype)
      beta2 = np.array(0.999, dtype=var.dtype)
      beta1_power = beta1**t
      beta2_power = beta2**t
      lr = np.array(0.001, dtype=var.dtype)
      epsilon = np.array(1e-8, dtype=var.dtype)
      beta1_t = constant_op.constant(beta1, self._toType(var.dtype), [])
      beta2_t = constant_op.constant(beta2, self._toType(var.dtype), [])
      beta1_power_t = variables.Variable(beta1_power)
      beta2_power_t = variables.Variable(beta2_power)
      lr_t = constant_op.constant(lr, self._toType(var.dtype), [])
      epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
      variables.global_variables_initializer().run()

      self.assertAllCloseAccordingToType(var, var_t.eval())
      new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr, beta1,
                                            beta2, epsilon)
      apply_adam = training_ops.apply_adam(var_t, m_t, v_t, beta1_power_t,
                                           beta2_power_t, lr_t, beta1_t,
                                           beta2_t, epsilon_t, grad)
      out = apply_adam.eval()
      self.assertShapeEqual(out, apply_adam)
      self.assertAllCloseAccordingToType(new_var, out)
Example no. 4
    def _apply_dense(self, grad, var):
        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        # For the BNN kernel: the original weight-clipping rule is
        #   new_w = old_w + scale * (new_w - old_w)
        # and the Adam update is
        #   new_w = old_w - lr_t * m_t / (sqrt(v_t) + epsilon).
        # Substituting the Adam update into the clipping rule gives
        #   new_w = old_w - (scale * lr_t * m_t) / (sqrt(v_t) + epsilon),
        # so the scale below can simply be folded into the learning rate.
        scale = self._weight_scale[var.name] / 4

        return training_ops.apply_adam(var,
                                       m,
                                       v,
                                       math_ops.cast(self._beta1_power,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._beta2_power,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._lr_t * scale,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._beta1_t,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._beta2_t,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._epsilon_t,
                                                     var.dtype.base_dtype),
                                       grad,
                                       use_locking=self._use_locking).op
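The substitution described in the comments can be checked numerically: scaling the learning rate by scale produces exactly the clipped update. A minimal sketch with made-up values (none of these numbers come from the snippet):

import numpy as np

old_w, m_t, v_t = 0.5, 0.02, 0.004
lr_t, epsilon, scale = 0.001, 1e-8, 0.25

adam_step = old_w - lr_t * m_t / (np.sqrt(v_t) + epsilon)
clipped = old_w + scale * (adam_step - old_w)                     # clipping rule
folded = old_w - (scale * lr_t) * m_t / (np.sqrt(v_t) + epsilon)  # scale folded into lr
assert np.isclose(clipped, folded)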
Example no. 5
 def _apply_dense(self, grad, var):
   m = self.get_slot(var, "m")
   v = self.get_slot(var, "v")
   return training_ops.apply_adam(
       var, m, v, self._beta1_power, self._beta2_power,
       self._lr_t, self._beta1_t, self._beta2_t,
       self._epsilon_t, grad, use_locking=self._use_locking).op
Example no. 6
    def _apply_dense(self, grad, var):
        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")
        beta1_power, beta2_power = self._get_beta_accumulators()

        # Clip gradients to 3 estimated standard deviations.
        clip_bounds = 3 * tf.sqrt(v / (1 - beta2_power)) + 0.1
        grad = tf.clip_by_value(grad, -clip_bounds, clip_bounds)
        return training_ops.apply_adam(var,
                                       m,
                                       v,
                                       math_ops.cast(beta1_power,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(beta2_power,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._lr_t,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._beta1_t,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._beta2_t,
                                                     var.dtype.base_dtype),
                                       math_ops.cast(self._epsilon_t,
                                                     var.dtype.base_dtype),
                                       grad,
                                       use_locking=self._use_locking).op
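The bound comes from treating v / (1 - beta2_power) as the bias-corrected second-moment estimate, whose square root approximates the gradient's RMS; the constant 0.1 keeps the bound away from zero early in training, when v is still small. A NumPy sketch of the same computation, with illustrative values:

import numpy as np

v = np.array([1e-4, 4e-4])       # running second-moment estimate
beta2_power = 0.999 ** 10        # beta2**t after 10 steps
grad = np.array([0.5, -0.9])

clip_bounds = 3 * np.sqrt(v / (1 - beta2_power)) + 0.1
clipped = np.clip(grad, -clip_bounds, clip_bounds)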
Example no. 7
    def _apply_dense(self, grad, var):
        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")
        beta1_power, beta2_power, _ = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)

        return tf.cond(
            self.condition,
            lambda: training_ops.apply_adam(
                var,
                m,
                v,
                beta1_power,
                math_ops.cast(beta2_power, var.dtype.base_dtype),
                # rectified learning rate, used instead of self._lr_t
                math_ops.cast(self.rectified_lr, var.dtype.base_dtype),
                beta1_t,
                beta2_t,
                math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
                grad,
                use_locking=self._use_locking).op,
            lambda: self._apply_dense_without_v(var, m, v, beta1_power,
                                                beta1_t, beta2_t, grad),
        )
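The snippet does not show how self.rectified_lr or self.condition are built, but the pattern (a rectified learning rate with a fallback update that ignores v) matches RAdam. A plausible sketch of the rectification term, assuming the formulas from the RAdam paper:

import numpy as np

def rectified_lr(lr, beta2, t):
    rho_inf = 2.0 / (1.0 - beta2) - 1.0
    rho_t = rho_inf - 2.0 * t * beta2**t / (1.0 - beta2**t)
    if rho_t <= 4.0:
        # Variance is not yet tractable: fall back to the update without v.
        return None
    r_t = np.sqrt(((rho_t - 4) * (rho_t - 2) * rho_inf) /
                  ((rho_inf - 4) * (rho_inf - 2) * rho_t))
    return lr * r_t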
Example no. 8
 def _apply_dense(self, grad, var):
   m = self.get_slot(var, "m")
   v = self.get_slot(var, "v")
   return training_ops.apply_adam(
       var, m, v,
       math_ops.cast(self._beta1_power, var.dtype.base_dtype),
       math_ops.cast(self._beta2_power, var.dtype.base_dtype),
       math_ops.cast(self._lr_t, var.dtype.base_dtype),
       math_ops.cast(self._beta1_t, var.dtype.base_dtype),
       math_ops.cast(self._beta2_t, var.dtype.base_dtype),
       math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
       grad, use_locking=self._use_locking).op
Example no. 9
 def _apply_dense(self, grad, var):
   m = self.get_slot(var, "m")
   v = self.get_slot(var, "v")
   return training_ops.apply_adam(
       var, m, v,
       math_ops.cast(self._beta1_power, var.dtype.base_dtype),
       math_ops.cast(self._beta2_power, var.dtype.base_dtype),
       math_ops.cast(self._lr_t, var.dtype.base_dtype),
       math_ops.cast(self._beta1_t, var.dtype.base_dtype),
       math_ops.cast(self._beta2_t, var.dtype.base_dtype),
       math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
       grad, use_locking=self._use_locking).op
Example no. 10
 def _apply_dense(self, grad, var, state):
   m = state.get_slot(var, "m")
   v = state.get_slot(var, "v")
   beta1_power, beta2_power = self._get_beta_accumulators(state)
   return training_ops.apply_adam(
       var, m, v,
       math_ops.cast(beta1_power, var.dtype.base_dtype),
       math_ops.cast(beta2_power, var.dtype.base_dtype),
       state.get_hyper("learning_rate", var.dtype.base_dtype),
       state.get_hyper("beta1", var.dtype.base_dtype),
       state.get_hyper("beta2", var.dtype.base_dtype),
       state.get_hyper("epsilon", var.dtype.base_dtype),
       grad, use_locking=self._use_locking).op
Example no. 11
 def _apply_dense(self, grad, var, state):
   m = state.get_slot(var, "m")
   v = state.get_slot(var, "v")
   beta1_power, beta2_power = self._get_beta_accumulators(state)
   return training_ops.apply_adam(
       var, m, v,
       math_ops.cast(beta1_power, var.dtype.base_dtype),
       math_ops.cast(beta2_power, var.dtype.base_dtype),
       state.get_hyper("learning_rate", var.dtype.base_dtype),
       state.get_hyper("beta1", var.dtype.base_dtype),
       state.get_hyper("beta2", var.dtype.base_dtype),
       state.get_hyper("epsilon", var.dtype.base_dtype),
       grad, use_locking=self._use_locking).op
Example no. 12
 def _apply_dense_in_action(self, grad, var):
   m = self.get_slot(var, "m")
   v = self.get_slot(var, "v")
   beta1_power, beta2_power = self._get_beta_accumulators()
   return training_ops.apply_adam(
       var,
       m,
       v,
       tf.cast(beta1_power, var.dtype.base_dtype),
       tf.cast(beta2_power, var.dtype.base_dtype),
       tf.cast(self._lr_t, var.dtype.base_dtype),
       tf.cast(self._beta1_t, var.dtype.base_dtype),
       tf.cast(self._beta2_t, var.dtype.base_dtype),
       tf.cast(self._epsilon_t, var.dtype.base_dtype),
       grad,
       use_locking=self._use_locking).op
Example no. 13
 def _apply_dense(self, grad, var):
   m = self.get_slot(var, "m")
   v = self.get_slot(var, "v")
   beta1_power, beta2_power = self._get_beta_accumulators()
   return training_ops.apply_adam(
       var,
       m,
       v,
       math_ops.cast(beta1_power, var.dtype.base_dtype),
       math_ops.cast(beta2_power, var.dtype.base_dtype),
       math_ops.cast(self._lr_t, var.dtype.base_dtype),
       math_ops.cast(self._beta1_t, var.dtype.base_dtype),
       math_ops.cast(self._beta2_t, var.dtype.base_dtype),
       math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
       grad,
       use_locking=self._use_locking,
       use_nesterov=True).op
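With use_nesterov=True the kernel applies the Nesterov-momentum variant of the Adam step. A minimal NumPy sketch of the resulting parameter update, assuming m_t and v_t are the already-updated moments and lr_t the bias-corrected step size:

import numpy as np

def nesterov_adam_step(param, g_t, m_t, v_t, lr_t, beta1, epsilon):
    # The fresh gradient replaces part of the first moment, "looking
    # ahead" one momentum step before the parameter update.
    return param - lr_t * (beta1 * m_t + (1 - beta1) * g_t) / (np.sqrt(v_t) + epsilon)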
Example no. 14
 def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     ops = self._get_ops_tester()
     ops_up = ops.assign_add(1)
     return control_flow_ops.group(*[
         training_ops.apply_adam(
             var,
             m,
             v,
             math_ops.cast(beta1_power, var.dtype.base_dtype),
             math_ops.cast(beta2_power, var.dtype.base_dtype),
             math_ops.cast(self._lr_t, var.dtype.base_dtype),
             math_ops.cast(self._beta1_t, var.dtype.base_dtype),
             math_ops.cast(self._beta2_t, var.dtype.base_dtype),
             math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
             grad,
             use_locking=self._use_locking).op, ops_up
     ])
Example no. 15
 def adam_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     #grad = tf.Print(grad,["A"])
     return training_ops.apply_adam(var,
                                    m,
                                    v,
                                    math_ops.cast(beta1_power,
                                                  var.dtype.base_dtype),
                                    math_ops.cast(beta2_power,
                                                  var.dtype.base_dtype),
                                    math_ops.cast(self._lr_t,
                                                  var.dtype.base_dtype),
                                    math_ops.cast(self._beta1_t,
                                                  var.dtype.base_dtype),
                                    math_ops.cast(self._beta2_t,
                                                  var.dtype.base_dtype),
                                    math_ops.cast(self._epsilon_t,
                                                  var.dtype.base_dtype),
                                    grad,
                                    use_locking=self._use_locking).op
Example no. 16
 def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
     # apply_adam updates its first argument in place, so it must be given
     # the variable itself; the weight decay is therefore applied with an
     # in-place assign instead of by passing a derived tensor.
     deps = []
     if self._do_use_weight_decay(self._get_variable_name(var.name)):
         deps.append(var.assign(self._weight_decay_rate * var,
                                use_locking=self._use_locking))
     with tf.control_dependencies(deps):
         # NOTE: apply_adam expects the accumulated powers beta1**t and
         # beta2**t as its fourth and fifth arguments; this optimizer
         # passes the raw betas instead.
         return training_ops.apply_adam(var,
                                        m,
                                        v,
                                        tf.cast(self._beta1,
                                                var.dtype.base_dtype),
                                        tf.cast(self._beta2,
                                                var.dtype.base_dtype),
                                        tf.cast(self._learning_rate,
                                                var.dtype.base_dtype),
                                        tf.cast(self._beta1,
                                                var.dtype.base_dtype),
                                        tf.cast(self._beta2,
                                                var.dtype.base_dtype),
                                        tf.cast(self._epsilon,
                                                var.dtype.base_dtype),
                                        grad,
                                        use_locking=self._use_locking).op
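For contrast, a decoupled (AdamW-style) decay applies the penalty directly to the weights outside the adaptive update, instead of scaling the variable before the Adam step. A minimal sketch, with illustrative names:

import numpy as np

def adamw_step(param, m_t, v_t, lr_t, epsilon, wd):
    # Adam step plus a weight-decay term that bypasses the m/v scaling.
    return param - lr_t * m_t / (np.sqrt(v_t) + epsilon) - lr_t * wd * param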