Example 1
    def _resource_apply_dense(self, grad, var, apply_state=None):
        # pylint: disable=no-name-in-module,import-error
        from tensorflow.python.training import training_ops

        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)) or self._fallback_apply_state(
                var_device, var_dtype)

        if var.name in self.lr_multipliers:
            lr_t = coefficients["lr_t"] * self.lr_multipliers[var.name]
        else:
            lr_t = coefficients["lr_t"]

        if self._momentum:
            momentum_var = self.get_slot(var, "momentum")
            return training_ops.resource_apply_keras_momentum(
                var.handle,
                momentum_var.handle,
                lr_t,
                grad,
                coefficients["momentum"],
                use_locking=self._use_locking,
                use_nesterov=self.nesterov,
            )
        else:
            return training_ops.resource_apply_gradient_descent(
                var.handle, lr_t, grad, use_locking=self._use_locking)
Example 2
    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))
        alpha = self._get_hyper('learning_rate', var_dtype)

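        # On the first step, seed the momentum with the raw gradient and initialize
        # the diagonal preconditioner slot 'dh' to ones; afterwards, keep an
        # exponential moving average of the gradient and build a clipped
        # preconditioner from 'dh'.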
        if self._is_first:
            momentum = grad
            H_inv = tf.ones(var.shape, var_dtype)
            self.get_slot(var, 'dh').assign(H_inv)
            self._is_first = False
        else:
            momentum = self.get_slot(var, 'g_mom')
            momentum_new = self._get_hyper(
                'momentum', var_dtype) * momentum + (
                    1 - self._get_hyper('momentum', var_dtype)) * grad
            momentum.assign(momentum_new)

            dh = self.get_slot(var, 'dh')
            ones = tf.ones(var.shape, var_dtype)
            H_inv = tf.maximum(
                tf.math.reciprocal_no_nan(dh),
                tf.multiply(ones,
                            self._get_hyper('epsilon', var_dtype) / alpha))
            H_inv = tf.minimum(
                H_inv,
                tf.multiply(ones,
                            self._get_hyper('gamma', var_dtype) / alpha))
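        # Scale the momentum by the preconditioner and the base learning rate, then
        # apply the result below as a gradient-descent step with step size 1.0.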
        update = alpha * H_inv * momentum
        self.get_slot(var, 'update').assign(update)

        return training_ops.resource_apply_gradient_descent(
            var.handle, 1.0, update, use_locking=self._use_locking)
Example 3
    def _resource_apply_dense(self, grad, var, apply_state=None):
        # here is the change from SGD with momentum: clip the weights into [-1, 1]
        K.set_value(var, K.clip(var, min_value=-1.0, max_value=1.0))

        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))

        if self._momentum:
            momentum_var = self.get_slot(var, "momentum")
            return training_ops.resource_apply_keras_momentum(
                var.handle,
                momentum_var.handle,
                coefficients["lr_t"],
                grad,
                coefficients["momentum"],
                use_locking=self._use_locking,
                use_nesterov=self.nesterov)
        else:
            return training_ops.resource_apply_gradient_descent(
                var.handle,
                coefficients["lr_t"],
                grad,
                use_locking=self._use_locking)
Example 4
 def _resource_apply_dense(self, grad, var):
     return training_ops.resource_apply_gradient_descent(
         var.handle,
         math_ops.cast(self._get_hyper("learning_rate"),
                       var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking)
Example 5
 def _resource_apply_dense(self, grad, handle):
     return training_ops.resource_apply_gradient_descent(
         handle.handle,
         math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
         grad,
         use_locking=self._use_locking,
     )
Example 6
 def _resource_apply_dense(self, grad, var):
     rms = self.get_slot(var, 'rms')
     new_grad = self._apply_noisy_update(rms, grad, var)
     return training_ops.resource_apply_gradient_descent(
         var.handle,
         tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         new_grad,
         use_locking=self._use_locking)
Example 7
 def _resource_apply_dense(self, grad, var):
     if 'batch_norm/moving_' in var.name:
         # pose update as a gradient descent update to ensure compatibility with
         # gradient accumulation v1 op
         return training_ops.resource_apply_gradient_descent(
             var.handle,
             math_ops.cast(self._update_step, grad.dtype.base_dtype),
             grad,
             use_locking=self._use_locking)
     else:
         return super()._resource_apply_dense(grad, var)
Example 8
 def _resource_apply_dense(self, grad, var, state):
     if self._use_momentum:
         mom = state.get_slot(var, "momentum")
         return training_ops.resource_apply_momentum(
             var.handle,
             mom.handle,
             state.get_hyper("learning_rate", var.dtype.base_dtype),
             grad,
             state.get_hyper("momentum", var.dtype.base_dtype),
             use_locking=self._use_locking,
             use_nesterov=self._use_nesterov)
     else:
         lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
         return training_ops.resource_apply_gradient_descent(
             var.handle, lr, grad, use_locking=self._use_locking)
Example 9
 def _resource_apply_dense(self, grad, var, state):
   if self._use_momentum:
     mom = state.get_slot(var, "momentum")
     return training_ops.resource_apply_momentum(
         var.handle,
         mom.handle,
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         grad,
         state.get_hyper("momentum", var.dtype.base_dtype),
         use_locking=self._use_locking,
         use_nesterov=self._use_nesterov)
   else:
     lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
     return training_ops.resource_apply_gradient_descent(
         var.handle, lr, grad, use_locking=self._use_locking)
Example 10
  def _resource_apply_dense(self, grad, var):
    max_learning_rate = tf.where(
        self.iterations < tf.cast(self._burnin, tf.int64),
        self._burnin_max_learning_rate, self._max_learning_rate)

    learn_rates = tf.clip_by_value(
        self._get_coordinatewise_learning_rate(grad, var), 0.,
        tf.cast(max_learning_rate, var.dtype.base_dtype))

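    # Fold the clipped per-coordinate learning rates into the gradient and apply
    # the result as a plain gradient-descent step with step size 1.0.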
    newgrad = grad * learn_rates
    return training_ops.resource_apply_gradient_descent(
        var.handle,
        tf.cast(1., var.dtype),
        newgrad,
        use_locking=self._use_locking)
Example 11
 def _resource_apply_dense(self, grad, var):
   var_dtype = var.dtype.base_dtype
   lr_t = self._decayed_lr(var_dtype)
   if self._momentum:
     momentum_var = self.get_slot(var, "momentum")
     return training_ops.resource_apply_keras_momentum(
         var.handle,
         momentum_var.handle,
         lr_t,
         grad,
         self._get_hyper("momentum", var_dtype),
         use_locking=self._use_locking,
         use_nesterov=self.nesterov)
   else:
     return training_ops.resource_apply_gradient_descent(
         var.handle, lr_t, grad, use_locking=self._use_locking)
Example 12
 def _resource_apply_dense(self, grad, var):
   var_dtype = var.dtype.base_dtype
   lr_t = self._decayed_lr(var_dtype)
   if self._momentum:
     momentum_var = self.get_slot(var, "momentum")
     return training_ops.resource_apply_keras_momentum(
         var.handle,
         momentum_var.handle,
         lr_t,
         grad,
         self._get_hyper("momentum", var_dtype),
         use_locking=self._use_locking,
         use_nesterov=self.nesterov)
   else:
     return training_ops.resource_apply_gradient_descent(
         var.handle, lr_t, grad, use_locking=self._use_locking)
Example 13
  def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    if self._momentum:
      momentum_var = self.get_slot(var, "momentum")
      return training_ops.resource_apply_keras_momentum(
          var.handle,
          momentum_var.handle,
          coefficients["lr_t"],
          grad,
          coefficients["momentum"],
          use_locking=self._use_locking,
          use_nesterov=self.nesterov)
    else:
      return training_ops.resource_apply_gradient_descent(
          var.handle, coefficients["lr_t"], grad, use_locking=self._use_locking)
Example 14
 def _resource_apply_dense(self, grad, var):
   learning_rate = self._get_hyper("learning_rate")
   if self._momentum:
     momentum_var = self.get_slot(var, "momentum")
     return training_ops.resource_apply_momentum(
         var.handle,
         momentum_var.handle,
         math_ops.cast(learning_rate, grad.dtype.base_dtype),
         grad,
         math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype),
         use_locking=self._use_locking,
         use_nesterov=self._nesterov)
   else:
     return training_ops.resource_apply_gradient_descent(
         var.handle,
         math_ops.cast(learning_rate, grad.dtype.base_dtype),
         grad,
         use_locking=self._use_locking)
Example 15
 def _resource_apply_dense(self, grad, var):
     learning_rate = self._get_hyper("learning_rate")
     if self._momentum:
         momentum_var = self.get_slot(var, "momentum")
         return training_ops.resource_apply_momentum(
             var.handle,
             momentum_var.handle,
             math_ops.cast(learning_rate, grad.dtype.base_dtype),
             grad,
             math_ops.cast(self._get_hyper("momentum"),
                           grad.dtype.base_dtype),
             use_locking=self._use_locking,
             use_nesterov=self._nesterov)
     else:
         return training_ops.resource_apply_gradient_descent(
             var.handle,
             math_ops.cast(learning_rate, grad.dtype.base_dtype),
             grad,
             use_locking=self._use_locking)
Example 16
 def _resource_apply_dense(self, grad, var, apply_state=None):
     variable_name = var.name
     var_device, var_dtype = var.device, var.dtype.base_dtype
     coefficients = ((apply_state or {}).get((var_device, var_dtype))
                     or self._fallback_apply_state(var_device, var_dtype))

     # Update the auxiliary alpha/beta/sigma state variables, each posed as a
     # gradient-descent step with step size 1.0.
     if self._alpha_func is not None:
         training_ops.resource_apply_gradient_descent(
             self._alpha_dict[variable_name].handle, tf.constant(1.0),
             self._alpha_func(var.shape, self._alpha_dict[variable_name], grad),
             use_locking=self._use_locking)
         if self._beta_func is not None:
             training_ops.resource_apply_gradient_descent(
                 self._beta_dict[variable_name].handle, tf.constant(1.0),
                 self._beta_func(var.shape, self._alpha_dict[variable_name],
                                 self._beta_dict[variable_name], grad),
                 use_locking=self._use_locking)
             if self._sigma_func is not None:
                 training_ops.resource_apply_gradient_descent(
                     self._sigma_dict[variable_name].handle, tf.constant(1.0),
                     self._sigma_func(var.shape, self._alpha_dict[variable_name],
                                      self._beta_dict[variable_name],
                                      self._sigma_dict[variable_name], grad),
                     use_locking=self._use_locking)

     # The final parameter update also uses step size 1.0; _grad_func builds the
     # full update from the auxiliary state and the raw gradient.
     return training_ops.resource_apply_gradient_descent(
         var.handle, tf.constant(1.0),
         self._grad_func(var.shape, self._alpha_dict[variable_name],
                         self._beta_dict[variable_name],
                         self._sigma_dict[variable_name], grad),
         use_locking=self._use_locking)
Example 17
    def _resource_apply_dense(self, grad, var):
        momentum_buffer = self.get_slot(var, "momentum")
        learning_rate = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
        nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)

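        # Two-part update on the same variable: a momentum step with effective
        # learning rate nu * (1 - momentum) * lr, followed (via the control
        # dependency) by a plain gradient step with learning rate (1 - nu) * lr.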
        momentum_op = training_ops.resource_apply_momentum(
            var.handle,
            momentum_buffer.handle,
            nu * (1.0 - momentum) * learning_rate,
            grad,
            momentum,
            use_locking=self._use_locking,
            use_nesterov=False,
        )

        with ops.control_dependencies([momentum_op]):
            gd_op = training_ops.resource_apply_gradient_descent(
                var.handle, (1.0 - nu) * learning_rate, grad, use_locking=self._use_locking
            )

        return control_flow_ops.group(momentum_op, gd_op)
Example 18
    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(
            var_device, var_dtype
        )

        if self._momentum:
            momentum_var = self.get_slot(var, "momentum")
            return training_ops.resource_apply_keras_momentum(
                var.handle,
                momentum_var.handle,
                coefficients["lr_t"],
                grad,
                coefficients["momentum"],
                use_locking=self._use_locking,
                use_nesterov=self.nesterov,
            )
        else:
            lr = coefficients["lr_t"]
            if "transpose" in var.name:
                lr = constant(self._serialize_hyperparameter("learning_rate_deconv"))
            return training_ops.resource_apply_gradient_descent(var.handle, lr, grad, use_locking=self._use_locking)
Example 19
        def apply_proxy_gradients(self, proxy_grads_and_vars):
            lengths = [
                v.get_shape().as_list()[-1] for g, v in proxy_grads_and_vars
            ]
            grads, vars = zip(*proxy_grads_and_vars)
            with ops.init_scope():
                self._create_slots(vars)
            all_grads = tf.concat(grads, axis=-1)
            all_vars = tf.concat(vars, axis=-1, name='Proxy_concat')

            rand = tf.constant(get_rand(), dtype=self._dtype)
            rand = tf.reshape(rand, shape=(1, 1, 1, 1, -1))

            all_grads = self.proxy_bw(all_vars, all_grads, rand)
            grads = tf.split(all_grads, lengths, axis=-1)
            updated_vars = []
            for grad, var in zip(grads, vars):
                updated_var = super()._apply_weight_update(grad, var)
                updated_vars.append(updated_var)
            updated_vars = tf.concat(updated_vars, axis=-1)

            # do proxy forward pass for the next step
            updated_vars = self.proxy_fw(updated_vars, rand)
            updated_vars = tf.split(updated_vars, lengths, axis=-1)

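            # Pose the assignment var <- updated_var as a gradient-descent step with
            # step size 1.0: var - 1.0 * (var - updated_var) == updated_var.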
            grads_and_vars = [(v - up_v, v)
                              for v, up_v in zip(vars, updated_vars)]

            return tf.group([
                training_ops.resource_apply_gradient_descent(
                    var.handle,
                    tf.constant(1.0, grad.dtype.base_dtype),
                    grad,
                    use_locking=self._use_locking)
                for grad, var in grads_and_vars
            ])
Example 20
 def _resource_apply_dense(self, grad, var):
   return training_ops.resource_apply_gradient_descent(
       var.handle,
       math_ops.cast(self._get_hyper("learning_rate"), var.dtype.base_dtype),
       grad,
       use_locking=self._use_locking)
Example 21
 def _resource_apply_dense(self, grad, handle, state):
     lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
     return training_ops.resource_apply_gradient_descent(
         handle.handle, lr, grad, use_locking=self._use_locking)
Example 22
 def _wrapWOAccu(accuGrads, grad, var, apply_state):
     return training_ops.resource_apply_gradient_descent(
         var.handle, 0.0, grad * 0.0, use_locking=self._use_locking)
Example 23
 def _resource_apply_dense(self, grad, handle):
   return training_ops.resource_apply_gradient_descent(
       handle.handle, math_ops.cast(self._learning_rate_tensor,
                                    grad.dtype.base_dtype),
       grad, use_locking=self._use_locking)
Example 24
 def _resource_apply_dense(self, grad, handle, state):
   lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
   return training_ops.resource_apply_gradient_descent(
       handle.handle, lr, grad, use_locking=self._use_locking)
Example 25
    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype

        return training_ops.resource_apply_gradient_descent(
                var.handle, 1.0, grad, use_locking=self._use_locking)
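
All of the snippets above follow the same pattern: an optimizer subclass overrides `_resource_apply_dense` and delegates the in-place variable update to `training_ops.resource_apply_gradient_descent`. Below is a minimal, self-contained sketch of that pattern, not taken from any of the examples above. It assumes a TensorFlow 2.x release before the Keras optimizer rewrite in 2.11, where `tf.keras.optimizers.Optimizer` is the `OptimizerV2` base class and the private `training_ops` module is importable; the class name `PlainSGD` is made up for illustration.

import tensorflow as tf
from tensorflow.python.training import training_ops  # private module, as in the examples


class PlainSGD(tf.keras.optimizers.Optimizer):
    """Bare resource-variable gradient descent (dense updates only)."""

    def __init__(self, learning_rate=0.01, name="PlainSGD", **kwargs):
        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        # "lr_t" is prepared by the base class for each (device, dtype) pair.
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))
        return training_ops.resource_apply_gradient_descent(
            var.handle, coefficients["lr_t"], grad,
            use_locking=self._use_locking)

    def get_config(self):
        config = super().get_config()
        config.update({
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
            "decay": self._serialize_hyperparameter("decay"),
        })
        return config

An instance such as `PlainSGD(learning_rate=0.1)` can then be passed to `model.compile(optimizer=...)` like any built-in optimizer; a complete optimizer would also override `_resource_apply_sparse` to handle embedding-style sparse gradients.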