Example 1
    def _apply_dense(self, grad, var):
        var_name = var.name.replace(':', '_')
        with tf.variable_scope('apply_dense/{}'.format(var_name)):

            gradient = self._gradient[var.name]
            gradient.update_slots(self, var)

            grad_flat = tf.reshape(grad, [-1])
            gradient_ops = gradient.update_statistics(grad_flat)

            grad_apply, assign_ops = gradient.compute_apply(gradient_ops,
                                                            {})

            grad_apply = tf.reshape(grad_apply, grad.get_shape())
            if assign_ops:
                with tf.control_dependencies(assign_ops):
                    update_ops = training_ops.apply_gradient_descent(
                        var,
                        math_ops.cast(1.0, var.dtype.base_dtype),
                        grad_apply,
                        use_locking=self._use_locking).op
            else:
                update_ops = training_ops.apply_gradient_descent(
                    var,
                    math_ops.cast(1.0, var.dtype.base_dtype),
                    grad_apply,
                    use_locking=self._use_locking).op
            return update_ops
Example 2
 def _apply_dense(self, grad, var):
     grad = self._fft_solver(grad, var.name)
     return training_ops.apply_gradient_descent(
         var,
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking).op
Example 3
    def _apply_dense(self, grad, var):
        lr = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        update_op = training_ops.apply_gradient_descent(
            var, lr, grad, use_locking=self._use_locking).op

        if self._flow.edgename_map.get(var.op.name):
            with ops.control_dependencies([update_op]):
                key = self._flow.edgename_map[var.op.name]
                if self._flow.flow:
                    threshold = math_ops.cast(
                        ops.convert_to_tensor(
                            self._flow.flow[key] * self._learning_rate_tensor *
                            self._group_lasso_strength_tensor),
                        var.dtype.base_dtype)
                else:
                    threshold = math_ops.cast(
                        ops.convert_to_tensor(
                            self._learning_rate_tensor *
                            self._group_lasso_strength_tensor),
                        var.dtype.base_dtype)

                norm = math_ops.maximum(math_ops.abs(var), 1E-16)
                mask = math_ops.maximum(1.0 - (threshold / norm), 0.)
                new_var = math_ops.multiply(var, mask)
                shrinkage = state_ops.assign(var, new_var)
            return shrinkage
        else:
            return update_op
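Read as a proximal step, the mask in this example is the soft-thresholding (shrinkage) operator of the (group) lasso penalty, applied elementwise after the plain gradient step; with the names above it amounts to, roughly,

    $$ w \;\leftarrow\; w \cdot \max\!\left(1 - \frac{\tau}{\max(|w|,\,10^{-16})},\; 0\right), \qquad \tau = \eta\,\lambda \;(\times\,\mathrm{flow}[k] \text{ when } \texttt{self.\_flow.flow} \text{ is set}), $$

where $\eta$ is the learning rate and $\lambda$ the group-lasso strength.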
Example 4
 def _apply_dense(self, grad, var):
     return training_ops.apply_gradient_descent(
         var,
         math_ops.cast(self._get_hyper("learning_rate"),
                       var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking).op
Example 5
 def _apply_dense(self, grad, var):
     lr = self._learning_rate_tensor
     grad_clipped = tf.minimum(var/lr, grad)
     return training_ops.apply_gradient_descent(
         var,
         self._learning_rate_tensor,
         grad_clipped,
         use_locking=self._use_locking).op
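The clipping in this example is a non-negativity projection in disguise: elementwise, and assuming $\eta > 0$,

    $$ w - \eta \min\!\big(w/\eta,\; g\big) \;=\; \max\!\big(w - \eta g,\; 0\big), $$

so the update can never drive the variable below zero.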
Example 6
 def _apply_sparse(self, grad, var):
   rms = self.get_slot(var, 'rms')
   new_grad = self._apply_noisy_update(rms, grad, var)
   return training_ops.apply_gradient_descent(
       var,
       tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
       new_grad,
       use_locking=self._use_locking).op
Example 7
 def _apply_sparse(self, grad, var):
     rms = self.get_slot(var, 'rms')
     new_grad = self._apply_noisy_update(rms, grad, var)
     return training_ops.apply_gradient_descent(
         var,
         tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         new_grad,
         use_locking=self._use_locking).op
Example 8
 def _testTypes(self, x, alpha, delta, use_gpu=None):
     self.setUp()
     with self.session(use_gpu=use_gpu):
         var = variables.VariableV1(x)
         variables.global_variables_initializer().run()
         self.assertAllCloseAccordingToType(x, self.evaluate(var))
         apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
         out = self.evaluate(apply_sgd)
         self.assertShapeEqual(out, apply_sgd)
         self.assertAllCloseAccordingToType(x - alpha * delta, out)
Example 9
 def _testTypes(self, x, alpha, delta, use_gpu=None):
     self.setUp()
     with self.test_session(use_gpu=use_gpu):
         var = variables.Variable(x)
         variables.initialize_all_variables().run()
         self.assertAllEqual(x, var.eval())
         apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
         out = apply_sgd.eval()
         self.assertShapeEqual(out, apply_sgd)
         self.assertAllEqual(x - alpha * delta, out)
Example 10
 def _testTypes(self, x, alpha, delta, use_gpu=None):
   self.setUp()
   with self.test_session(use_gpu=use_gpu):
     var = variables.Variable(x)
     variables.global_variables_initializer().run()
     self.assertAllCloseAccordingToType(x, var.eval())
     apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
     out = apply_sgd.eval()
     self.assertShapeEqual(out, apply_sgd)
     self.assertAllCloseAccordingToType(x - alpha * delta, out)
Example 11
  def _apply_sparse(self, grad, var):
    rms = self.get_slot(var, 'rms')

    with ops.control_dependencies([
        self._update_momentum(rms, grad, math_ops.cast(self._decay_tensor,
                                                       var.dtype.base_dtype))]):
      new_grad = self._apply_noisy_update(rms, grad)

    return training_ops.apply_gradient_descent(
        var,
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        new_grad,
        use_locking=self._use_locking).op
Example 12
    def _apply_dense(self, grad, var):
        previous_grad = self.get_slot(var, "previous_grad")
        lr = self.get_slot(var, "learning_rate")
        scale_factor = tf.pow(self._scale_tensor,
                              tf.sign(grad * previous_grad))
        lr_update = lr.assign(lr * scale_factor)
        with tf.control_dependencies([lr_update]):
            previous_grad_update = previous_grad.assign(grad)
            with tf.control_dependencies([previous_grad_update]):
                apply_grad_op = training_ops.apply_gradient_descent(
                    var, 1.0, lr * grad, use_locking=self._use_locking).op

        return apply_grad_op
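The scale-factor trick here is an Rprop/delta-bar-delta style rule: each coordinate's learning rate grows when consecutive gradients agree in sign and shrinks when they disagree, and the adapted rate is folded into the delta (note that apply_gradient_descent is called with a step size of 1.0). Schematically:

    $$ \eta_t = \eta_{t-1} \cdot s^{\operatorname{sign}(g_t \odot g_{t-1})}, \qquad w \;\leftarrow\; w - \eta_t \odot g_t, $$

with $s$ the scale tensor and $\odot$ the elementwise product.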
Example 13
 def _apply_dense(self, grad, var):
     if self.quantizer is None:
         return training_ops.apply_gradient_descent(
             var,
             math_ops.cast(self._learning_rate_tensor,
                           var.dtype.base_dtype),
             grad,
             use_locking=self._use_locking).op
     else:
         lr = math_ops.cast(self._learning_rate_tensor,
                            var.dtype.base_dtype)
         delta = self.quantizer.quantize(
             self.quantizer.quantize(grad) * self.quantizer.quantize(lr))
         new_var = self.quantizer.quantize(var - delta)
         return var.assign(new_var).op
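In the quantized branch the whole SGD step is routed through the quantizer (written $Q$ below), so both the operands and the result of the update are quantized:

    $$ w \;\leftarrow\; Q\!\big(w - Q\big(Q(g)\cdot Q(\eta)\big)\big). $$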
Example 14
  def _apply_dense(self, grad, var):
    max_learning_rate = tf.where(self._counter < self._burnin,
                                 self._burnin_max_learning_rate,
                                 self._max_learning_rate)

    learn_rates = tf.clip_by_value(
        self._get_coordinatewise_learning_rate(grad, var), 0.,
        tf.cast(max_learning_rate, var.dtype.base_dtype))

    newgrad = grad * learn_rates
    return training_ops.apply_gradient_descent(
        var,
        tf.cast(1., var.dtype),
        newgrad,
        use_locking=self._use_locking).op
Example 15
  def _apply_dense(self, grad, var):
    max_learning_rate = tf.where(self._counter < self._burnin,
                                 self._burnin_max_learning_rate,
                                 self._max_learning_rate)

    learn_rates = tf.clip_by_value(
        self._get_coordinatewise_learning_rate(grad, var), 0.,
        tf.cast(max_learning_rate, var.dtype.base_dtype))

    newgrad = grad * learn_rates
    return training_ops.apply_gradient_descent(
        var,
        tf.cast(1., var.dtype),
        newgrad,
        use_locking=self._use_locking).op
Example 16
 def _apply_dense(self, grad, var, state):
     if self._use_momentum:
         mom = state.get_slot(var, "momentum")
         return training_ops.apply_momentum(
             var,
             mom,
             state.get_hyper("learning_rate", var.dtype.base_dtype),
             grad,
             state.get_hyper("momentum", var.dtype.base_dtype),
             use_locking=self._use_locking,
             use_nesterov=self._use_nesterov).op
     else:
         return training_ops.apply_gradient_descent(
             var,
             state.get_hyper("learning_rate", var.dtype.base_dtype),
             grad,
             use_locking=self._use_locking).op
Example 17
 def _apply_dense(self, grad, var, state):
   if self._use_momentum:
     mom = state.get_slot(var, "momentum")
     return training_ops.apply_momentum(
         var,
         mom,
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         grad,
         state.get_hyper("momentum", var.dtype.base_dtype),
         use_locking=self._use_locking,
         use_nesterov=self._use_nesterov).op
   else:
     return training_ops.apply_gradient_descent(
         var,
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking).op
Example 18
    def _apply_dense(self, grad, var):

        previous_grad = self.get_slot(var, "previous_grad")
        lr = self.get_slot(var, "learning_rate")
        scale_factor = tf.pow(self._scale_tensor,
                              tf.sign(grad * previous_grad))
        lr_update = lr.assign(lr * scale_factor)
        #streaming_lr_mean, streaming_lr_update = tf.contrib.metrics.streaming_mean(lr_update)
        #streaming_lr_scalar = tf.summary.scalar('lr_cost', streaming_lr_update)
        lr_scalar = tf.summary.scalar("learning rate/{}".format(var.op.name),
                                      tf.reduce_mean(lr * scale_factor))
        with tf.control_dependencies([lr_update]):
            previous_grad_update = previous_grad.assign(grad)
            with tf.control_dependencies([previous_grad_update]):
                apply_grad_op = training_ops.apply_gradient_descent(
                    var, 1.0, lr * grad, use_locking=self._use_locking).op

        return apply_grad_op
Example 19
    def _apply_dense(self, grad, var):
        step = self.get_slot(var, "step")
        step_t = step.assign(step + 1)

        mu = self.get_slot(var, "mu")
        ax = self.get_slot(var, "ax")

        var_t = training_ops.apply_gradient_descent(
            var,
            math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
            grad,
            use_locking=self._use_locking)

        var_T = var_t.op

        if mu != 1:
            ax_t = ax.assign(ax + (var_t - ax) * mu)
        else:
            ax_t = ax.assign(var_t)
        mu_t = mu.assign(
            1 / tf.maximum(math_ops.cast(1, step_t.dtype), step_t - self._t0))
        return control_flow_ops.group(*[var_T, step_t, ax_t, mu_t])
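This reads like a port of averaged SGD (ASGD): after the plain gradient step, a running average ax of the iterates is kept and the averaging coefficient mu is annealed once the step count passes t0; schematically,

    $$ x_t = x_{t-1} - \eta\, g_t, \qquad \bar{x}_t = \bar{x}_{t-1} + \mu_t\,(x_t - \bar{x}_{t-1}), \qquad \mu_t = \frac{1}{\max(1,\; t - t_0)}. $$

(Note that `if mu != 1` above is an ordinary Python comparison on a variable object, so it appears to be resolved once at graph-construction time rather than per step.)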
Example 20
    def _apply_dense(self, grad, var):
        momentum_buffer = self.get_slot(var, "momentum")
        learning_rate = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
        nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)

        momentum_op = training_ops.apply_momentum(
            var,
            momentum_buffer,
            nu * (1.0 - momentum) * learning_rate,
            grad,
            momentum,
            use_locking=self._use_locking,
            use_nesterov=False,
        ).op

        with ops.control_dependencies([momentum_op]):
            gd_op = training_ops.apply_gradient_descent(
                var, (1.0 - nu) * learning_rate, grad, use_locking=self._use_locking
            ).op

        return control_flow_ops.group(momentum_op, gd_op)
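Taking apply_momentum's semantics as accum <- momentum * accum + grad followed by var <- var - lr_arg * accum, the two chained ops appear to implement a quasi-hyperbolic-momentum style update, with nu interpolating between plain SGD (nu = 0) and momentum SGD (nu = 1):

    $$ m_t = \beta\, m_{t-1} + g_t, \qquad \theta \;\leftarrow\; \theta - \eta\Big[(1-\nu)\, g_t + \nu\,(1-\beta)\, m_t\Big]. $$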
Example 21
 def _apply_dense(self, grad, var, state):
   return training_ops.apply_gradient_descent(
       var,
       state.get_hyper("learning_rate", var.dtype.base_dtype),
       grad,
       use_locking=self._use_locking).op
Example 22
 def _apply_dense(self, grad, var):
   return training_ops.apply_gradient_descent(
       var,
       self._learning_rate_tensor,
       grad,
       use_locking=self._use_locking).op
Example 23
 def _apply_dense(self, grad, var):
   return training_ops.apply_gradient_descent(
       var,
       math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
       grad,
       use_locking=self._use_locking).op
Example 24
 def _apply_dense(self, grad, var):
     return training_ops.apply_gradient_descent(
         var,
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking).op
Example 25
    def _apply_dense(self, grad, var):
        rms = self.get_slot(var, "rms")
        mom = self.get_slot(var, "momentum")
        eps = self.get_slot(var, 'eps')
        tf.summary.scalar('grad_norm', tf.norm(grad))
        # debug_here()
        if 'orthogonal_stiefel' in var.name and 'bias' not in var.name:
            with tf.variable_scope("orthogonal_update"):
                print('Applying an orthogonality preserving step to', var.name)
                # apply the rms update rule.
                new_rms = self._decay_tensor * rms + (1. - self._decay_tensor) \
                    * tf.square(grad)
                rms_assign_op = tf.assign(rms, new_rms)
                # scale the gradient.
                if self._nat_grad_normalization:
                    grad = grad / (tf.sqrt(rms) + eps)
                # the update should preserve orthogonality.
                grad_shape = tf.Tensor.get_shape(grad).as_list()
                # W_new_lst = []
                eye = tf.eye(grad_shape[0], dtype=tf.float32)
                G = grad
                W = var
                # Reunitarize after n steps.
                if self._qr_steps is not None:
                    W = tf.cond(tf.equal(tf.mod(self._global_step_tensor,
                                         self._qr_steps), 0),
                                lambda: self.re_unitarize(W), lambda: W)
                # A = tf.matmul(tf.transpose(G), W) - tf.matmul(tf.transpose(W), G)
                A = tf.matmul(G, tf.transpose(W)) - tf.matmul(W, tf.transpose(G))
                cayleyDenom = eye + (self._learning_rate_tensor/2.0) * A
                cayleyNumer = eye - (self._learning_rate_tensor/2.0) * A
                C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer)
                W_new = tf.matmul(C, W)
                if self._debug:
                    # self._summary_A(A)
                    self._summary_C(C)
                    self._summary_W(W)
                var_update_op = tf.assign(var, W_new)
                return tf.group(*[var_update_op, rms_assign_op])
        elif 'unitary_stiefel' in var.name and 'bias' not in var.name:
            with tf.variable_scope("unitary_update"):
                print('Applying a unitarity preserving step to', var.name)
                # apply the rms update rule.
                new_rms = self._decay_tensor * rms + (1. - self._decay_tensor) \
                    * tf.square(grad)
                rms_assign_op = tf.assign(rms, new_rms)
                # scale the gradient.
                if self._nat_grad_normalization:
                    grad = grad / (tf.sqrt(new_rms) + eps)
                # do an update step, which preserves unitary structure.
                # checking shapes.
                grad_shape = tf.Tensor.get_shape(grad).as_list()
                assert grad_shape[0] == grad_shape[1]
                eye = tf.eye(grad_shape[0], dtype=tf.complex64)
                G = tf.complex(grad[:, :, 0], grad[:, :, 1])
                W = tf.complex(var[:, :, 0], var[:, :, 1])

                # Reunitarize after n steps.
                if self._qr_steps is not None:
                    W = tf.cond(tf.equal(tf.mod(self._global_step_tensor,
                                         self._qr_steps), 0),
                                lambda: self.re_unitarize(W), lambda: W)

                A = tf.matmul(G, tf.conj(tf.transpose(W))) \
                    - tf.matmul(W, tf.conj(tf.transpose(G)))
                # A must be skew symmetric.
                learning_rate_scale = tf.complex(self._learning_rate_tensor/2.0,
                                                 tf.zeros_like(self._learning_rate_tensor))
                cayleyDenom = eye + learning_rate_scale * A
                cayleyNumer = eye - learning_rate_scale * A
                C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer)
                W_new = tf.matmul(C, W)
                if self._debug:
                    # self._summary_A(A)
                    self._summary_C(C)
                    self._summary_W(W)
                # debug_here()
                W_new_re = tf.real(W_new)
                W_new_img = tf.imag(W_new)
                W_array = tf.stack([W_new_re, W_new_img], -1)
                var_update_op = tf.assign(var, W_array)
                return tf.group(*[var_update_op, rms_assign_op])
        else:
            # do the usual RMSprop update
            rms = False  # RMSProp branch disabled below; always falls through to plain gradient descent.
            if rms:
                if 1:
                    # tensorflow default.
                    print('Applying standard rmsprop to', var.name)
                    return training_ops.apply_rms_prop(
                        var, rms, mom,
                        tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
                        tf.cast(self._decay_tensor, var.dtype.base_dtype),
                        tf.cast(self._momentum_tensor, var.dtype.base_dtype),
                        tf.cast(self._epsilon_tensor, var.dtype.base_dtype),
                        grad, use_locking=False).op
                else:
                    # My rmsprop implementation.
                    new_rms = self._decay_tensor * rms \
                        + (1. - self._decay_tensor) * tf.square(grad)
                    rms_assign_op = tf.assign(rms, new_rms)
                    W_new = var - self._learning_rate_tensor * grad \
                        / (tf.sqrt(new_rms) + eps)
                    var_update_op = tf.assign(var, W_new)
                    return tf.group(*[var_update_op, rms_assign_op])
            else:
                print('Applying default gradient descent to', var.name)
                return training_ops.apply_gradient_descent(
                    var,
                    tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
                    grad,
                    use_locking=False).op
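Both structure-preserving branches above are Cayley-transform updates on the Stiefel manifold: with G the (optionally RMS-normalized) gradient, W the current weights and $(\cdot)^{\mathsf{H}}$ the conjugate transpose (plain transpose in the real case), the step is

    $$ A = G W^{\mathsf{H}} - W G^{\mathsf{H}}, \qquad W \;\leftarrow\; \Big(I + \tfrac{\eta}{2} A\Big)^{-1}\Big(I - \tfrac{\eta}{2} A\Big)\, W. $$

A is skew-Hermitian by construction, so the Cayley factor is orthogonal/unitary in exact arithmetic and the constraint is preserved up to round-off; the periodic re_unitarize call cleans up accumulated drift.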
Example 26
 def _apply_dense(self, grad, var):
   return training_ops.apply_gradient_descent(
       var,
       self._learning_rate_tensor,
       grad,
       use_locking=self._use_locking).op
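For context, every snippet in this collection overrides a hook of the TF1-style tf.train.Optimizer base class; a minimal, self-contained sketch of how such an override plugs in (hypothetical class name PlainSGD, plain SGD only, no sparse or resource-variable support) might look like this:

from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops


class PlainSGD(optimizer.Optimizer):
    """Minimal optimizer whose dense update is training_ops.apply_gradient_descent."""

    def __init__(self, learning_rate=0.01, use_locking=False, name="PlainSGD"):
        super(PlainSGD, self).__init__(use_locking, name)
        self._learning_rate = learning_rate
        self._learning_rate_tensor = None

    def _prepare(self):
        # Called once per apply_gradients(); turn the Python hyperparameter into a tensor.
        self._learning_rate_tensor = ops.convert_to_tensor(
            self._learning_rate, name="learning_rate")

    def _apply_dense(self, grad, var):
        # Same pattern as the examples above: var -= lr * grad via the fused kernel.
        return training_ops.apply_gradient_descent(
            var,
            math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
            grad,
            use_locking=self._use_locking).op


# Usage (graph mode): train_op = PlainSGD(learning_rate=0.1).minimize(loss)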