Esempio n. 1
0
    def jsd(self, y_true, y_pred):
        """Compute the Jensen-Shannon divergence between two inputs.

        Both inputs are first passed through ``self.norm``. The result is half
        the sum of the KL divergences of each rescaled input from their
        midpoint mixture ``0.5 * (y_true + y_pred)``.

        Args:
            y_true: reference values.
            y_pred: predicted values.

        Returns:
            The divergence as produced by
            ``tf.keras.metrics.kullback_leibler_divergence``, scaled by 0.5.
        """
        # Rescale the prediction first, then the reference.
        pred_dist = self.norm(y_pred)
        true_dist = self.norm(y_true)

        # Midpoint mixture of the two rescaled inputs.
        mixture = math_ops.scalar_mul(0.5, true_dist + pred_dist)

        kl_pred = tf.keras.metrics.kullback_leibler_divergence(pred_dist, mixture)
        kl_true = tf.keras.metrics.kullback_leibler_divergence(true_dist, mixture)

        return math_ops.scalar_mul(0.5, kl_pred + kl_true)
    def clip_norm(self, g, c, n):
        """Clip a tensor by norm.

        Arguments:
          g: gradient tensor to clip.
          c: clipping threshold.
          n: norm of gradient tensor.

        Returns:
          Clipped gradient tensor; `g` is returned untouched when `c` is not
          greater than zero.
        """
        if not c > 0:
            return g

        exceeds = n >= c

        # Capture the shape before the conditional so it can be re-attached
        # to whatever `tf.cond` returns.
        if isinstance(g, ops.Tensor):
            saved_shape = copy.copy(g.get_shape())
        elif isinstance(g, ops.IndexedSlices):
            saved_shape = copy.copy(g.dense_shape)

        exceeds = tf.convert_to_tensor(exceeds, dtype=tf.bool)
        clipped = tf.cond(exceeds,
                          lambda: math_ops.scalar_mul(c / n, g),
                          lambda: g)

        if isinstance(clipped, ops.Tensor):
            clipped.set_shape(saved_shape)
        elif isinstance(clipped, ops.IndexedSlices):
            clipped._dense_shape = saved_shape

        return clipped
Esempio n. 3
0
  def testAcceptsTensor(self):
    """scalar_mul(3, ones) must equal ones * 3 for a 10x10 tensor."""
    ones = array_ops.ones([10, 10])
    actual = math_ops.scalar_mul(3, ones)
    wanted = array_ops.ones([10, 10]) * 3

    with test_util.device(use_gpu=True):
      wanted_value = self.evaluate(wanted)
      actual_value = self.evaluate(actual)
      self.assertAllEqual(wanted_value, actual_value)
        def compute_mean_fscore(name, weight=False):
            """Compute the mean per-class F-score from the confusion matrix.

            Reads `total_cm` from the enclosing scope. Row sums are used as the
            true count per class and column sums as the predicted count per
            class; the diagonal holds the true positives.

            Args:
              name: name given to the final op.
              weight: when exactly `False`, return the plain mean of the
                per-class scores; otherwise weight each class score by its
                true count (row sum).

            Returns:
              Scalar tensor with the (optionally weighted) mean F-score.
            """
            support = math_ops.to_float(math_ops.reduce_sum(total_cm, axis=1))
            predicted = math_ops.to_float(math_ops.reduce_sum(total_cm, axis=0))
            hits = math_ops.to_float(array_ops.diag_part(total_cm))

            def _ratio_or_zero(num, den):
                """Element-wise num / den, zero wherever den is not > 0."""
                positive = math_ops.greater(den, 0)
                quotient = math_ops.div(num, den)
                return array_ops.where(positive, quotient,
                                       array_ops.zeros_like(den))

            precision = _ratio_or_zero(hits, predicted)
            recall = _ratio_or_zero(hits, support)

            # Per-class score: 2 * P * R / (P + R), zero where the
            # denominator is not positive.
            pr_product = math_ops.multiply(precision, recall)
            twice_pr = math_ops.scalar_mul(2, pr_product)
            pr_sum = math_ops.add(precision, recall)
            per_class = _ratio_or_zero(twice_pr, pr_sum)

            if weight is not False:
                weighted_total = math_ops.reduce_sum(
                    math_ops.multiply(per_class, support))
                support_total = math_ops.reduce_sum(support)
                return math_ops.div(weighted_total, support_total, name=name)
            return math_ops.reduce_mean(per_class, name=name)
Esempio n. 5
0
 def testAcceptsRefs(self):
     """scalar_mul accepts a Variable: 3 * Variable(10) evaluates to 30."""
     ref = variables.Variable(10)
     product = math_ops.scalar_mul(3, ref)
     init_op = variables.global_variables_initializer()
     with self.test_session(use_gpu=True) as session:
         # The variable must be initialized before the product is read.
         session.run(init_op)
         self.assertEqual(30, product.eval())
Esempio n. 6
0
 def testScalarMul(self):
     """scalar_mul on IndexedSlices scales the values and keeps the indices."""
     with self.test_session():
         slice_values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
         slice_indices = constant_op.constant([0, 2])
         slices = ops.IndexedSlices(slice_values, slice_indices)
         scaled = math_ops.scalar_mul(-2, slices)
         self.assertAllEqual(scaled.values.eval(), [[-4, -6], [-10, -14]])
         self.assertAllEqual(scaled.indices.eval(), [0, 2])
Esempio n. 7
0
 def testAcceptsIndexedSlices(self):
   """scalar_mul(-3, IndexedSlices) scales values and leaves indices alone."""
   slice_values = constant_op.constant([2, 3, 5, 7, 0, -1], shape=[3, 2])
   slice_indices = constant_op.constant([0, 2, 5])
   slices = ops.IndexedSlices(slice_values, slice_indices)
   scaled = math_ops.scalar_mul(-3, slices)
   with self.test_session(use_gpu=True):
     self.assertAllEqual(scaled.values.eval(),
                         [[-6, -9], [-15, -21], [0, 3]])
     self.assertAllEqual(scaled.indices.eval(), [0, 2, 5])
Esempio n. 8
0
  def __init__(self, embedding, start_tokens, end_token, lm_logits):
    """Initializer.

    Args:
      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
        or the `params` argument for `embedding_lookup`.
      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
      end_token: `int32` scalar, the token that marks end of decoding.
      lm_logits: Logits handed to `self.logits_to_probs`; the resulting
        probabilities are scaled by `conf.antilm_penalization_weight` and
        stored as `self._penalized_lm_probs`.

    Raises:
      ValueError: if `start_tokens` is not a vector or `end_token` is not a
        scalar.
    """
    # Use the callable directly, or wrap the params in an embedding lookup.
    self._embedding_fn = (
        embedding if callable(embedding)
        else lambda ids: embedding_ops.embedding_lookup(embedding, ids))

    penalty = conf.antilm_penalization_weight
    lm_probs = self.logits_to_probs(lm_logits)
    self._penalized_lm_probs = math_ops.scalar_mul(penalty, lm_probs)

    start = ops.convert_to_tensor(
        start_tokens, dtype=dtypes.int32, name="start_tokens")
    self._start_tokens = start
    end = ops.convert_to_tensor(
        end_token, dtype=dtypes.int32, name="end_token")
    self._end_token = end

    if start.get_shape().ndims != 1:
      raise ValueError("start_tokens must be a vector")
    self._batch_size = array_ops.size(start_tokens)
    if end.get_shape().ndims != 0:
      raise ValueError("end_token must be a scalar")
    self._start_inputs = self._embedding_fn(start)
Esempio n. 9
0
 def testAcceptsRefs(self):
   """scalar_mul accepts a Variable: 3 * Variable(10) evaluates to 30."""
   ref = variables.Variable(10)
   product = math_ops.scalar_mul(3, ref)
   init_op = variables.initialize_all_variables()
   with self.test_session() as session:
     # Initialize the variable before reading the product.
     session.run(init_op)
     self.assertEqual(30, product.eval())
Esempio n. 10
0
    def testAcceptsTensor(self):
        """scalar_mul(3, ones) must equal ones * 3 for a 10x10 tensor."""
        base = array_ops.ones([10, 10])
        scaled = math_ops.scalar_mul(3, base)
        reference = array_ops.ones([10, 10]) * 3

        with test_util.device(use_gpu=True):
            reference_value = self.evaluate(reference)
            scaled_value = self.evaluate(scaled)
            self.assertAllEqual(reference_value, scaled_value)
Esempio n. 11
0
    def testAcceptsTensor(self):
        """scalar_mul(3, ones) must equal ones * 3 for a 10x10 tensor."""
        base = array_ops.ones([10, 10])
        scaled = math_ops.scalar_mul(3, base)
        reference = array_ops.ones([10, 10]) * 3

        with self.test_session(use_gpu=True):
            reference_value = reference.eval()
            scaled_value = scaled.eval()
            self.assertAllEqual(reference_value, scaled_value)
Esempio n. 12
0
def clip_norm(g, c, n):
  """Scale `g` by `c / n` when its norm `n` reaches the threshold `c`.

  Arguments:
    g: gradient tensor to clip.
    c: clipping threshold; nothing is done unless it is greater than zero.
    n: norm of gradient tensor.

  Returns:
    Clipped gradient tensor.
  """
  if not c > 0:
    return g

  exceeds = n >= c

  def _scaled():
    return math_ops.scalar_mul(c / n, g)

  def _unchanged():
    return g

  # Keep the shape around so it can be restored after the conditional
  # (avoids converting a sparse tensor to dense).
  if isinstance(g, ops.Tensor):
    saved_shape = copy.copy(g.get_shape())
  elif isinstance(g, ops.IndexedSlices):
    saved_shape = copy.copy(g.dense_shape)

  if exceeds.dtype != dtypes_module.bool:
    exceeds = math_ops.cast(exceeds, 'bool')
  clipped = control_flow_ops.cond(exceeds, _scaled, _unchanged)

  if isinstance(clipped, ops.Tensor):
    clipped.set_shape(saved_shape)
  elif isinstance(clipped, ops.IndexedSlices):
    clipped._dense_shape = saved_shape  # pylint: disable=protected-access
  return clipped
Esempio n. 13
0
 def testAcceptsRefs(self):
   """scalar_mul accepts a Variable: 3 * Variable(10) evaluates to 30."""
   ref = variables.Variable(10)
   tripled = math_ops.scalar_mul(3, ref)
   init_op = variables.global_variables_initializer()
   with self.test_session(use_gpu=True) as session:
     # Initialize the variable before reading the product.
     session.run(init_op)
     self.assertEqual(30, tripled.eval())
Esempio n. 14
0
  def testAcceptsTensor(self):
    """scalar_mul(3, ones) must equal ones * 3 for a 10x10 tensor."""
    ones = array_ops.ones([10, 10])
    actual = math_ops.scalar_mul(3, ones)
    wanted = array_ops.ones([10, 10]) * 3

    with self.test_session(use_gpu=True):
      wanted_value = wanted.eval()
      actual_value = actual.eval()
      self.assertAllEqual(wanted_value, actual_value)
Esempio n. 15
0
 def testAcceptsIndexedSlices(self):
   """scalar_mul(-3, IndexedSlices) scales values and leaves indices alone."""
   row_values = constant_op.constant([2, 3, 5, 7, 0, -1], shape=[3, 2])
   row_indices = constant_op.constant([0, 2, 5])
   sparse_rows = ops.IndexedSlices(row_values, row_indices)
   scaled = math_ops.scalar_mul(-3, sparse_rows)
   with self.test_session(use_gpu=True):
     self.assertAllEqual(scaled.values.eval(),
                         [[-6, -9], [-15, -21], [0, 3]])
     self.assertAllEqual(scaled.indices.eval(), [0, 2, 5])
Esempio n. 16
0
 def testScalarMul(self):
   """scalar_mul on IndexedSlices scales the values and keeps the indices."""
   with self.test_session():
     row_values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
     row_indices = constant_op.constant([0, 2])
     sparse_rows = ops.IndexedSlices(row_values, row_indices)
     scaled = math_ops.scalar_mul(-2, sparse_rows)
     self.assertAllEqual(scaled.values.eval(), [[-4, -6], [-10, -14]])
     self.assertAllEqual(scaled.indices.eval(), [0, 2])
Esempio n. 17
0
 def testAcceptsRefs(self):
   """scalar_mul accepts a Variable: 3 * Variable(10) evaluates to 30."""
   ref = variables.Variable(10)
   tripled = math_ops.scalar_mul(3, ref)
   init_op = variables.initialize_all_variables()
   with self.test_session() as session:
     # Initialize the variable before reading the product.
     session.run(init_op)
     self.assertEqual(30, tripled.eval())
Esempio n. 18
0
def clip_norm(g, c, n):
    """Clip a gradient so that its norm does not exceed a threshold.

    Arguments:
        g: gradient tensor to clip.
        c: clipping threshold; nothing is done unless it is greater than zero.
        n: norm of gradient tensor.

    Returns:
        Clipped gradient tensor.
    """
    if not c > 0:
        return g

    needs_clipping = n >= c

    # Remember the shape first; it is re-applied to the result of the
    # conditional (avoids converting a sparse tensor to dense).
    if isinstance(g, ops.Tensor):
        original_shape = copy.copy(g.get_shape())
    elif isinstance(g, ops.IndexedSlices):
        original_shape = copy.copy(g.dense_shape)

    if needs_clipping.dtype != dtypes_module.bool:
        needs_clipping = math_ops.cast(needs_clipping, 'bool')

    result = control_flow_ops.cond(
        needs_clipping,
        lambda: math_ops.scalar_mul(c / n, g),
        lambda: g)

    if isinstance(result, ops.Tensor):
        result.set_shape(original_shape)
    elif isinstance(result, ops.IndexedSlices):
        result._dense_shape = original_shape  # pylint: disable=protected-access
    return result
Esempio n. 19
0
def get_larc_optimizer(opt_type,
                       loss,
                       global_step,
                       learning_rate,
                       momentum=0.,
                       LARC_mode="clip",
                       LARC_eta=0.002,
                       LARC_epsilon=1. / 16000.):
    """Build an update op whose gradients are rescaled per variable (LARC).

    For every variable with a gradient, a local rate
    `LARC_eta * ||v|| / ||g||` is computed (or `LARC_epsilon` when either
    norm is zero) and the gradient is multiplied by it before being applied.

    Args:
        opt_type: one of "Adam", "RMSProp" or "SGD".
        loss: loss passed to the optimizer's `compute_gradients`.
        global_step: passed through to `apply_gradients`.
        learning_rate: learning rate for the underlying optimizer.
        momentum: momentum, used only for "SGD" (MomentumOptimizer).
        LARC_mode: "scale" uses the local rate as is; any other value caps
            it at 1.0.
        LARC_eta: coefficient of the norm ratio.
        LARC_epsilon: local rate used when a norm is zero.

    Returns:
        The op returned by the optimizer's `apply_gradients`.

    Raises:
        ValueError: if `opt_type` is not supported.
    """
    if opt_type not in ("Adam", "RMSProp", "SGD"):
        raise ValueError("Error, optimizer {} unsupported.".format(opt_type))

    # Pick the base optimizer.
    if opt_type == "Adam":
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif opt_type == "RMSProp":
        optim = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    else:
        optim = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=momentum)

    # `horovod` is a flag defined outside this function.
    if horovod:
        optim = hvd.DistributedOptimizer(optim)

    grads_and_vars = optim.compute_gradients(loss)
    for position, (grad, var) in enumerate(grads_and_vars):
        if grad is None:
            continue

        if horovod:
            # Combine the per-worker squared sums before taking the root.
            local_sq_sum = tf.reduce_sum(tf.square(var))
            var_norm = tf.sqrt(hvd.allreduce(local_sq_sum))
        else:
            var_norm = linalg_ops.norm(tensor=var, ord=2)
        grad_norm = linalg_ops.norm(tensor=grad, ord=2)

        both_nonzero = math_ops.logical_and(
            math_ops.not_equal(var_norm, tf.constant(0.0)),
            math_ops.not_equal(grad_norm, tf.constant(0.0)))
        local_lr = control_flow_ops.cond(
            pred=both_nonzero,
            true_fn=lambda: LARC_eta * var_norm / grad_norm,
            false_fn=lambda: LARC_epsilon)

        step_scale = (local_lr if LARC_mode == "scale"
                      else math_ops.minimum(local_lr, 1.0))

        grads_and_vars[position] = (math_ops.scalar_mul(step_scale, grad), var)

    train_op = optim.apply_gradients(grads_and_vars,
                                     global_step=global_step)

    # NOTE(review): no op is created inside this context, so the dependency
    # on `loss` presumably has no effect - confirm before relying on it.
    with tf.control_dependencies([loss]):
        return train_op
Esempio n. 20
0
 def testAcceptsRefs(self):
     """scalar_mul accepts a variable in both eager and graph mode."""
     # Eager mode gets a ResourceVariable; graph mode a plain Variable.
     ref = (resource_variable_ops.ResourceVariable(10, name="var")
            if context.executing_eagerly() else variables.Variable(10))
     tripled = math_ops.scalar_mul(3, ref)
     init_op = variables.global_variables_initializer()
     with test_util.device(use_gpu=True):
         self.evaluate(init_op)
         tripled_value = self.evaluate(tripled)
         self.assertEqual(30, tripled_value)
Esempio n. 21
0
 def testAcceptsRefs(self):
   """scalar_mul accepts a variable in both eager and graph mode."""
   # Eager mode gets a ResourceVariable; graph mode a plain Variable.
   ref = (resource_variable_ops.ResourceVariable(10, name="var")
          if context.executing_eagerly() else variables.Variable(10))
   product = math_ops.scalar_mul(3, ref)
   init_op = variables.global_variables_initializer()
   with test_util.device(use_gpu=True):
     self.evaluate(init_op)
     product_value = self.evaluate(product)
     self.assertEqual(30, product_value)
Esempio n. 22
0
def _SquaredDifferenceGrad(op, grad):
  """Returns the gradient for (x-y)^2.

  Args:
    op: the op being differentiated; its first two inputs are x and y.
    grad: gradient with respect to the op's output.

  Returns:
    A pair (gradient for x, gradient for y), each summed over the axes
    reported by `broadcast_gradient_args` and reshaped to the input's shape.
  """
  x = op.inputs[0]
  y = op.inputs[1]
  x_shape = array_ops.shape(x)
  y_shape = array_ops.shape(y)
  x_axes, y_axes = gen_array_ops.broadcast_gradient_args(x_shape, y_shape)
  with ops.control_dependencies([grad]):
    # Scale `grad` with scalar_mul first, then multiply by the Tensor (x - y).
    doubled_grad = math_ops.scalar_mul(2.0, grad)
    difference = x - y
    full_grad = doubled_grad * difference
  grad_x = array_ops.reshape(math_ops.reduce_sum(full_grad, x_axes), x_shape)
  # The gradient for y is the negated reduction.
  grad_y = -array_ops.reshape(math_ops.reduce_sum(full_grad, y_axes), y_shape)
  return (grad_x, grad_y)
Esempio n. 23
0
def _SquaredDifferenceGrad(op, grad):
    """Returns the gradient for (x-y)^2.

    Args:
        op: the op being differentiated; its first two inputs are x and y.
        grad: gradient with respect to the op's output.

    Returns:
        A pair (gradient for x, gradient for y), each summed over the axes
        reported by `broadcast_gradient_args` and reshaped to the input's
        shape.
    """
    lhs = op.inputs[0]
    rhs = op.inputs[1]
    lhs_shape = array_ops.shape(lhs)
    rhs_shape = array_ops.shape(rhs)
    lhs_axes, rhs_axes = gen_array_ops.broadcast_gradient_args(
        lhs_shape, rhs_shape)
    with ops.control_dependencies([grad]):
        # Scale `grad` with scalar_mul first, then multiply by the Tensor
        # (lhs - rhs).
        scaled_grad = math_ops.scalar_mul(2.0, grad)
        delta = lhs - rhs
        unreduced = scaled_grad * delta
    lhs_grad = array_ops.reshape(
        math_ops.reduce_sum(unreduced, lhs_axes), lhs_shape)
    # The gradient for the second input is the negated reduction.
    rhs_grad = -array_ops.reshape(
        math_ops.reduce_sum(unreduced, rhs_axes), rhs_shape)
    return (lhs_grad, rhs_grad)
    def _matrix_update(self, A, h):
        """Apply one fast-weight update step to `A` using the state `h`.

        Follows the fast weight update rule described by Ba et al. (2016):
        the previous matrix is scaled by `self._lam`, and the product of `h`
        (as a column) with `h` (as a row), scaled by `self._eta`, is added.

        Args:
            A: `3-D` tensor with shape `[batch_size x state_size x state_size]`,
                the fast weight matrix.
            h: `2-D` tensor with shape `[batch_size x state_size]`, the last
                network state.

        Returns:
            A `3-D` tensor with shape `[batch_size x state_size x state_size]`,
            i.e. the new fast weight matrix.
        """
        with ops.name_scope("fast_weight_update"):
            # View each state as a single row: [-1, 1, num_units].
            row = tf.reshape(h, [-1, 1, self._num_units])
            decayed = math_ops.scalar_mul(self._lam, A)
            column = array_ops.transpose(row, [0, 2, 1])
            outer = math_ops.matmul(column, row)
            updated = decayed + self._eta * outer
        return updated
Esempio n. 25
0
 def __call__(self, inputs, state, scope=None):
   """Run the wrapped cell and average its outputs with its inputs.

   Each output is combined with the matching input as `0.5 * (inp + out)`.

   Args:
     inputs: cell inputs.
     state: cell state.
     scope: optional cell scope.
   Returns:
     Tuple of the averaged outputs and the new state.
   Raises:
     TypeError: If cell inputs and outputs have different structure (type).
     ValueError: If cell inputs and outputs have different structure (value).
   """
   cell_outputs, next_state = self._cell(inputs, state, scope=scope)
   nest.assert_same_structure(inputs, cell_outputs)

   def _check_compatible(inp, out):
     # Shapes must be compatible before the two can be added.
     inp.get_shape().assert_is_compatible_with(out.get_shape())

   def _average(inp, out):
     return math_ops.scalar_mul(0.5, inp + out)

   nest.map_structure(_check_compatible, inputs, cell_outputs)
   averaged = nest.map_structure(_average, inputs, cell_outputs)
   return averaged, next_state
Esempio n. 26
0
    def __init__(self,
                 layers,
                 weights=None,
                 merge_fn=math_ops.add_n,
                 name="merge"):
        """Merge the tensors of several layers into one.

        Args:
            layers: list of at least two layers; the first one supplies
                `n_units`, `shape` and `dtype` for the parent constructor.
            weights: optional sequence with one multiplier per layer; each
                layer tensor is scaled by its weight before merging.
            merge_fn: callable that receives the list of (possibly scaled)
                tensors and returns the merged tensor.
            name: name passed to the parent constructor and used as the
                name scope.

        Raises:
            Exception: if fewer than two layers are given, or if `weights`
                is given with a length different from `layers`.
        """
        if len(layers) < 2:
            raise Exception("Expecting a list of layers with len >= 2")

        has_weights = weights is not None
        if has_weights and len(weights) != len(layers):
            raise Exception("len(weights) must be equals to len(layers)")

        first = layers[0]
        super().__init__(layers, first.n_units, first.shape, first.dtype, name)

        with name_scope(name):
            if has_weights:
                tensors = [
                    math_ops.scalar_mul(weights[idx], layer.tensor)
                    for idx, layer in enumerate(layers)
                ]
            else:
                tensors = [layer.tensor for layer in layers]
            self.tensor = merge_fn(tensors)
Esempio n. 27
0
 def testAcceptsConstant(self):
   """scalar_mul accepts a constant: 3 * constant(10) evaluates to 30."""
   ten = constant_op.constant(10)
   tripled = math_ops.scalar_mul(3, ten)
   with test_util.device(use_gpu=True):
     tripled_value = self.evaluate(tripled)
     self.assertEqual(30, tripled_value)
Esempio n. 28
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  LARS_nu=None,
                  LARS_epsilon=1.0/16384.0,
                  loss_scale=1.0):
  """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers include:

  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - by function taking learning rate `Tensor` as argument and returning an
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - by a subclass of `Optimizer` having a single-argument constructor
      (the argument is the learning rate), such as AdamOptimizer or
      AdagradOptimizer. E.g. `optimize_loss(...,
      optimizer=tf.train.AdagradOptimizer)`.
  - by an instance of a subclass of `Optimizer`.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it has
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    learning_rate: float or `Tensor`, magnitude of update per each training
                   step. Can be `None`.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If float, is provided, a global
      clipping is applied to prevent the norm of the gradient to exceed this
      value. Alternatively, a callable can be provided e.g.: adaptive_clipping.
      This callable takes a `list` of `(gradients, variables)` `tuple`s and
      returns the same thing with the gradients modified.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: `tf.train.exponential_decay`.
                            Ignored if `learning_rate` is not supplied.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.
    LARS_nu: If not None, LARS re-scaling will be applied https://arxiv.org/pdf/1708.03888.pdf with
      nu=LARS_nu
    LARS_epsilon: If either weight or gradient norm is zero, this will be returned as local LR
    loss_scale: float. When it differs from 1.0, the loss is multiplied by
      this value before gradients are computed and the resulting gradients
      are multiplied by `1.0 / loss_scale` afterwards.

  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` has the wrong type.
        * `clip_gradients` is neither float nor callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty.
  """
  loss = ops.convert_to_tensor(loss)
  contrib_framework.assert_scalar(loss)
  if global_step is None:
    global_step = contrib_framework.get_global_step()
  else:
    contrib_framework.assert_global_step(global_step)
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are ran before computing loss.
    if update_ops:
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    # Learning rate variable, with possible decay.
    lr = None
    if learning_rate is not None:
      if (isinstance(learning_rate, ops.Tensor) and
          learning_rate.get_shape().ndims == 0):
        lr = learning_rate
      elif isinstance(learning_rate, float):
        if learning_rate < 0.0:
          # Interpolate the value into the message; passing it as a second
          # argument would leave the "%s" placeholder unformatted.
          raise ValueError("Invalid learning_rate %s." % learning_rate)
        lr = vs.get_variable(
            "learning_rate", [],
            trainable=False,
            initializer=init_ops.constant_initializer(learning_rate))
      else:
        raise ValueError("Learning rate should be 0d Tensor or float. "
                         "Got %s of type %s" % (str(learning_rate),
                                                str(type(learning_rate))))
    if summaries is None:
      summaries = ["loss", "learning_rate", "global_gradient_norm"]
    else:
      for summ in summaries:
        if summ not in OPTIMIZER_SUMMARIES:
          raise ValueError("Summaries should be one of [%s], you provided %s." %
                           (", ".join(OPTIMIZER_SUMMARIES), summ))
    if learning_rate is not None and learning_rate_decay_fn is not None:
      if global_step is None:
        raise ValueError("global_step is required for learning_rate_decay_fn.")
      lr = learning_rate_decay_fn(lr, global_step)
      if "learning_rate" in summaries:
        summary.scalar("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is string (%s)." % optimizer)
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
    elif (isinstance(optimizer, type) and
          issubclass(optimizer, optimizer_.Optimizer)):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is class (%s)." % optimizer)
      opt = optimizer(learning_rate=lr)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    elif callable(optimizer):
      if learning_rate is not None:
        opt = optimizer(lr)
      else:
        opt = optimizer()
      if not isinstance(opt, optimizer_.Optimizer):
        raise ValueError("Unrecognized optimizer: function should return "
                         "subclass of Optimizer. Got %s." % str(opt))
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer, instance of "
                       "subclass of Optimizer or function with one argument. "
                       "Got %s." % str(optimizer))

    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    # Compute gradients (on the scaled loss when loss scaling is requested).
    gradients = opt.compute_gradients(
        loss if loss_scale==1.0 else loss_scale*loss,
        variables,
        colocate_gradients_with_ops=colocate_gradients_with_ops)
    # Undo the loss scaling on the gradients.
    if loss_scale!=1.0:
      gradients = _multiply_gradients_const(gradients, 1.0 / loss_scale)

    # LARS gradient re-scaling
    if LARS_nu is not None and isinstance(LARS_nu, float):
      for idx, (g, v) in enumerate(gradients):
        # Variables without a gradient come back as (None, var); there is
        # nothing to rescale for them and taking a norm of None would fail.
        if g is None:
          continue
        v_norm = linalg_ops.norm(tensor=v, ord=2)
        g_norm = linalg_ops.norm(tensor=g, ord=2)
        # Local rate is nu * ||v|| / ||g||, or LARS_epsilon if a norm is zero.
        lars_local_lr = control_flow_ops.cond(
          pred = math_ops.logical_and(math_ops.not_equal(v_norm, array_ops.constant(0.0)),
                                      math_ops.not_equal(g_norm, array_ops.constant(0.0))),
          true_fn = lambda: LARS_nu * v_norm / g_norm,
          false_fn = lambda: LARS_epsilon)
        gradients[idx] = (math_ops.scalar_mul(lars_local_lr, g), v)

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(gradients,
                                                 gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)
      if not gradients:
        raise ValueError(
            "Empty list of (gradient, var) pairs encountered. This is most "
            "likely to be caused by an improper value of gradient_multipliers.")

    if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
      summary.scalar("global_norm/gradient_norm",
                     clip_ops.global_norm(list(zip(*gradients))[0]))

    # Optionally clip gradients by global norm.
    if isinstance(clip_gradients, float):
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)
    elif callable(clip_gradients):
      gradients = clip_gradients(gradients)
    elif clip_gradients is not None:
      raise ValueError(
          "Unknown type %s for clip_gradients" % type(clip_gradients))

    # Add scalar summary for loss.
    if "loss" in summaries:
      summary.scalar("loss", loss)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient

      if grad_values is not None:
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
          summary.histogram("gradients/%s" % var_name, grad_values)
        if "gradient_norm" in summaries:
          summary.scalar("gradient_norm/%s" % var_name,
                         clip_ops.global_norm([grad_values]))

    if clip_gradients is not None and ("global_gradient_norm" in summaries or
                                       "gradient_norm" in summaries):
      summary.scalar("global_norm/clipped_gradient_norm",
                     clip_ops.global_norm(list(zip(*gradients))[0]))

    # Create gradient updates.
    grad_updates = opt.apply_gradients(
        gradients,
        global_step=global_step if increment_global_step else None,
        name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
Esempio n. 29
0
 def norm(self, x):
     """Rescale `x` by the reciprocal of the sum of all its elements."""
     total = math_ops.reduce_sum(x)
     inverse_total = 1 / total
     return math_ops.scalar_mul(inverse_total, x)
Esempio n. 30
0
 def testAcceptsConstant(self):
   """scalar_mul accepts a constant: 3 * constant(10) evaluates to 30."""
   ten = constant_op.constant(10)
   tripled = math_ops.scalar_mul(3, ten)
   with self.test_session(use_gpu=True):
     tripled_value = tripled.eval()
     self.assertEqual(30, tripled_value)
Esempio n. 31
0
 def testAcceptsConstant(self):
     """scalar_mul accepts a constant: 3 * constant(10) evaluates to 30."""
     base = constant_op.constant(10)
     product = math_ops.scalar_mul(3, base)
     with test_util.device(use_gpu=True):
         product_value = self.evaluate(product)
         self.assertEqual(30, product_value)
def get_larc_optimizer(optimizer, loss, global_step, steps_per_epoch,
                       use_horovod):
    """Build a training op that applies LARC-rescaled gradients.

    For every variable with a gradient, a layer-wise rate
    ``LARC_eta * ||v|| / ||g||`` is computed and used to rescale the gradient
    before it is handed to the underlying optimizer.

    Args:
        optimizer: configuration mapping read through ``get_dict_default``.
            Keys used (with defaults): "LARC_mode" ("clip"), "LARC_eta"
            (0.002), "LARC_epsilon" (1/16000), "gradient_lag" (0),
            "opt_type" ("LARC-Adam"; also "LARC-RMSProp", "LARC-SGD"),
            "beta1" (0.9) and "beta2" (0.999) for Adam, "momentum" (0.) for
            SGD.
        loss: loss tensor to differentiate.
        global_step: step variable forwarded to ``get_learning_rate`` and to
            ``apply_gradients``.
        steps_per_epoch: forwarded to ``get_learning_rate``.
        use_horovod: when true and ``hvd.size() > 1``, gradients are summed
            across workers with ``hvd.allreduce`` and the averaging factor is
            folded into the LARC scaling.

    Returns:
        Tuple ``(grad_updates, learning_rate)`` where ``grad_updates`` is the
        op applying the gradients (grouped with the lag-buffer updates when
        "gradient_lag" > 0).

    Raises:
        ValueError: if "opt_type" names an unsupported optimizer.
    """
    # learning rate
    learning_rate = get_learning_rate(optimizer, global_step, steps_per_epoch)

    # LARC hyper-parameters
    LARC_mode = get_dict_default(optimizer, "LARC_mode", "clip")
    LARC_eta = get_dict_default(optimizer, "LARC_eta", 0.002)
    LARC_epsilon = get_dict_default(optimizer, "LARC_epsilon", 1. / 16000.)

    # when > 0, each step applies the gradient stored during the previous step
    gradient_lag = get_dict_default(optimizer, "gradient_lag", 0)

    # set up the underlying optimizer
    opt_type = get_dict_default(optimizer, "opt_type", "LARC-Adam")
    if opt_type == "LARC-Adam":
        beta1 = get_dict_default(optimizer, "beta1", 0.9)
        beta2 = get_dict_default(optimizer, "beta2", 0.999)
        # forward the configured betas; previously they were read and dropped
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=beta1,
                                       beta2=beta2)
    elif opt_type == "LARC-RMSProp":
        optim = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    elif opt_type == "LARC-SGD":
        momentum = get_dict_default(optimizer, "momentum", 0.)
        optim = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=momentum)
    else:
        raise ValueError("Error, optimizer {} unsupported.".format(opt_type))

    # instead of using the horovod wrapper, we do the allreduce ourselves below

    # compute gradients
    grads_and_vars = optim.compute_gradients(loss)
    lag_ops = []
    for idx, (g, v) in enumerate(grads_and_vars):
        if g is None:
            continue

        if gradient_lag > 0:
            # use the stored gradient now; the fresh one is saved afterwards
            g_lag = tf.Variable(initial_value=tf.zeros(g.shape, g.dtype),
                                trainable=False,
                                name=v.name.replace(":", "_") + '_lag')
            g_next = g
            g = g_lag

        if use_horovod and (hvd.size() > 1):
            # if we ask for an average, it does a scalar divide, but
            # we can bake that into the scaling below
            g = hvd.allreduce(g, average=False)
            g_scale = 1. / hvd.size()
        else:
            g_scale = 1

        v_norm = linalg_ops.norm(tensor=v, ord=2)
        g_norm = linalg_ops.norm(tensor=g, ord=2)

        # fall back to LARC_epsilon when either norm is exactly zero
        larc_local_lr = control_flow_ops.cond(
            pred=math_ops.logical_and(
                math_ops.not_equal(v_norm, tf.constant(0.0)),
                math_ops.not_equal(g_norm, tf.constant(0.0))),
            true_fn=lambda: (LARC_eta / g_scale) * v_norm / g_norm,
            false_fn=lambda: LARC_epsilon)

        if LARC_mode == "scale":
            effective_lr = larc_local_lr
        else:
            # pick the smaller rate, then divide out the global learning rate
            # because the optimizer multiplies it in again when the gradients
            # are applied
            effective_lr = math_ops.minimum(larc_local_lr,
                                            learning_rate) / learning_rate

        # fold in the allreduce averaging factor
        effective_lr *= g_scale

        # multiply gradients
        g_scaled = math_ops.scalar_mul(effective_lr, g)
        grads_and_vars[idx] = (g_scaled, v)

        if gradient_lag > 0:
            # once we've computed g_scaled, it's safe to overwrite g_lag
            with tf.control_dependencies([g_scaled]):
                lag_ops.append(g_lag.assign(g_next))

    # apply gradients, making sure to complete the forward pass first
    with tf.control_dependencies([loss]):
        grad_updates = optim.apply_gradients(grads_and_vars,
                                             global_step=global_step)
    if gradient_lag > 0:
        # tf.group takes its ops as positional arguments, so unpack the list
        grad_updates = tf.group(grad_updates, *lag_ops)

    return grad_updates, learning_rate
Esempio n. 33
0
 def testAcceptsConstant(self):
     # scalar_mul must accept a constant tensor as its operand: 3 * 10 == 30.
     ten = constant_op.constant(10)
     tripled = math_ops.scalar_mul(3, ten)
     with self.test_session(use_gpu=True):
         actual = tripled.eval()
         self.assertEqual(30, actual)
Esempio n. 34
0
  def gradients_with_scaling(ys,
                             xs,
                             grad_ys=None,
                             name="gradients",
                             colocate_gradients_with_ops=False,
                             gate_gradients=False,
                             aggregation_method=None,
                             stop_gradients=None,
                             unconnected_gradients=UnconnectedGradients.NONE):
    """Compute gradients of `ys` w.r.t. `xs`, applying loss scaling if configured.

    The scale is taken from the current mixed-precision config: the
    `loss_scale` of its 'auto' entry if present, else its 'constant' entry,
    else 1.0. When scaling is active, each `y` is multiplied by the scale
    before differentiation and each resulting gradient is multiplied by
    `1 / scale` afterwards. With an 'auto' entry, gradients are additionally
    replaced by zeros when a NaN or an infinite maximum is detected.

    Args:
      ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients,
        aggregation_method, stop_gradients, unconnected_gradients: forwarded
        positionally to `gradients` (after `ys` has optionally been scaled).

    Returns:
      A list of gradients, one per entry returned by `gradients`; `None`
      entries are passed through unchanged.
    """
    # Loss scaling is driven by the current mixed-precision config.
    ys = _AsList(ys)
    mp_config = _current_mp_config()
    # No config, nothing to differentiate, or a variant-typed first `y`:
    # fall back to the plain `gradients` call without any scaling.
    if not mp_config or len(ys) == 0 or ys[0].dtype == dtypes.variant:
      grads = gradients(ys, xs, grad_ys, name,
                        colocate_gradients_with_ops,
                        gate_gradients,
                        aggregation_method,
                        stop_gradients,
                        unconnected_gradients)
      return grads

    # Pick the loss scale: 'auto' takes precedence over 'constant'.
    scale = 1.0
    if mp_config.get('auto'):
      scale = mp_config['auto'].loss_scale
    elif mp_config.get('constant'):
      scale = mp_config['constant']
    # Scale every y up front; skipped only for a non-Tensor scale equal to 1.0.
    if isinstance(scale, ops.Tensor) or scale != 1.0:
      with ops.name_scope(name, "gradients"):
        # Name reserved for colocation bookkeeping; not marked as used.
        gradient_uid = ops.get_default_graph().unique_name("uid",
                                                           mark_as_used=False)
        scaled_ys = []
        scale_ts = ops.convert_to_tensor(scale)
        for y in ys:
          with _maybe_colocate_with(y.op,
                                    gradient_uid,
                                    colocate_gradients_with_ops):
            # Cast the scale to each y's dtype before multiplying.
            y = math_ops.scalar_mul(math_ops.cast(scale_ts, dtype=y.dtype), y)
          scaled_ys.append(y)
        ys = scaled_ys
    # Differentiate the (possibly scaled) ys.
    grads_scaled = gradients(ys, xs, grad_ys,
                             name,
                             colocate_gradients_with_ops,
                             gate_gradients,
                             aggregation_method,
                             stop_gradients,
                             unconnected_gradients)
    # Undo the scaling on every gradient that exists.
    if isinstance(scale, ops.Tensor) or scale != 1.0:
      with ops.name_scope(name, "gradients"):
        unscale = 1.0 / scale
        unscale_ts = ops.convert_to_tensor(unscale)
        grads = []
        for grad in grads_scaled:
          if grad is not None:
            with _maybe_colocate_with(grad.op,
                                      gradient_uid,
                                      colocate_gradients_with_ops):
              grad = math_ops.scalar_mul(
                  math_ops.cast(unscale_ts, dtype=grad.dtype), grad)
          grads.append(grad)
    else:
      grads = grads_scaled

    # Automatic loss scaling: check the gradients for NaN and Inf.
    if mp_config.get('auto'):
      # check the grads
      grad_has_nans, grad_amax = AutomaticLossScaler.check_grads(grads)
      # the gradients will be ignored in the following two cases:
      #   1) there is Nan in the gradients;
      #   2) the maximum value is infinity
      should_skip_update = math_ops.logical_or(math_ops.is_inf(grad_amax),
                                               grad_has_nans)
      loss_scale_update_op = mp_config['auto'].update_op(grad_has_nans,
                                                         grad_amax)
      grads_update = []
      # Make the returned gradients depend on the loss-scale update op.
      with ops.control_dependencies([loss_scale_update_op]):
        for grad in grads:
          if grad is not None:
            # NOTE(review): gradient_uid is only bound inside the scaling
            # branch above; if 'auto' is set but `scale` is a non-Tensor equal
            # to 1.0 it would be unbound here -- confirm that cannot happen.
            with _maybe_colocate_with(grad.op,
                                      gradient_uid,
                                      colocate_gradients_with_ops):
              grad_zero = _zero_grad(grad)
              # Substitute the zero gradient when the update must be skipped.
              grad = control_flow_ops.cond(should_skip_update,
                                           lambda: grad_zero,
                                           lambda: grad)
          grads_update.append(grad)
      return grads_update
    return grads