def jsd(self, y_true, y_pred):
    """Symmetrised KL divergence of both inputs against their midpoint.

    Both inputs are first passed through `self.norm`. The result is
    `0.5 * (KL(y_pred, m) + KL(y_true, m))` with `m = 0.5 * (y_true + y_pred)`,
    which is the Jensen-Shannon divergence form.

    Args:
        y_true: reference tensor.
        y_pred: predicted tensor.

    Returns:
        The value produced by the final `scalar_mul`.
    """
    kl_divergence = tf.keras.metrics.kullback_leibler_divergence

    # normalize
    y_pred = self.norm(y_pred)
    y_true = self.norm(y_true)

    # Midpoint of the two normalised inputs.
    midpoint = math_ops.scalar_mul(0.5, y_true + y_pred)

    divergence_sum = (kl_divergence(y_pred, midpoint)
                      + kl_divergence(y_true, midpoint))
    return math_ops.scalar_mul(0.5, divergence_sum)
def clip_norm(self, g, c, n):
    """Clip a tensor by norm.

    Arguments:
        g: gradient tensor to clip.
        c: clipping threshold.
        n: norm of gradient tensor.

    Returns:
        Clipped gradient tensor; `g` itself when `c` is not positive.
    """
    if not c > 0:
        return g

    def _rescaled():
        return math_ops.scalar_mul(c / n, g)

    def _untouched():
        return g

    # Remember the static shape so it can be restored on the cond output.
    if isinstance(g, ops.Tensor):
        saved_shape = copy.copy(g.get_shape())
    elif isinstance(g, ops.IndexedSlices):
        saved_shape = copy.copy(g.dense_shape)

    needs_clipping = tf.convert_to_tensor(n >= c, dtype=tf.bool)
    clipped = tf.cond(needs_clipping, _rescaled, _untouched)

    if isinstance(clipped, ops.Tensor):
        clipped.set_shape(saved_shape)
    elif isinstance(clipped, ops.IndexedSlices):
        clipped._dense_shape = saved_shape
    return clipped
def testAcceptsTensor(self):
    """scalar_mul on a dense tensor equals multiplying the tensor by 3."""
    ones = array_ops.ones([10, 10])
    scaled = math_ops.scalar_mul(3, ones)
    reference = array_ops.ones([10, 10]) * 3

    with test_util.device(use_gpu=True):
        want = self.evaluate(reference)
        got = self.evaluate(scaled)
        self.assertAllEqual(want, got)
def compute_mean_fscore(name, weight=False):
    """Compute the mean per-class F-score via the confusion matrix.

    Reads the free variable `total_cm` (defined outside this function).
    Row sums are used as per-class true counts, column sums as per-class
    predicted counts, and the diagonal as true positives.

    Args:
        name: name given to the final op.
        weight: when falsy, return the plain mean of the per-class scores;
            otherwise return the mean weighted by per-class true counts.

    Returns:
        The result of the final `reduce_mean` or `div` op.
    """
    true_per_class = math_ops.to_float(math_ops.reduce_sum(total_cm, axis=1))
    pred_per_class = math_ops.to_float(math_ops.reduce_sum(total_cm, axis=0))
    true_positive = math_ops.to_float(array_ops.diag_part(total_cm))

    def _safe_div_score(numerator, denominator):
        """Return zero wherever the denominator is not positive."""
        return array_ops.where(math_ops.greater(denominator, 0),
                               math_ops.div(numerator, denominator),
                               array_ops.zeros_like(denominator))

    precision = _safe_div_score(true_positive, pred_per_class)
    recall = _safe_div_score(true_positive, true_per_class)

    # F = 2 * P * R / (P + R), zero where P + R is zero.
    numerator = math_ops.scalar_mul(2, math_ops.multiply(precision, recall))
    denominator = math_ops.add(precision, recall)
    fscores = _safe_div_score(numerator, denominator)

    if not weight:
        return math_ops.reduce_mean(fscores, name=name)

    # NOTE(review): this division is unguarded when the true counts sum to zero.
    sum_values = math_ops.reduce_sum(
        math_ops.multiply(fscores, true_per_class))
    num_values = math_ops.reduce_sum(true_per_class)
    return math_ops.div(sum_values, num_values, name=name)
def testAcceptsRefs(self):
    """scalar_mul accepts a variable as its tensor operand."""
    ref = variables.Variable(10)
    tripled = math_ops.scalar_mul(3, ref)
    init_op = variables.global_variables_initializer()

    with self.test_session(use_gpu=True) as session:
        session.run(init_op)
        self.assertEqual(30, tripled.eval())
def testScalarMul(self):
    """scalar_mul on IndexedSlices scales the values and keeps the indices."""
    with self.test_session():
        slice_values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
        slice_indices = constant_op.constant([0, 2])
        slices = ops.IndexedSlices(slice_values, slice_indices)

        scaled = math_ops.scalar_mul(-2, slices)

        self.assertAllEqual(scaled.values.eval(), [[-4, -6], [-10, -14]])
        self.assertAllEqual(scaled.indices.eval(), [0, 2])
def testAcceptsIndexedSlices(self):
    """scalar_mul on IndexedSlices scales the values and keeps the indices."""
    slice_values = constant_op.constant([2, 3, 5, 7, 0, -1], shape=[3, 2])
    slice_indices = constant_op.constant([0, 2, 5])
    slices = ops.IndexedSlices(slice_values, slice_indices)

    scaled = math_ops.scalar_mul(-3, slices)

    with self.test_session(use_gpu=True):
        self.assertAllEqual(scaled.values.eval(),
                            [[-6, -9], [-15, -21], [0, 3]])
        self.assertAllEqual(scaled.indices.eval(), [0, 2, 5])
def __init__(self, embedding, start_tokens, end_token, lm_logits):
    """Initializer.

    Args:
        embedding: A callable that takes a vector tensor of `ids` (argmax
            ids), or the `params` argument for `embedding_lookup`.
        start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
        end_token: `int32` scalar, the token that marks end of decoding.
        lm_logits: passed to `self.logits_to_probs`; the result is scaled by
            `conf.antilm_penalization_weight` (presumably language-model
            logits — TODO confirm shape and meaning against the caller).

    Raises:
        ValueError: if `start_tokens` is not a vector or `end_token` is not
            a scalar.
    """
    if callable(embedding):
        self._embedding_fn = embedding
    else:
        def _lookup(ids):
            return embedding_ops.embedding_lookup(embedding, ids)

        self._embedding_fn = _lookup

    lm_probs = self.logits_to_probs(lm_logits)
    self._penalized_lm_probs = math_ops.scalar_mul(
        conf.antilm_penalization_weight, lm_probs)

    self._start_tokens = ops.convert_to_tensor(
        start_tokens, dtype=dtypes.int32, name="start_tokens")
    self._end_token = ops.convert_to_tensor(
        end_token, dtype=dtypes.int32, name="end_token")

    start_rank = self._start_tokens.get_shape().ndims
    if start_rank != 1:
        raise ValueError("start_tokens must be a vector")
    self._batch_size = array_ops.size(start_tokens)

    end_rank = self._end_token.get_shape().ndims
    if end_rank != 0:
        raise ValueError("end_token must be a scalar")

    self._start_inputs = self._embedding_fn(self._start_tokens)
def testAcceptsRefs(self):
    """scalar_mul accepts a variable as its tensor operand."""
    ref = variables.Variable(10)
    tripled = math_ops.scalar_mul(3, ref)
    init_op = variables.initialize_all_variables()

    with self.test_session() as session:
        session.run(init_op)
        self.assertEqual(30, tripled.eval())
def testAcceptsTensor(self):
    """Check scalar_mul(3, ones) against ones * 3 for a 10x10 tensor."""
    shape = [10, 10]
    product = math_ops.scalar_mul(3, array_ops.ones(shape))
    baseline = array_ops.ones(shape) * 3

    with test_util.device(use_gpu=True):
        baseline_value = self.evaluate(baseline)
        product_value = self.evaluate(product)
        self.assertAllEqual(baseline_value, product_value)
def testAcceptsTensor(self):
    """scalar_mul on a dense tensor equals multiplying the tensor by 3."""
    ones = array_ops.ones([10, 10])
    scaled = math_ops.scalar_mul(3, ones)
    reference = array_ops.ones([10, 10]) * 3

    with self.test_session(use_gpu=True):
        want = reference.eval()
        got = scaled.eval()
        self.assertAllEqual(want, got)
def clip_norm(g, c, n):
    """Clip a tensor by norm.

    Arguments:
        g: gradient tensor to clip.
        c: clipping threshold.
        n: norm of gradient tensor.

    Returns:
        Clipped gradient tensor; `g` itself when `c` is not positive.
    """
    if not c > 0:
        return g

    needs_clipping = n >= c

    def _rescaled():
        return math_ops.scalar_mul(c / n, g)

    def _untouched():
        return g

    # saving the shape to avoid converting sparse tensor to dense
    if isinstance(g, ops.Tensor):
        saved_shape = copy.copy(g.get_shape())
    elif isinstance(g, ops.IndexedSlices):
        saved_shape = copy.copy(g.dense_shape)

    if needs_clipping.dtype != dtypes_module.bool:
        needs_clipping = math_ops.cast(needs_clipping, 'bool')
    clipped = control_flow_ops.cond(needs_clipping, _rescaled, _untouched)

    if isinstance(clipped, ops.Tensor):
        clipped.set_shape(saved_shape)
    elif isinstance(clipped, ops.IndexedSlices):
        clipped._dense_shape = saved_shape  # pylint: disable=protected-access
    return clipped
def testAcceptsRefs(self):
    """Multiplying a variable holding 10 by the scalar 3 evaluates to 30."""
    variable = variables.Variable(10)
    product = math_ops.scalar_mul(3, variable)
    initializer = variables.global_variables_initializer()

    with self.test_session(use_gpu=True) as session:
        session.run(initializer)
        self.assertEqual(30, product.eval())
def testAcceptsTensor(self):
    """Check scalar_mul(3, ones) against ones * 3 for a 10x10 tensor."""
    shape = [10, 10]
    product = math_ops.scalar_mul(3, array_ops.ones(shape))
    baseline = array_ops.ones(shape) * 3

    with self.test_session(use_gpu=True):
        baseline_value = baseline.eval()
        product_value = product.eval()
        self.assertAllEqual(baseline_value, product_value)
def testAcceptsIndexedSlices(self):
    """Scaling IndexedSlices by -3 multiplies values; indices are unchanged."""
    sparse_grad = ops.IndexedSlices(
        constant_op.constant([2, 3, 5, 7, 0, -1], shape=[3, 2]),
        constant_op.constant([0, 2, 5]))

    result = math_ops.scalar_mul(-3, sparse_grad)

    expected_values = [[-6, -9], [-15, -21], [0, 3]]
    expected_indices = [0, 2, 5]
    with self.test_session(use_gpu=True):
        self.assertAllEqual(result.values.eval(), expected_values)
        self.assertAllEqual(result.indices.eval(), expected_indices)
def testScalarMul(self):
    """Scaling IndexedSlices by -2 multiplies values; indices are unchanged."""
    expected_values = [[-4, -6], [-10, -14]]
    expected_indices = [0, 2]

    with self.test_session():
        sparse_grad = ops.IndexedSlices(
            constant_op.constant([2, 3, 5, 7], shape=[2, 2]),
            constant_op.constant([0, 2]))
        result = math_ops.scalar_mul(-2, sparse_grad)

        self.assertAllEqual(result.values.eval(), expected_values)
        self.assertAllEqual(result.indices.eval(), expected_indices)
def testAcceptsRefs(self):
    """Multiplying a variable holding 10 by the scalar 3 evaluates to 30."""
    variable = variables.Variable(10)
    product = math_ops.scalar_mul(3, variable)
    initializer = variables.initialize_all_variables()

    with self.test_session() as session:
        session.run(initializer)
        self.assertEqual(30, product.eval())
def clip_norm(g, c, n):
    """Clip a tensor by norm.

    Arguments:
        g: gradient tensor to clip.
        c: clipping threshold.
        n: norm of gradient tensor.

    Returns:
        Clipped gradient tensor. When `c` is not positive, `g` is returned
        unchanged.
    """
    if not c > 0:
        return g

    over_threshold = n >= c
    scale_down = lambda: math_ops.scalar_mul(c / n, g)
    keep_as_is = lambda: g

    # saving the shape to avoid converting sparse tensor to dense
    if isinstance(g, ops.Tensor):
        original_shape = copy.copy(g.get_shape())
    elif isinstance(g, ops.IndexedSlices):
        original_shape = copy.copy(g.dense_shape)

    if over_threshold.dtype != dtypes_module.bool:
        over_threshold = math_ops.cast(over_threshold, 'bool')
    result = control_flow_ops.cond(over_threshold, scale_down, keep_as_is)

    # Restore the shape that was recorded before the conditional.
    if isinstance(result, ops.Tensor):
        result.set_shape(original_shape)
    elif isinstance(result, ops.IndexedSlices):
        result._dense_shape = original_shape  # pylint: disable=protected-access
    return result
def get_larc_optimizer(opt_type, loss, global_step, learning_rate,
                       momentum=0., LARC_mode="clip", LARC_eta=0.002,
                       LARC_epsilon=1. / 16000.):
    """Build a training op whose gradients are rescaled per variable (LARC).

    For every variable with a gradient, a local rate
    `LARC_eta * ||v|| / ||g||` is computed (or `LARC_epsilon` when either
    norm is zero) and the gradient is multiplied by it before being applied.

    Reads the module-level names `horovod` (flag) and `hvd`.

    Args:
        opt_type: one of "Adam", "RMSProp" or "SGD".
        loss: loss tensor to differentiate.
        global_step: passed to `apply_gradients`.
        learning_rate: learning rate handed to the base optimizer.
        momentum: momentum for the "SGD" (MomentumOptimizer) case.
        LARC_mode: "scale" uses the local rate as is; any other value clips
            it to at most 1.0.
        LARC_eta: coefficient of the norm ratio.
        LARC_epsilon: local rate used when a norm is zero.

    Returns:
        The op returned by `apply_gradients`.

    Raises:
        ValueError: if `opt_type` is not supported.
    """
    #set up optimizers
    if opt_type == "Adam":
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif opt_type == "RMSProp":
        optim = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    elif opt_type == "SGD":
        optim = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=momentum)
    else:
        raise ValueError("Error, optimizer {} unsupported.".format(opt_type))

    #horovod wrapper
    if horovod:
        optim = hvd.DistributedOptimizer(optim)

    #compute gradients
    grads_and_vars = optim.compute_gradients(loss)
    for idx, (g, v) in enumerate(grads_and_vars):
        if g is None:
            continue

        if horovod:
            local_sum = tf.reduce_sum(tf.square(v))
            v_norm = tf.sqrt(hvd.allreduce(local_sum))
        else:
            v_norm = linalg_ops.norm(tensor=v, ord=2)
        g_norm = linalg_ops.norm(tensor=g, ord=2)

        larc_local_lr = control_flow_ops.cond(
            pred=math_ops.logical_and(
                math_ops.not_equal(v_norm, tf.constant(0.0)),
                math_ops.not_equal(g_norm, tf.constant(0.0))),
            true_fn=lambda: LARC_eta * v_norm / g_norm,
            false_fn=lambda: LARC_epsilon)

        if LARC_mode == "scale":
            effective_lr = larc_local_lr
        else:
            effective_lr = math_ops.minimum(larc_local_lr, 1.0)

        #multiply gradients
        grads_and_vars[idx] = (math_ops.scalar_mul(effective_lr, g), v)

    # Control dependencies only attach to ops created inside the context, so
    # the update op itself must be built here for the dependency on `loss`
    # to take effect.
    with tf.control_dependencies([loss]):
        grad_updates = optim.apply_gradients(grads_and_vars,
                                             global_step=global_step)
    return grad_updates
def testAcceptsRefs(self):
    """scalar_mul accepts a variable, in both eager and graph mode."""
    var = (resource_variable_ops.ResourceVariable(10, name="var")
           if context.executing_eagerly()
           else variables.Variable(10))
    tripled = math_ops.scalar_mul(3, var)
    init_op = variables.global_variables_initializer()

    with test_util.device(use_gpu=True):
        self.evaluate(init_op)
        self.assertEqual(30, self.evaluate(tripled))
def testAcceptsRefs(self):
    """Multiplying a variable holding 10 by the scalar 3 evaluates to 30."""
    # Eager execution uses a resource variable; graph mode a plain Variable.
    if not context.executing_eagerly():
        variable = variables.Variable(10)
    else:
        variable = resource_variable_ops.ResourceVariable(10, name="var")

    product = math_ops.scalar_mul(3, variable)
    initializer = variables.global_variables_initializer()
    with test_util.device(use_gpu=True):
        self.evaluate(initializer)
        value = self.evaluate(product)
        self.assertEqual(30, value)
def _SquaredDifferenceGrad(op, grad):
    """Returns the gradient for (x-y)^2.

    Args:
        op: the op whose first two inputs are `x` and `y`.
        grad: incoming gradient with respect to the op's output.

    Returns:
        A pair `(grad_x, grad_y)`, each reduced over the broadcast axes and
        reshaped to the shape of the matching input.
    """
    lhs, rhs = op.inputs[0], op.inputs[1]
    lhs_shape = array_ops.shape(lhs)
    rhs_shape = array_ops.shape(rhs)
    lhs_axes, rhs_axes = gen_array_ops.broadcast_gradient_args(
        lhs_shape, rhs_shape)

    with ops.control_dependencies([grad]):
        # The parens ensure that if grad is IndexedSlices, it'll get multiplied
        # by Tensor (not a number like 2.0) which causes it to convert to
        # Tensor.
        scaled = math_ops.scalar_mul(2.0, grad) * (lhs - rhs)

    lhs_grad = array_ops.reshape(
        math_ops.reduce_sum(scaled, lhs_axes), lhs_shape)
    rhs_grad = -array_ops.reshape(
        math_ops.reduce_sum(scaled, rhs_axes), rhs_shape)
    return (lhs_grad, rhs_grad)
def _SquaredDifferenceGrad(op, grad):
    """Returns the gradient for (x-y)^2.

    The shared term `2 * grad * (x - y)` is computed once. It is summed over
    each input's broadcast axes and reshaped to that input's shape, and the
    `y` component is negated.
    """
    first = op.inputs[0]
    second = op.inputs[1]
    first_shape = array_ops.shape(first)
    second_shape = array_ops.shape(second)
    first_reduce, second_reduce = gen_array_ops.broadcast_gradient_args(
        first_shape, second_shape)

    with ops.control_dependencies([grad]):
        # The parens ensure that if grad is IndexedSlices, it'll get multiplied
        # by Tensor (not a number like 2.0) which causes it to convert to
        # Tensor.
        shared_term = math_ops.scalar_mul(2.0, grad) * (first - second)

    def _unbroadcast(axes, shape):
        return array_ops.reshape(math_ops.reduce_sum(shared_term, axes), shape)

    first_grad = _unbroadcast(first_reduce, first_shape)
    second_grad = -_unbroadcast(second_reduce, second_shape)
    return (first_grad, second_grad)
def _matrix_update(self, A, h):
    """Apply the fast weight update rule of Ba et al. (2016).

    Computes `lam * A + eta * outer(h, h)` per batch entry, where `lam` and
    `eta` are `self._lam` and `self._eta`.

    Args:
        A: `3-D` tensor with shape `[batch_size x state_size x state_size]`,
            the fast weight matrix.
        h: `2-D` tensor with shape `[batch_size x state_size]`, the last
            network state.

    Returns:
        A `3-D` tensor with shape `[batch_size x state_size x state_size]`,
        i.e. the new fast weight matrix A.
    """
    with ops.name_scope("fast_weight_update"):
        # View h as a batch of row vectors: [batch, 1, num_units].
        h_row = tf.reshape(h, [-1, 1, self._num_units])

        decayed = math_ops.scalar_mul(self._lam, A)
        h_col = array_ops.transpose(h_row, [0, 2, 1])
        outer_product = math_ops.matmul(h_col, h_row)

        A = decayed + self._eta * outer_product
    return A
def __call__(self, inputs, state, scope=None):
    """Run the cell and average its inputs with its outputs.

    Args:
        inputs: cell inputs.
        state: cell state.
        scope: optional cell scope.

    Returns:
        Tuple of `0.5 * (inputs + outputs)` (mapped over the structure) and
        the new state.

    Raises:
        TypeError: If cell inputs and outputs have different structure (type).
        ValueError: If cell inputs and outputs have different structure
            (value).
    """
    cell_outputs, next_state = self._cell(inputs, state, scope=scope)
    nest.assert_same_structure(inputs, cell_outputs)

    def _check_shapes(inp, out):
        # Ensure shapes match
        inp.get_shape().assert_is_compatible_with(out.get_shape())

    def _average(inp, out):
        return math_ops.scalar_mul(0.5, inp + out)

    nest.map_structure(_check_shapes, inputs, cell_outputs)
    averaged = nest.map_structure(_average, inputs, cell_outputs)
    return averaged, next_state
def __init__(self, layers, weights=None, merge_fn=math_ops.add_n,
             name="merge"):
    """Merge the tensors of several layers into one tensor.

    Args:
        layers: list of at least two layers. The first layer supplies
            `n_units`, `shape` and `dtype` for the parent constructor.
        weights: optional per-layer scalars, one per layer. Each layer tensor
            is multiplied by its weight before merging.
        merge_fn: callable applied to the list of (weighted) tensors.
        name: name of the layer, also used as the name scope.

    Raises:
        Exception: if fewer than two layers are given, or if `weights` does
            not have one entry per layer.
    """
    if len(layers) < 2:
        raise Exception("Expecting a list of layers with len >= 2")

    has_weights = weights is not None
    if has_weights and len(weights) != len(layers):
        raise Exception("len(weights) must be equals to len(layers)")

    first = layers[0]
    super().__init__(layers, first.n_units, first.shape, first.dtype, name)

    with name_scope(name):
        if has_weights:
            tensors = []
            for idx, layer in enumerate(layers):
                tensors.append(math_ops.scalar_mul(weights[idx], layer.tensor))
        else:
            tensors = [layer.tensor for layer in layers]
        self.tensor = merge_fn(tensors)
def testAcceptsConstant(self):
    """scalar_mul accepts a constant tensor: 3 * 10 evaluates to 30."""
    ten = constant_op.constant(10)
    tripled = math_ops.scalar_mul(3, ten)

    with test_util.device(use_gpu=True):
        value = self.evaluate(tripled)
        self.assertEqual(30, value)
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  LARS_nu=None,
                  LARS_epsilon=1.0/16384.0,
                  loss_scale=1.0):
    """Given loss and parameters for optimizer, returns a training op.

    Various ways of passing optimizers include:

    - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
        for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
    - by function taking learning rate `Tensor` as argument and returning an
        `Optimizer` instance. E.g. `optimize_loss(...,
        optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
      Alternatively, if `learning_rate` is `None`, the function takes no
      arguments. E.g. `optimize_loss(..., learning_rate=None,
        optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
    - by a subclass of `Optimizer` having a single-argument constructor
        (the argument is the learning rate), such as AdamOptimizer or
        AdagradOptimizer. E.g. `optimize_loss(...,
        optimizer=tf.train.AdagradOptimizer)`.
    - by an instance of a subclass of `Optimizer`.
        E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

    Args:
      loss: Scalar `Tensor`.
      global_step: Scalar int `Tensor`, step counter to update on each step
                   unless `increment_global_step` is `False`. If not supplied,
                   it will be fetched from the default graph (see
                   `tf.train.get_global_step` for details). If it has
                   not been created, no step will be incremented with each
                   weight update. `learning_rate_decay_fn` requires
                   `global_step`.
      learning_rate: float or `Tensor`, magnitude of update per each training
                     step. Can be `None`.
      optimizer: string, class or optimizer instance, used as trainer.
                 string should be name of optimizer, like 'SGD',
                   'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES
                   constant.
                 class should be sub-class of `tf.Optimizer` that implements
                   `compute_gradients` and `apply_gradients` functions.
                 optimizer instance should be instantiation of `tf.Optimizer`
                   sub-class and have `compute_gradients` and
                   `apply_gradients` functions.
      gradient_noise_scale: float or None, adds 0-mean normal noise scaled by
                            this value.
      gradient_multipliers: dict of variables or variable names to floats.
                            If present, gradients for specified
                            variables will be multiplied by given constant.
      clip_gradients: float, callable or `None`. If float, is provided, a
                      global clipping is applied to prevent the norm of the
                      gradient to exceed this value. Alternatively, a callable
                      can be provided e.g.: adaptive_clipping. This callable
                      takes a `list` of `(gradients, variables)` `tuple`s and
                      returns the same thing with the gradients modified.
      learning_rate_decay_fn: function, takes `learning_rate` and
                              `global_step` `Tensor`s, returns `Tensor`.
                              Can be used to implement any learning rate decay
                              functions. For example:
                              `tf.train.exponential_decay`.
                              Ignored if `learning_rate` is not supplied.
      update_ops: list of update `Operation`s to execute at each step. If
                  `None`, uses elements of UPDATE_OPS collection. The order of
                  execution between `update_ops` and `loss` is
                  non-deterministic.
      variables: list of variables to optimize or
                 `None` to use all trainable variables.
      name: The name for this operation is used to scope operations and
            summaries.
      summaries: List of internal quantities to visualize on tensorboard. If
                 not set, defaults to the loss, the learning rate and the
                 global gradient norm. The complete list is in
                 OPTIMIZER_SUMMARIES.
      colocate_gradients_with_ops: If True, try colocating gradients with the
                                   corresponding op.
      increment_global_step: Whether to increment `global_step`. If your model
                             calls `optimize_loss` multiple times per training
                             step (e.g. to optimize different parts of the
                             model), use this arg to avoid incrementing
                             `global_step` more times than necessary.
      LARS_nu: If not None (and a float), LARS re-scaling will be applied
               https://arxiv.org/pdf/1708.03888.pdf with nu=LARS_nu.
               Variables without a gradient are left untouched.
      LARS_epsilon: If either weight or gradient norm is zero, this will be
                    returned as local LR.
      loss_scale: When not 1.0, gradients are computed for
                  `loss_scale * loss` and then passed to
                  `_multiply_gradients_const` with `1.0 / loss_scale`.

    Returns:
      Training op.

    Raises:
      ValueError: if:
          * `loss` is an invalid type or shape.
          * `global_step` is an invalid type or shape.
          * `learning_rate` is an invalid type or value.
          * `optimizer` has the wrong type.
          * `clip_gradients` is neither float nor callable.
          * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
            `global_step` is available.
          * `gradients` is empty.
    """
    loss = ops.convert_to_tensor(loss)
    contrib_framework.assert_scalar(loss)
    if global_step is None:
        global_step = contrib_framework.get_global_step()
    else:
        contrib_framework.assert_global_step(global_step)
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        # Make sure update ops are ran before computing loss.
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        # Learning rate variable, with possible decay.
        lr = None
        if learning_rate is not None:
            if (isinstance(learning_rate, ops.Tensor) and
                    learning_rate.get_shape().ndims == 0):
                lr = learning_rate
            elif isinstance(learning_rate, float):
                if learning_rate < 0.0:
                    # The value must be %-formatted into the message; passing
                    # it as a second exception argument leaves "%s" unfilled.
                    raise ValueError(
                        "Invalid learning_rate %s." % learning_rate)
                lr = vs.get_variable(
                    "learning_rate", [],
                    trainable=False,
                    initializer=init_ops.constant_initializer(learning_rate))
            else:
                raise ValueError("Learning rate should be 0d Tensor or float. "
                                 "Got %s of type %s" % (str(learning_rate),
                                                        str(type(learning_rate))))
        if summaries is None:
            summaries = ["loss", "learning_rate", "global_gradient_norm"]
        else:
            for summ in summaries:
                if summ not in OPTIMIZER_SUMMARIES:
                    raise ValueError("Summaries should be one of [%s], you provided %s."
                                     % (", ".join(OPTIMIZER_SUMMARIES), summ))
        if learning_rate is not None and learning_rate_decay_fn is not None:
            if global_step is None:
                raise ValueError("global_step is required for learning_rate_decay_fn.")
            lr = learning_rate_decay_fn(lr, global_step)
            if "learning_rate" in summaries:
                summary.scalar("learning_rate", lr)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError("Learning rate is None, but should be specified if "
                                 "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
        elif (isinstance(optimizer, type) and
              issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError("Learning rate is None, but should be specified if "
                                 "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt = optimizer
        elif callable(optimizer):
            if learning_rate is not None:
                opt = optimizer(lr)
            else:
                opt = optimizer()
            if not isinstance(opt, optimizer_.Optimizer):
                raise ValueError("Unrecognized optimizer: function should return "
                                 "subclass of Optimizer. Got %s." % str(opt))
        else:
            raise ValueError("Unrecognized optimizer: should be string, "
                             "subclass of Optimizer, instance of "
                             "subclass of Optimizer or function with one argument. "
                             "Got %s." % str(optimizer))

        # All trainable variables, if specific variables are not specified.
        if variables is None:
            variables = vars_.trainable_variables()

        # Compute gradients.
        gradients = opt.compute_gradients(
            loss if loss_scale==1.0 else loss_scale*loss,
            variables,
            colocate_gradients_with_ops=colocate_gradients_with_ops)

        if loss_scale!=1.0:
            gradients = _multiply_gradients_const(gradients, 1.0 / loss_scale)

        # LARS gradient re-scaling
        if LARS_nu is not None and isinstance(LARS_nu, float):
            for idx, (g, v) in enumerate(gradients):
                # compute_gradients yields None for variables the loss does
                # not depend on; there is nothing to rescale for those.
                if g is None:
                    continue
                v_norm = linalg_ops.norm(tensor=v, ord=2)
                g_norm = linalg_ops.norm(tensor=g, ord=2)
                lars_local_lr = control_flow_ops.cond(
                    pred = math_ops.logical_and(
                        math_ops.not_equal(v_norm, array_ops.constant(0.0)),
                        math_ops.not_equal(g_norm, array_ops.constant(0.0))),
                    true_fn = lambda: LARS_nu * v_norm / g_norm,
                    false_fn = lambda: LARS_epsilon)
                gradients[idx] = (math_ops.scalar_mul(lars_local_lr, g), v)

        # Optionally add gradient noise.
        if gradient_noise_scale is not None:
            gradients = _add_scaled_noise_to_gradients(gradients,
                                                       gradient_noise_scale)

        # Multiply some gradients.
        if gradient_multipliers is not None:
            gradients = _multiply_gradients(gradients, gradient_multipliers)
            if not gradients:
                raise ValueError(
                    "Empty list of (gradient, var) pairs encountered. This is most "
                    "likely to be caused by an improper value of gradient_multipliers.")

        if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
            summary.scalar("global_norm/gradient_norm",
                           clip_ops.global_norm(list(zip(*gradients))[0]))

        # Optionally clip gradients by global norm.
        if isinstance(clip_gradients, float):
            gradients = _clip_gradients_by_norm(gradients, clip_gradients)
        elif callable(clip_gradients):
            gradients = clip_gradients(gradients)
        elif clip_gradients is not None:
            raise ValueError(
                "Unknown type %s for clip_gradients" % type(clip_gradients))

        # Add scalar summary for loss.
        if "loss" in summaries:
            summary.scalar("loss", loss)

        # Add histograms for variables, gradients and gradient norms.
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if grad_values is not None:
                var_name = variable.name.replace(":", "_")
                if "gradients" in summaries:
                    summary.histogram("gradients/%s" % var_name, grad_values)
                if "gradient_norm" in summaries:
                    summary.scalar("gradient_norm/%s" % var_name,
                                   clip_ops.global_norm([grad_values]))

        if clip_gradients is not None and ("global_gradient_norm" in summaries or
                                           "gradient_norm" in summaries):
            summary.scalar("global_norm/clipped_gradient_norm",
                           clip_ops.global_norm(list(zip(*gradients))[0]))

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # Ensure the train_tensor computes grad_updates.
        train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

        return train_tensor
def norm(self, x):
    """Scale `x` by the reciprocal of the sum of all its elements.

    No guard is applied for a zero sum.

    Args:
        x: tensor to scale.

    Returns:
        `x` multiplied by `1 / reduce_sum(x)`.
    """
    total = math_ops.reduce_sum(x)
    inverse_total = 1 / total
    return math_ops.scalar_mul(inverse_total, x)
def testAcceptsConstant(self):
    """scalar_mul accepts a constant tensor: 3 * 10 evaluates to 30."""
    ten = constant_op.constant(10)
    tripled = math_ops.scalar_mul(3, ten)

    with self.test_session(use_gpu=True):
        value = tripled.eval()
        self.assertEqual(30, value)
def testAcceptsConstant(self):
    """Multiplying the constant 10 by the scalar 3 yields 30."""
    product = math_ops.scalar_mul(3, constant_op.constant(10))
    expected = 30

    with test_util.device(use_gpu=True):
        self.assertEqual(expected, self.evaluate(product))
def get_larc_optimizer(optimizer, loss, global_step, steps_per_epoch,
                       use_horovod):
    """Build a LARC training op from an optimizer configuration.

    Settings are read from `optimizer` with `get_dict_default`; the learning
    rate comes from `get_learning_rate`. For each variable with a gradient,
    the gradient is optionally replaced by a lagged copy, optionally summed
    across Horovod ranks, and rescaled by a local rate derived from
    `LARC_eta * ||v|| / ||g||`.

    Args:
        optimizer: configuration object. Keys read: "LARC_mode", "LARC_eta",
            "LARC_epsilon", "gradient_lag", "opt_type", and depending on
            the type "beta1", "beta2" or "momentum".
        loss: loss tensor to differentiate.
        global_step: passed to `get_learning_rate` and `apply_gradients`.
        steps_per_epoch: passed to `get_learning_rate`.
        use_horovod: when truthy and `hvd.size() > 1`, gradients are
            allreduced here instead of via the Horovod optimizer wrapper.

    Returns:
        Tuple `(train_op, learning_rate)`.

    Raises:
        ValueError: if "opt_type" is not a supported optimizer name.
    """
    #get learning rate
    learning_rate = get_learning_rate(optimizer, global_step, steps_per_epoch)

    #get LARC stuff
    LARC_mode = get_dict_default(optimizer, "LARC_mode", "clip")
    LARC_eta = get_dict_default(optimizer, "LARC_eta", 0.002)
    LARC_epsilon = get_dict_default(optimizer, "LARC_epsilon", 1. / 16000.)

    #lag
    gradient_lag = get_dict_default(optimizer, "gradient_lag", 0)

    #set up optimizers
    opt_type = get_dict_default(optimizer, "opt_type", "LARC-Adam")
    if opt_type == "LARC-Adam":
        beta1 = get_dict_default(optimizer, "beta1", 0.9)
        beta2 = get_dict_default(optimizer, "beta2", 0.999)
        # Forward the configured betas; they were previously read but never
        # used. The fallbacks equal AdamOptimizer's own defaults.
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=beta1, beta2=beta2)
    elif opt_type == "LARC-RMSProp":
        optim = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    elif opt_type == "LARC-SGD":
        momentum = get_dict_default(optimizer, "momentum", 0.)
        optim = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=momentum)
    else:
        raise ValueError("Error, optimizer {} unsupported.".format(opt_type))

    # instead of using the horovod wrapper, we do the allreduce ourselves below

    #compute gradients
    grads_and_vars = optim.compute_gradients(loss)
    lag_ops = []
    for idx, (g, v) in enumerate(grads_and_vars):
        if g is None:
            continue

        if gradient_lag > 0:
            # Apply the gradient stored in the lag variable this step and
            # keep the fresh gradient to store afterwards.
            g_lag = tf.Variable(initial_value=tf.zeros(g.shape, g.dtype),
                                trainable=False,
                                name=v.name.replace(":", "_") + '_lag')
            g_next = g
            g = g_lag

        if use_horovod and (hvd.size() > 1):
            # if we ask for an average, it does a scalar divide, but
            # we can bake that into the scaling below
            g = hvd.allreduce(g, average=False)
            g_scale = 1. / hvd.size()
        else:
            g_scale = 1

        v_norm = linalg_ops.norm(tensor=v, ord=2)
        g_norm = linalg_ops.norm(tensor=g, ord=2)

        larc_local_lr = control_flow_ops.cond(
            pred=math_ops.logical_and(
                math_ops.not_equal(v_norm, tf.constant(0.0)),
                math_ops.not_equal(g_norm, tf.constant(0.0))),
            true_fn=lambda: (LARC_eta / g_scale) * v_norm / g_norm,
            false_fn=lambda: LARC_epsilon)

        if LARC_mode == "scale":
            effective_lr = larc_local_lr
        else:
            # we need to see which LR to take and then divide out the LR
            # because otherwise it will be multiplied in again when we apply
            # the gradients
            effective_lr = (math_ops.minimum(larc_local_lr, learning_rate)
                            / learning_rate)

        # rescale gradients
        effective_lr *= g_scale

        #multiply gradients
        g_scaled = math_ops.scalar_mul(effective_lr, g)
        grads_and_vars[idx] = (g_scaled, v)

        if gradient_lag > 0:
            # once we've computed g_scaled, it's safe to overwrite g_lag
            with tf.control_dependencies([g_scaled]):
                lag_ops.append(g_lag.assign(g_next))

    #apply gradients, making sure to complete the forward pass first
    with tf.control_dependencies([loss]):
        grad_updates = optim.apply_gradients(grads_and_vars,
                                             global_step=global_step)
    if gradient_lag > 0:
        grad_updates = tf.group([grad_updates] + lag_ops)

    return grad_updates, learning_rate
def testAcceptsConstant(self):
    """Multiplying the constant 10 by the scalar 3 yields 30."""
    product = math_ops.scalar_mul(3, constant_op.constant(10))
    expected = 30

    with self.test_session(use_gpu=True):
        self.assertEqual(expected, product.eval())
def gradients_with_scaling(ys,
                           xs,
                           grad_ys=None,
                           name="gradients",
                           colocate_gradients_with_ops=False,
                           gate_gradients=False,
                           aggregation_method=None,
                           stop_gradients=None,
                           unconnected_gradients=UnconnectedGradients.NONE):
    """Compute gradients with loss scaling taken from the current config.

    Without an active config from `_current_mp_config()` (or with empty `ys`
    or variant-dtype `ys`) this is a plain call to `gradients`. Otherwise
    each `y` is multiplied by the scale before differentiation and every
    gradient is multiplied by `1 / scale` afterwards. With an 'auto' config
    entry the gradients are also checked, the loss-scale update op is run,
    and gradients are replaced by zeros when a NaN or an infinite maximum is
    found.

    Args:
        ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients,
        aggregation_method, stop_gradients, unconnected_gradients: forwarded
            positionally to `gradients`.

    Returns:
        List of gradients, one per `x`; entries may be None.
    """
    ys = _AsList(ys)
    mp_config = _current_mp_config()

    # No scaling configured (or nothing that can be scaled): plain gradients.
    if not mp_config or len(ys) == 0 or ys[0].dtype == dtypes.variant:
        grads = gradients(ys, xs, grad_ys, name, colocate_gradients_with_ops,
                          gate_gradients, aggregation_method, stop_gradients,
                          unconnected_gradients)
        return grads

    scale = 1.0
    if mp_config.get('auto'):
        scale = mp_config['auto'].loss_scale
    elif mp_config.get('constant'):
        scale = mp_config['constant']

    needs_scaling = isinstance(scale, ops.Tensor) or scale != 1.0

    # Bound up front so the auto-scaling branch below can rely on it even
    # when no scaling ops are created.
    gradient_uid = None

    if needs_scaling:
        with ops.name_scope(name, "gradients"):
            gradient_uid = ops.get_default_graph().unique_name(
                "uid", mark_as_used=False)
            scaled_ys = []
            scale_ts = ops.convert_to_tensor(scale)
            for y in ys:
                with _maybe_colocate_with(y.op, gradient_uid,
                                          colocate_gradients_with_ops):
                    y = math_ops.scalar_mul(
                        math_ops.cast(scale_ts, dtype=y.dtype), y)
                    scaled_ys.append(y)
            ys = scaled_ys

    grads_scaled = gradients(ys, xs, grad_ys, name,
                             colocate_gradients_with_ops, gate_gradients,
                             aggregation_method, stop_gradients,
                             unconnected_gradients)

    if needs_scaling:
        with ops.name_scope(name, "gradients"):
            unscale = 1.0 / scale
            unscale_ts = ops.convert_to_tensor(unscale)
            grads = []
            for grad in grads_scaled:
                if grad is not None:
                    with _maybe_colocate_with(grad.op, gradient_uid,
                                              colocate_gradients_with_ops):
                        grad = math_ops.scalar_mul(
                            math_ops.cast(unscale_ts, dtype=grad.dtype), grad)
                grads.append(grad)
    else:
        grads = grads_scaled

    # if auto scaling: check nan and inf
    if mp_config.get('auto'):
        if gradient_uid is None:
            # Scale was a Python 1.0, so the uid was never created above.
            with ops.name_scope(name, "gradients"):
                gradient_uid = ops.get_default_graph().unique_name(
                    "uid", mark_as_used=False)

        # check the grads
        grad_has_nans, grad_amax = AutomaticLossScaler.check_grads(grads)
        # the gradients will be ignored in the following two cases:
        # 1) there is Nan in the gradients;
        # 2) the maximum value is infinity
        should_skip_update = math_ops.logical_or(math_ops.is_inf(grad_amax),
                                                 grad_has_nans)
        loss_scale_update_op = mp_config['auto'].update_op(grad_has_nans,
                                                           grad_amax)
        grads_update = []
        with ops.control_dependencies([loss_scale_update_op]):
            for grad in grads:
                if grad is not None:
                    with _maybe_colocate_with(grad.op, gradient_uid,
                                              colocate_gradients_with_ops):
                        grad_zero = _zero_grad(grad)
                        grad = control_flow_ops.cond(should_skip_update,
                                                     lambda: grad_zero,
                                                     lambda: grad)
                grads_update.append(grad)
        return grads_update

    return grads