def testClipByNormClipped(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
        x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        # Norm of x = sqrt(3^2 + 4^2) = 5
        np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
        clip_norm = 4.0
        ans = clip_ops.clip_by_norm(x, clip_norm)
        tf_ans = self.evaluate(ans)
        # Also pass the clip norm as a 0-D tensor.
        clip_tensor = constant_op.constant(4.0)
        ans = clip_ops.clip_by_norm(x, clip_tensor)
        tf_ans_tensor = self.evaluate(ans)

        self.assertAllClose(np_ans, tf_ans)
        self.assertAllClose(np_ans, tf_ans_tensor)
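# Illustrative sketch (not part of the test above): the expected values follow
# from the clip_by_norm formula t * clip_norm / max(l2norm(t), clip_norm).
# Uses only NumPy; the numbers mirror the test case.
import numpy as np

x = np.array([[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]])
clip_norm = 4.0
l2norm = np.sqrt(np.sum(x * x))               # 5.0
scale = clip_norm / max(l2norm, clip_norm)    # 0.8, so the whole tensor is scaled down
print(x * scale)                              # [[-2.4, 0., 0.], [3.2, 0., 0.]]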
def testClipByNormBadShape(self):
    with self.test_session(use_gpu=True):
        x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
        # Use a nonsensical shape.
        clip = constant_op.constant([1.0, 2.0])
        with self.assertRaises(ValueError):
            _ = clip_ops.clip_by_norm(x, clip)
def _testClipByNorm(self, inputs, max_norm, expected):
    with self.test_session() as sess:
        input_op = constant_op.constant(inputs)
        clipped = clip_ops.clip_by_norm(input_op, max_norm)
        check_op = numerics.add_check_numerics_ops()
        result, _ = sess.run([clipped, check_op])
        self.assertAllClose(result, expected)
def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Arguments:
        loss: Loss tensor.
        params: List of variables.

    Returns:
        List of gradient tensors.

    Raises:
        ValueError: In case any gradient cannot be computed (e.g. if gradient
            function not implemented).
    """
    params = nest.flatten(params)
    with backend.get_graph().as_default():
        grads = gradients.gradients(loss, params)
    for grad, param in zip(grads, params):
        if grad is None:
            raise ValueError("Variable {} has `None` for gradient. "
                             "Please make sure that all of your ops have a "
                             "gradient defined (i.e. are differentiable). "
                             "Common ops without gradient: "
                             "K.argmax, K.round, K.eval.".format(param))
    if hasattr(self, "clipnorm"):
        grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
        grads = [
            clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
            for g in grads
        ]
    return grads
def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Arguments:
        loss: Loss tensor.
        params: List of variables.

    Returns:
        List of gradient tensors.

    Raises:
        ValueError: In case any gradient cannot be computed (e.g. if gradient
            function not implemented).
    """
    loss = self._scale_loss(loss)
    grads = gradients.gradients(loss, params)
    if None in grads:
        raise ValueError("An operation has `None` for gradient. "
                         "Please make sure that all of your ops have a "
                         "gradient defined (i.e. are differentiable). "
                         "Common ops without gradient: "
                         "K.argmax, K.round, K.eval.")
    if hasattr(self, "clipnorm"):
        grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
        grads = [
            clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
            for g in grads
        ]
    return grads
def _clip_dense(self, var):
    with self._maybe_colocate_with(var):
        updated_var_value = array_ops.identity(var.ref())
        normalized_var = clip_ops.clip_by_norm(
            updated_var_value, self._max_norm, self._vars_to_clip_dims[var])
        delta = updated_var_value - normalized_var
    with ops.colocate_with(var):
        return var.assign_sub(delta, use_locking=self._use_locking)
def maybe_normalize(x):
    if max_norm is not None:
        if x.get_shape().ndims is not None:
            ndims = x.get_shape().ndims
        else:
            ndims = array_ops.size(array_ops.shape(x))
        return clip_ops.clip_by_norm(x, max_norm, axes=list(range(1, ndims)))
    return x
def _testClipIndexedSlicesByNorm(self, values, indices, shape, max_norm, axes):
    with self.cached_session() as sess:
        values = constant_op.constant(values)
        indices = constant_op.constant(indices)
        shape = constant_op.constant(shape)

        # IndexedSlices mode
        indexed_slices = ops.IndexedSlices(values, indices, shape)
        clipped = clip_ops.clip_by_norm(indexed_slices, max_norm, axes)
        # clipped should be IndexedSlices
        self.assertIsInstance(clipped, ops.IndexedSlices)
        clipped = ops.convert_to_tensor(clipped)

        # Tensor mode
        dense_tensor = ops.convert_to_tensor(indexed_slices)
        dense_clipped = clip_ops.clip_by_norm(dense_tensor, max_norm, axes)

        result, expected = sess.run([clipped, dense_clipped])
        self.assertAllClose(result, expected)
def testClipByNormNotClipped(self):
    # No norm clipping when clip_norm >= 5
    with self.test_session():
        x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        # Norm of x = sqrt(3^2 + 4^2) = 5
        np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
        clip_norm = 6.0
        ans = clip_ops.clip_by_norm(x, clip_norm)
        tf_ans = ans.eval()
        self.assertAllClose(np_ans, tf_ans)
def testClipByNormClippedWithDim1(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
        x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
        # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
        np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
        clip_norm = 4.0
        ans = clip_ops.clip_by_norm(x, clip_norm, [1])
        tf_ans = self.evaluate(ans)
        self.assertAllClose(np_ans, tf_ans)
def testClipByNormZero(self):
    # No norm clipping when norm = 0
    with self.test_session(use_gpu=True):
        x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
        # Norm = 0, no changes
        np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
        clip_norm = 6.0
        ans = clip_ops.clip_by_norm(x, clip_norm)
        tf_ans = ans.eval()
        self.assertAllClose(np_ans, tf_ans)
def clip_gradient_norms(gradients_to_variables, max_norm):
    """Clips each gradient to a maximum L2-norm.

    Args:
        gradients_to_variables: A list of gradient to variable pairs (tuples).
        max_norm: the maximum norm value.

    Returns:
        A list of clipped gradient to variable pairs.
    """
    clipped_grads_and_vars = []
    for grad, var in gradients_to_variables:
        if grad is not None:
            if isinstance(grad, ops.IndexedSlices):
                tmp = clip_ops.clip_by_norm(grad.values, max_norm)
                grad = ops.IndexedSlices(tmp, grad.indices, grad.dense_shape)
            else:
                grad = clip_ops.clip_by_norm(grad, max_norm)
        clipped_grads_and_vars.append((grad, var))
    return clipped_grads_and_vars
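# Hedged usage sketch for clip_gradient_norms above; the gradient values and
# max_norm are illustrative, and the call assumes the surrounding module's
# imports (ops, clip_ops). Each (gradient, variable) pair is clipped
# independently, unlike clip_by_global_norm.
import tensorflow as tf

var = tf.Variable([0.0, 0.0])
grad = tf.constant([3.0, 4.0])                 # norm 5
clipped = clip_gradient_norms([(grad, var)], max_norm=4.0)
print(clipped[0][0].numpy())                   # [2.4, 3.2], norm clipped to 4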
def testClipByNormNotClippedWithAxes(self):
    # No norm clipping when clip_norm >= 5
    with self.test_session(use_gpu=True):
        x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
        # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
        np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
        clip_norm = 6.0
        ans = clip_ops.clip_by_norm(x, clip_norm, [1])
        tf_ans = ans.eval()
        self.assertAllClose(np_ans, tf_ans)
def testClipByNormClippedWithDim0(self):
    # Norm clipping when clip_norm < 5
    with self.test_session(use_gpu=True):
        x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
        # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
        np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
        clip_norm = 4.0
        ans = clip_ops.clip_by_norm(x, clip_norm, [0])
        tf_ans = ans.eval()
        self.assertAllClose(np_ans, tf_ans)
def _compute_gradients(self, loss, var_list, grad_loss=None):
    """Compute gradients of `loss` for the variables in `var_list`.

    This is the first part of `minimize()`. It returns a list of
    (gradient, variable) pairs where "gradient" is the gradient for "variable".
    Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if
    there is no gradient for the given variable.

    Args:
        loss: A callable taking no arguments which returns the value to minimize.
        var_list: list or tuple of `Variable` objects to update to minimize
            `loss`, or a callable returning the list or tuple of `Variable`
            objects. Use callable when the variable list would otherwise be
            incomplete before `minimize` and the variables are created at the
            first time when `loss` is called.
        grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.

    Returns:
        A list of (gradient, variable) pairs. Variable is always present, but
        gradient can be `None`.

    Raises:
        TypeError: If `var_list` contains anything else than `Variable` objects.
        ValueError: If some arguments are invalid, or var_list is None.
    """
    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
    with backprop.GradientTape() as tape:
        if not callable(var_list):
            tape.watch(var_list)
        loss_value = loss()
    if callable(var_list):
        var_list = var_list()
    var_list = nest.flatten(var_list)
    grads = tape.gradient(loss_value, var_list, grad_loss)

    if hasattr(self, "clipnorm"):
        grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
        grads = [
            clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
            for g in grads
        ]

    grads_and_vars = list(zip(grads, var_list))
    self._assert_valid_dtypes([
        v for g, v in grads_and_vars
        if g is not None and v.dtype != dtypes.resource
    ])

    return grads_and_vars
def testClipByAverageNormReplacedWithClipByNorm(self):
    # Check clip_by_average_norm(t) is the same as
    # clip_by_norm(t, clip_norm * tf.to_float(tf.size(t)))
    with self.session(use_gpu=True):
        x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
        # Expected answer [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
        clip_norm = constant_op.constant(0.8)
        with_norm = clip_ops.clip_by_average_norm(x, clip_norm)
        without_norm = clip_ops.clip_by_norm(
            x, clip_norm * math_ops.to_float(array_ops.size(x)))
        clip_by_average_norm_ans = self.evaluate(with_norm)
        clip_by_norm_ans = self.evaluate(without_norm)
        self.assertAllClose(clip_by_average_norm_ans, clip_by_norm_ans)
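# Illustrative arithmetic for the equivalence checked above (sketch only, NumPy):
# clip_by_average_norm scales by clip_norm / max(avg_norm, clip_norm), which
# matches clip_by_norm with the threshold multiplied by the element count.
import numpy as np

x = np.array([[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]])
clip_norm = 0.8
avg_norm = np.sqrt(np.sum(x * x)) / x.size     # 5 / 6 = 0.8333...
scale = clip_norm / max(avg_norm, clip_norm)   # 0.96
print(x * scale)                               # [[-2.88, 0., 0.], [3.84, 0., 0.]]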
def maybe_normalize(x):
    """Normalizes the embeddings in x if max_norm is not None."""
    if max_norm is None:
        return x
    static = True
    ids_rank = ops.convert_to_tensor(ids).get_shape().ndims
    if ids_rank is None:
        ids_rank = array_ops.rank(ids)
        static = False
    x_rank = x.get_shape().ndims
    if x_rank is None:
        x_rank = array_ops.rank(x)
        static = False
    return clip_ops.clip_by_norm(
        x, max_norm,
        axes=list(range(ids_rank, x_rank)) if static
        else math_ops.range(ids_rank, x_rank))
def _clip_sparse(self, grad, var):
    assert isinstance(grad, ops.IndexedSlices)
    clip_dims = self._vars_to_clip_dims[var]
    if 0 in clip_dims:
        logging.warning("Clipping norm across dims %s for %s is inefficient "
                        "when including sparse dimension 0.",
                        clip_dims, var.op.name)
        return self._clip_dense(var)

    with ops.colocate_with(var):
        var_subset = array_ops.gather(var.ref(), grad.indices)
    with self._maybe_colocate_with(var):
        normalized_var_subset = clip_ops.clip_by_norm(
            var_subset, self._max_norm, clip_dims)
        delta = ops.IndexedSlices(
            var_subset - normalized_var_subset, grad.indices, grad.dense_shape)
    with ops.colocate_with(var):
        return var.scatter_sub(delta, use_locking=self._use_locking)
def _clip_gradients(self, grads):
    """Clip gradients according to the clipnorm and clipvalue attributes."""
    if self.clipnorm is not None:
        if distribute_ctx.has_strategy():
            raise ValueError("Gradient clipping in the optimizer "
                             "(by setting clipnorm or clipvalue) is currently "
                             "unsupported when using a distribution strategy.")
        grads = [
            None if g is None else clip_ops.clip_by_norm(g, self.clipnorm)
            for g in grads
        ]
    if self.clipvalue is not None:
        if distribute_ctx.has_strategy():
            raise ValueError("Gradient clipping in the optimizer "
                             "(by setting clipnorm or clipvalue) is currently "
                             "unsupported when using a distribution strategy.")
        v = self.clipvalue
        grads = [
            None if g is None else clip_ops.clip_by_value(g, -v, v)
            for g in grads
        ]
    return grads
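# Hedged usage note for _clip_gradients above: in the public Keras API the same
# per-gradient clipping is requested through the clipnorm / clipvalue
# constructor arguments, which the optimizer applies via clip_by_norm and
# clip_by_value. Sketch only; the model and values are illustrative.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
opt = tf.keras.optimizers.SGD(learning_rate=0.01, clipnorm=1.0)  # or clipvalue=0.5
model.compile(optimizer=opt, loss="mse")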
def _clip(params, ids, max_norm):

    def _rank(x):
        rank = ops.convert_to_tensor(x).get_shape().ndims
        if rank:
            return rank, True
        else:
            return array_ops.rank(x), False

    if max_norm is None:
        return params
    ids_rank, ids_static = _rank(ids)
    params_rank, params_static = _rank(params)
    return clip_ops.clip_by_norm(
        params,
        max_norm,
        axes=(list(range(ids_rank, params_rank))
              if ids_static and params_static
              else math_ops.range(ids_rank, params_rank)),
    )
def _clip(params, ids, max_norm):
    """Helper function for _embedding_lookup_and_transform.

    This function optionally clips embeddings to an l2-norm of max_norm.

    Args:
        params: A `Tensor` of embeddings retrieved by `gather`.
        ids: The `ids` argument that was passed to `gather`.
        max_norm: If provided, the embeddings are l2-normalized to the value of
            max_norm.

    Returns:
        A `Tensor` with the same type as `params`.
    """

    def _rank(x):
        """Helper function to retrieve the rank of a tensor.

        Args:
            x: Something convertible to `Tensor`.

        Returns:
            Either a pair `(rank, True)` where `rank` is an integer or a pair
            `(rank, False)` where `rank` is an integer `Tensor`. In either case,
            `rank` is the rank of `x`.
        """
        rank = ops.convert_to_tensor(x).get_shape().ndims
        if rank:
            return rank, True
        else:
            return array_ops.rank(x), False

    if max_norm is None:
        return params
    ids_rank, ids_static = _rank(ids)
    params_rank, params_static = _rank(params)
    return clip_ops.clip_by_norm(
        params,
        max_norm,
        axes=(list(range(ids_rank, params_rank))
              if ids_static and params_static
              else math_ops.range(ids_rank, params_rank)))
def _clip(params, ids, max_norm):
    """Helper function for _embedding_lookup_and_transform.

    This function optionally clips embeddings to an l2-norm of max_norm.

    Args:
        params: A `Tensor` of embeddings retrieved by `gather`.
        ids: The `ids` argument that was passed to `gather`.
        max_norm: If not `None`, each embedding is clipped if its l2-norm is
            larger than this value.

    Returns:
        A `Tensor` with the same type as `params`.
    """

    def _rank(x):
        """Helper function to retrieve the rank of a tensor.

        Args:
            x: Something convertible to `Tensor`.

        Returns:
            Either a pair `(rank, True)` where `rank` is an integer or a pair
            `(rank, False)` where `rank` is an integer `Tensor`. In either case,
            `rank` is the rank of `x`.
        """
        rank = ops.convert_to_tensor(x).get_shape().ndims
        if rank:
            return rank, True
        else:
            return array_ops.rank(x), False

    if max_norm is None:
        return params
    ids_rank, ids_static = _rank(ids)
    params_rank, params_static = _rank(params)
    return clip_ops.clip_by_norm(
        params,
        max_norm,
        axes=(list(range(ids_rank, params_rank))
              if ids_static and params_static
              else math_ops.range(ids_rank, params_rank)))
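# Hedged sketch of the axes computation in _clip above: with ids of rank 1 and
# gathered embeddings of rank 2, axes becomes [1], so each embedding row is
# clipped to max_norm independently. Values here are illustrative.
import tensorflow as tf

embeddings = tf.constant([[3.0, 4.0], [0.3, 0.4]])   # rows gathered by rank-1 ids
clipped = tf.clip_by_norm(embeddings, 1.0, axes=[1])
print(clipped.numpy())   # first row rescaled to [0.6, 0.8]; second row unchanged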
def get_gradients_for_keras(optimizer, loss, params):
    from tensorflow.python.util import nest
    from tensorflow.python.keras import backend
    from tensorflow.python.ops import gradients
    from tensorflow.python.ops import clip_ops
    from tensorflow.python.keras.optimizers import TFOptimizer

    params = nest.flatten(params)
    if isinstance(optimizer, TFOptimizer):
        scope_name = optimizer.optimizer._name
    else:
        scope_name = optimizer._name

    with backend.get_graph().as_default(), backend.name_scope(scope_name +
                                                              "/gradients"):
        grads = gradients.gradients(loss, params)

        all_reduced_grads = []
        for grad, param in zip(grads, params):
            if grad is None:
                raise ValueError("Variable {} has `None` for gradient. "
                                 "Please make sure that all of your ops have a "
                                 "gradient defined (i.e. are differentiable). "
                                 "Common ops without gradient: "
                                 "K.argmax, K.round, K.eval.".format(param))
            # `process_grad` and `tf` are expected to be available at module level.
            grad = process_grad(grad)
            with tf.control_dependencies([param]):
                grad_i = tf.identity(grad, name="zoo_identity_op_for_grad")
            all_reduced_grads.append(grad_i)
        grads = all_reduced_grads

        if hasattr(optimizer, "clipnorm"):
            grads = [clip_ops.clip_by_norm(g, optimizer.clipnorm) for g in grads]
        if hasattr(optimizer, "clipvalue"):
            grads = [
                clip_ops.clip_by_value(g, -optimizer.clipvalue, optimizer.clipvalue)
                for g in grads
            ]
    return grads
def train_one_sample(self, space_sample, reward):
    self.reset()
    with tf.GradientTape() as tape:
        self.reset()
        self.calc_log_prob(space_sample)
        if self.entropy_weight is not None:
            self.reward += self.entropy_weight * self.entropy
        self.baseline = self.baseline * self.baseline_decay + reward * (
            1 - self.baseline_decay)
        loss = self.log_prob * (reward - self.baseline)
        print(f'Reward: {reward}, Loss: {loss}')
        # loss += skip_weight * self.sample_skip_penalty
    grads = tape.gradient(loss, self.trainable_variables)
    if hasattr(self, "clipnorm"):
        grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
        grads = [
            clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
            for g in grads
        ]
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    return loss
def _gather_and_clip(params, ids, max_norm, name=None):
    """Helper function for _embedding_lookup_and_transform.

    This function gathers embeddings from a single tensor. The gather deals
    with resource variables specially. The embeddings are clipped to an l2-norm
    of max_norm if provided.

    Args:
        params: A `Tensor` of embeddings.
        ids: A `Tensor` indexing the embeddings to be retrieved from `params`.
        max_norm: If provided, embedding values are l2-normalized to the value
            of max_norm.
        name: A name for the operation (optional).

    Returns:
        A `Tensor` with the same type as `params`.
    """
    if isinstance(params, resource_variable_ops.ResourceVariable):
        embs = params.sparse_read(ids, name=name)
    else:
        embs = array_ops.gather(params, ids, name=name)
    if max_norm is None:
        return embs
    static = True
    ids_rank = ops.convert_to_tensor(ids).get_shape().ndims
    if ids_rank is None:
        ids_rank = array_ops.rank(ids)
        static = False
    embs_rank = embs.get_shape().ndims
    if embs_rank is None:
        embs_rank = array_ops.rank(embs)
        static = False
    return clip_ops.clip_by_norm(
        embs, max_norm,
        axes=list(range(ids_rank, embs_rank)) if static
        else math_ops.range(ids_rank, embs_rank))
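# Hedged usage sketch for _gather_and_clip above; the variable and ids are
# illustrative. A ResourceVariable takes the sparse_read path (equivalent to
# tf.gather here), and each gathered row is then clipped along its embedding
# dimension.
import tensorflow as tf

params = tf.Variable([[3.0, 4.0], [0.3, 0.4], [6.0, 8.0]])
ids = tf.constant([0, 2])
rows = params.sparse_read(ids)
clipped = tf.clip_by_norm(rows, 1.0, axes=[1])
print(clipped.numpy())   # every returned row has an l2-norm of at most 1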
def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Arguments:
        loss: Loss tensor.
        params: List of variables.

    Returns:
        List of gradient tensors.

    Raises:
        ValueError: In case any gradient cannot be computed (e.g. if gradient
            function not implemented).
    """
    params = nest.flatten(params)
    with backend.get_graph().as_default(), backend.name_scope(
            self._name + "/gradients"):
        grads = gradients.gradients(loss, params)
        for grad, param in zip(grads, params):
            if grad is None:
                raise ValueError(
                    "Variable {} has `None` for gradient. "
                    "Please make sure that all of your ops have a "
                    "gradient defined (i.e. are differentiable). "
                    "Common ops without gradient: "
                    "K.argmax, K.round, K.eval.".format(param))
        if hasattr(self, "clipnorm"):
            grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
        if hasattr(self, "clipvalue"):
            grads = [
                clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
                for g in grads
            ]
    return grads
def _resource_apply_dense(self, grad, var):
    step, beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

    if self._initial_total_steps > 0:
        total_steps = math_ops.cast(self._total_steps_t, var.dtype.base_dtype)
        warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                          var.dtype.base_dtype)
        min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
        warmup_steps = total_steps * warmup_proportion
        decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
        decay_rate = (min_lr - lr_t) / decay_steps
        lr_t = tf.compat.v1.where(
            step <= warmup_steps,
            lr_t * (step / warmup_steps),
            lr_t + decay_rate * math_ops.minimum(step - warmup_steps,
                                                 decay_steps),
        )

    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    v = self.get_slot(var, "v")

    if self.clip_gradients:
        clipVal = math_ops.sqrt(
            tf.reduce_sum(input_tensor=v) /
            (1.0 - beta2_power)) * self.clip_multiplier_t + self.clip_epsilon_t
        grad = clip_ops.clip_by_norm(grad, clipVal)

    sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
    sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

    m = self.get_slot(var, "m")

    v_t = state_ops.assign(v,
                           beta2_t * v + (1.0 - beta2_t) * math_ops.square(grad),
                           use_locking=self._use_locking)
    v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t
    grad_corr = grad / v_corr_t

    m_t = state_ops.assign(m,
                           beta1_t * m + (1.0 - beta1_t) * grad_corr,
                           use_locking=self._use_locking)
    m_corr_t = m_t / (1.0 - beta1_power)

    r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                        (sma_t - 2.0) / (sma_inf - 2.0) *
                        sma_inf / sma_t)

    var_t = tf.compat.v1.where(sma_t >= 5.0, r_t * m_corr_t, m_corr_t)

    if var in self.reg_vars:
        if self._initial_weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t,
                                   var.dtype.base_dtype) * var
        if self._L1_decay > 0.0:
            var_t += math_ops.cast(self._L1_decay,
                                   var.dtype.base_dtype) * math_ops.sign(var)

    with tf.control_dependencies([var_t]):
        var_update = state_ops.assign_sub(var,
                                          lr_t * var_t,
                                          use_locking=self._use_locking)

    updates = [var_update, m_t, v_t]
    return control_flow_ops.group(*updates)
def testClipByNormGradientZeros(self):
    with self.session(use_gpu=True):
        x = array_ops.zeros([3])
        b = clip_ops.clip_by_norm(x, 1.)
        grad, = gradients_impl.gradients(b, x)
        self.assertAllEqual(grad, [1., 1., 1.])
def compute_gradients(self,
                      loss,
                      var_list=None,
                      gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      grad_loss=None,
                      stop_gradients=None,
                      scale_loss_by_num_towers=None):
    """Compute gradients of `loss` for the variables in `var_list`.

    This is the first part of `minimize()`. It returns a list of
    (gradient, variable) pairs where "gradient" is the gradient for "variable".
    Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if
    there is no gradient for the given variable.

    Args:
        loss: A callable taking no arguments which returns the value to minimize.
        var_list: list or tuple of `Variable` objects to update to minimize
            `loss`, or a callable returning the list or tuple of `Variable`
            objects. Use callable when the variable list would otherwise be
            incomplete before `minimize` and the variables are created at the
            first time when `loss` is called.
        grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
        stop_gradients: Optional. A Tensor or list of tensors not to
            differentiate through.
        scale_loss_by_num_towers: Optional boolean. If true, scale the loss down
            by the number of towers. By default, auto-detects whether this is
            needed.

    Returns:
        A list of (gradient, variable) pairs. Variable is always present, but
        gradient can be `None`.

    Raises:
        TypeError: If `var_list` contains anything else than `Variable` objects.
        ValueError: If some arguments are invalid.
        RuntimeError: If called with eager execution enabled and `loss` is
            not callable.

    @compatibility(eager)
    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
    and `colocate_gradients_with_ops` are ignored.
    @end_compatibility
    """
    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
    with backprop.GradientTape() as tape:
        if not callable(var_list):
            tape.watch(var_list)
        loss_value = loss()
    if callable(var_list):
        var_list = var_list()
    var_list = nest.flatten(var_list)
    grads = tape.gradient(loss_value, var_list, grad_loss)

    if hasattr(self, "clipnorm"):
        grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
        grads = [
            clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
            for g in grads
        ]

    grads_and_vars = list(zip(grads, var_list))
    self._assert_valid_dtypes([
        v for g, v in grads_and_vars
        if g is not None and v.dtype != dtypes.resource
    ])
    return grads_and_vars
def _testClipTensorByNorm(self, inputs, max_norm, expected):
    input_op = constant_op.constant(inputs)
    clipped = clip_ops.clip_by_norm(input_op, max_norm)
    check_op = numerics.add_check_numerics_ops()
    result, _ = self.evaluate([clipped, check_op])
    self.assertAllClose(result, expected)
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    step, beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

    if self._initial_total_steps > 0:
        total_steps = math_ops.cast(self._total_steps_t, var.dtype.base_dtype)
        warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                          var.dtype.base_dtype)
        min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
        warmup_steps = total_steps * warmup_proportion
        decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
        decay_rate = (min_lr - lr_t) / decay_steps
        lr_t = tf.compat.v1.where(
            step <= warmup_steps,
            lr_t * (step / warmup_steps),
            lr_t + decay_rate * math_ops.minimum(step - warmup_steps,
                                                 decay_steps),
        )

    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    v = self.get_slot(var, "v")

    if self.clip_gradients:
        clipVal = math_ops.sqrt(
            tf.reduce_sum(input_tensor=v) /
            (1.0 - beta2_power)) * self.clip_multiplier_t + self.clip_epsilon_t
        grad = clip_ops.clip_by_norm(grad, clipVal)

    sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
    sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
        m_t = scatter_add(m, indices, m_scaled_g_values)
    m_corr_t = m_t / (1.0 - beta1_power)

    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
        v_t = scatter_add(v, indices, v_scaled_g_values)

    if self._amsgrad:
        vhat = self.get_slot(var, 'vhat')
        vhat_t = state_ops.assign(vhat,
                                  math_ops.maximum(vhat, v_t),
                                  use_locking=self._use_locking)
        v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta2_power)) + epsilon_t
    else:
        v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t

    r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                        (sma_t - 2.0) / (sma_inf - 2.0) *
                        sma_inf / sma_t)

    var_t = tf.compat.v1.where(sma_t >= 5.0, r_t * m_corr_t / v_corr_t,
                               m_corr_t)

    if var in self.reg_vars:
        if self._initial_weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t,
                                   var.dtype.base_dtype) * var
        if self._L1_decay > 0.0:
            var_t += math_ops.cast(self._L1_decay,
                                   var.dtype.base_dtype) * math_ops.sign(var)

    var_update = state_ops.assign_sub(var,
                                      lr_t * var_t,
                                      use_locking=self._use_locking)

    updates = [var_update, m_t, v_t]
    if self._amsgrad:
        updates.append(vhat_t)
    return control_flow_ops.group(*updates)
def _clip_gradients_seperate_norm(grads_and_vars, clip_gradients):
    """Clips each gradient by norm separately (not by global norm)."""
    gradients, variables = zip(*grads_and_vars)
    clipped_gradients = [
        clip_ops.clip_by_norm(grad, clip_gradients) for grad in gradients
    ]
    return list(zip(clipped_gradients, variables))
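# Hedged sketch contrasting the per-tensor clipping in
# _clip_gradients_seperate_norm above with clip_by_global_norm: the former
# rescales each gradient against its own norm, the latter rescales all
# gradients by one shared factor. Values are illustrative.
import tensorflow as tf

g1 = tf.constant([3.0, 4.0])   # norm 5
g2 = tf.constant([0.6, 0.8])   # norm 1
per_tensor = [tf.clip_by_norm(g, 1.0) for g in (g1, g2)]
global_clipped, _ = tf.clip_by_global_norm([g1, g2], 1.0)
print([t.numpy() for t in per_tensor])      # each tensor ends up with norm <= 1
print([t.numpy() for t in global_clipped])  # both scaled by 1 / sqrt(26)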
def build_multi_tower_graph(images,
                            sketches,
                            images_d,
                            image_paired_class_ids,
                            image_paired_class_ids_d,
                            text_vocab_indiceses,
                            LSTM_hybrid,
                            vocab_size,
                            batch_size,
                            num_gpu,
                            batch_portion,
                            training,
                            learning_rates,
                            counter,
                            max_iter_step,
                            ld=10,
                            data_format='NCHW',
                            distance_map=True,
                            optimizer='Adam',
                            block_type='MRU'):
    """
    :param images: [batch_size, 3, H, W]
    :param sketches: [batch_size, 3, H, W]
    :param images_d: [batch_size, 3, H, W]
    :param image_paired_class_ids: [batch_size, ], class_number
    :param image_paired_class_ids_d: [batch_size, ]
    :param text_vocab_indiceses: [batch_size, 15]
    :return:
    """
    models.set_param(data_format=data_format)

    with tf.device('/cpu:0'):
        images_list = split_inputs(images, batch_size, batch_portion,
                                   num_gpu)  # [num_gpu, [N, C, H, W]]
        images_d_list = split_inputs(images_d, batch_size, batch_portion, num_gpu)
        sketches_list = split_inputs(sketches, batch_size, batch_portion, num_gpu)
        image_paired_class_ids_list = split_inputs(
            image_paired_class_ids, batch_size, batch_portion, num_gpu)
        image_paired_class_ids_d_list = split_inputs(
            image_paired_class_ids_d, batch_size, batch_portion, num_gpu)
        text_vocab_indiceses_list = split_inputs(
            text_vocab_indiceses, batch_size, batch_portion, num_gpu)

    lr_g = learning_rates['generator']
    lr_d = learning_rates['discriminator']
    optimizer = get_optimizer(optimizer)
    decay = tf.maximum(
        0.2, 1. - (tf.cast(counter, tf.float32) / max_iter_step * 0.9))
    tf.summary.scalar('learning_rate_g', lr_g * decay)
    optim_g = optimizer(learning_rate=lr_g * decay)
    optim_d = optimizer(learning_rate=lr_d * decay)

    tower_grads_g = []
    tower_grads_d = []
    for i in range(num_gpu):
        with tf.name_scope('%s_%d' % ('GPU', i)) as scope:
            loss_g, loss_d, grad_g, grad_d \
                = build_single_graph(images_list[i],
                                     sketches_list[i],
                                     images_d_list[i],
                                     image_paired_class_ids_list[i],
                                     image_paired_class_ids_d_list[i],
                                     text_vocab_indiceses_list[i],
                                     batch_size * batch_portion[i],
                                     training,
                                     LSTM_hybrid=LSTM_hybrid,
                                     vocab_size=vocab_size,
                                     ld=ld,
                                     data_format=data_format,
                                     distance_map=distance_map,
                                     optim_g=optim_g,
                                     optim_d=optim_d,
                                     block_type=block_type)

            tower_grads_g.append(grad_g)
            tower_grads_d.append(grad_d)

    assert len(tower_grads_g) == len(tower_grads_d)
    if len(tower_grads_d) == 1:
        ave_grad_g = grad_g
        ave_grad_d = grad_d
    else:
        ave_grad_g, ave_grad_d = average_gradients(
            (tower_grads_g, tower_grads_d))

    # Apply gradients
    # Hack to force initialization of optimizer variables
    tf.get_variable_scope()._reuse = False

    if Config.sn:
        # Get the update ops
        spectral_norm_update_ops = tf.get_collection(
            Config.SPECTRAL_NORM_UPDATE_OPS)
    else:
        spectral_norm_update_ops = [tf.no_op()]
        assign_ops = tf.no_op()

    # Clip gradients if using WGAN/DRAGAN
    global_grad_norm_G = None
    global_grad_norm_G_clipped = None
    global_grad_norm_D = None
    global_grad_norm_D_clipped = None

    if not Config.sn:
        max_grad_norm_G = 50.
        max_grad_norm_D = 100.
        hard_clip_norm_G = 5.
        hard_clip_norm_D = 10.

        ave_grad_g_tensors, ave_grad_g_vars = list(zip(*ave_grad_g))
        global_grad_norm_G = clip_ops.global_norm(ave_grad_g_tensors)
        ave_grad_g_tensors, _ = clip_ops.clip_by_global_norm(
            ave_grad_g_tensors, max_grad_norm_G, global_grad_norm_G)
        ave_grad_g_tensors = [
            clip_ops.clip_by_norm(t, hard_clip_norm_G)
            for t in ave_grad_g_tensors
        ]
        ave_grad_g = list(zip(ave_grad_g_tensors, ave_grad_g_vars))

        ave_grad_d_tensors, ave_grad_d_vars = list(zip(*ave_grad_d))
        global_grad_norm_D = clip_ops.global_norm(ave_grad_d_tensors)
        ave_grad_d_tensors, _ = clip_ops.clip_by_global_norm(
            ave_grad_d_tensors, max_grad_norm_D, global_grad_norm_D)
        ave_grad_d_tensors = [
            clip_ops.clip_by_norm(t, hard_clip_norm_D)
            for t in ave_grad_d_tensors
        ]
        ave_grad_d = list(zip(ave_grad_d_tensors, ave_grad_d_vars))

    with tf.control_dependencies(spectral_norm_update_ops):
        opt_g = optimize(ave_grad_g, optim_g, None, 'gradient_norm',
                         global_norm=global_grad_norm_G,
                         global_norm_clipped=global_grad_norm_G_clipped,
                         appendix='_G')
        opt_d = optimize(ave_grad_d, optim_d, None, 'gradient_norm',
                         global_norm=global_grad_norm_D,
                         global_norm_clipped=global_grad_norm_D_clipped,
                         appendix='_D')

    summaries = gather_summaries()
    loss_g, loss_d = gather_losses()

    # Generator output from last tower
    return opt_g, opt_d, loss_g, loss_d, summaries
def testClipByNormGradientZeros(self):
    with self.session(use_gpu=True):
        x = array_ops.zeros([3])
        b = clip_ops.clip_by_norm(x, 1.)
        grad, = gradients_impl.gradients(b, x)
        self.assertAllEqual(grad.eval(), [1., 1., 1.])