def CreateVariables(self):
  """Create variables for this layer and child layers.

  DO NOT OVERRIDE. Override self._CreateVariables instead.
  """
  # Idempotent: only the first call does any work. The flag is set before
  # doing the work so recursive/self-referential calls also no-op.
  if self._create_variables_called:
    return
  self._create_variables_called = True
  self._global_step = py_utils.GetGlobalStep()
  if self._is_variable_free:
    # A variable-free layer must not contain any child that creates
    # variables; fail loudly rather than silently creating them.
    for child in self._children_list:
      if not child._is_variable_free:  # pylint: disable=protected-access
        raise ValueError(
            'Variable free layer %s(%s) child %s(%s) has variables.' %
            (self.params.name, self.params.cls, child.params.name,
             child.params.cls))
  else:
    # Expose global_step through theta for layers that own variables.
    self.AddExtraTheta('global_step', self._global_step)
  self._CreateChildrenVariables()
  # auxiliary_name_scope=False: re-enter the layer's variable scope without
  # creating a fresh (suffixed) name scope, keeping names stable.
  with tf.variable_scope(
      py_utils.SanitizeScopeKey(self.params.name),
      auxiliary_name_scope=False):
    # Materialize variables registered earlier (e.g. via CreateVariable
    # before this call), then invoke the subclass hook.
    for name, meta in list(self._variables_to_create.items()):
      self._CreateVariable(name, meta)
    self._CreateVariables()
  # Check vars and theta are consistent after everything is created.
  self._VerifyVarsAndTheta()
def InstantiateVariables(self):
  """Create variables for this layer and child layers.

  DO NOT OVERRIDE. Override self._CreateLayerVariables instead.
  """
  # Idempotent: any status other than NOT_CALLED means work already
  # started/finished (also guards against re-entrant calls).
  if self._create_variables_status != _CreateLayerVariablesStatus.NOT_CALLED:
    return
  self._create_variables_status = _CreateLayerVariablesStatus.IN_PROGRESS

  # Track nesting depth so we can detect the outermost call below; the
  # try/finally guarantees the stack is rebalanced even if creation fails.
  stack_size = len(_CREATE_VARIABLES_STACK.stack)
  _CREATE_VARIABLES_STACK.stack.append(self)
  try:
    self._global_step = py_utils.GetGlobalStep()
    self._CreateChildrenVariables()

    if not self._is_variable_free:
      # Expose global_step through theta for layers that own variables.
      self.AddExtraTheta('global_step', self._global_step)
      # auxiliary_name_scope=False: re-enter the layer's variable scope
      # without creating a fresh (suffixed) name scope.
      with tf.variable_scope(
          py_utils.SanitizeScopeKey(self.params.name),
          auxiliary_name_scope=False):
        # Materialize variables registered earlier, then invoke the
        # subclass hook.
        for name, meta in list(self._variables_to_create.items()):
          self._CreateVariableInternal(name, meta)
        self._CreateLayerVariables()
  finally:
    assert _CREATE_VARIABLES_STACK.stack[-1] is self
    _CREATE_VARIABLES_STACK.stack.pop()
    assert len(_CREATE_VARIABLES_STACK.stack) == stack_size
  self._create_variables_status = _CreateLayerVariablesStatus.COMPLETED

  if not _CREATE_VARIABLES_STACK.stack:
    # Outermost layer just finished InstantiateVariables: the whole tree is
    # done, so verify vars/theta consistency once here.
    self._VerifyVarsAndTheta()
def _SelfVariableScope(self):
  """Internal. Used to ensure the same variable & name scopes are used."""
  # Lazily create and memoize this layer's scope on first entry; every
  # later call re-enters the exact same scope object so variable and name
  # scoping stays consistent across calls.
  if not self._self_variable_scope:
    with tf.variable_scope(
        py_utils.SanitizeScopeKey(self.params.name)) as new_scope:
      self._self_variable_scope = new_scope
  with contextlib.ExitStack() as ctx:
    ctx.enter_context(
        tf.variable_scope(
            self._self_variable_scope, auxiliary_name_scope=False))
    ctx.enter_context(
        tf.name_scope(self._self_variable_scope.original_name_scope))
    yield ctx
def CollectVarHistogram(vs_gs):
  """Adds histogram summaries for variables and gradients."""

  def _Densify(v, g):
    # Gather only the rows a sparse (IndexedSlices) gradient touches so the
    # variable and gradient tensors line up element-wise.
    if isinstance(g, tf.IndexedSlices):
      v, g = tf.gather(v, g.indices), g.values
    # Histograms need real values; summarize magnitudes for complex dtypes.
    if v.dtype.is_complex:
      v, g = tf.abs(v), tf.abs(g)
    return v, g

  for raw_name, (var, grad) in vs_gs.FlattenItems():
    key = py_utils.SanitizeScopeKey(raw_name)
    with tf.device(var.device), tf.name_scope(key + '/summary'):
      var, grad = _Densify(var, grad)
      histogram('var_hist/' + key, var)
      histogram('grad_hist/' + key, grad)
def _SelfVariableScope(self, params=None, enter_name_scope=True):
  """Internal. Used to ensure the same variable & name scopes are used."""
  # First use: remember the parent scope and create this layer's own scope
  # keyed by its (sanitized) name. Subsequent calls re-enter the memoized
  # scope so variable/name scoping stays stable.
  if not hasattr(self, '_self_variable_scope'):
    scope_params = params or self.params
    self._parent_variable_scope = tf.get_variable_scope()
    with tf.variable_scope(
        py_utils.SanitizeScopeKey(scope_params.name)) as new_scope:
      self._self_variable_scope = new_scope
  with contextlib.ExitStack() as ctx:
    ctx.enter_context(
        tf.variable_scope(
            self._self_variable_scope, auxiliary_name_scope=False))
    if enter_name_scope:
      ctx.enter_context(
          tf.name_scope(self._self_variable_scope.original_name_scope))
    yield ctx
def _CreateChildrenVariables(self):
  """Create variables for child layers.

  Should be rarely overridden, only in cases when control over the context
  of children CreateVariables calls are needed. eg, if children variables
  need to be created inside of a specific context manager.

  There are a few cases of this in the codebase marked as for backwards
  compability. This is only to ensure that variable scopes remain
  compatible through the code migration. New layers should not copy that
  pattern, and instead follow the standard pattern of self.CreateChild()
  in __init__() and self.CreateVariable() in _CreateVariables().

  If you are okay with breaking old checkpoints, you can go ahead and
  delete those functions.
  """
  # NOTE(review): the loop body is intentionally empty — entering this
  # variable scope is kept (presumably for scope-name compatibility with
  # subclass overrides that do create children here; confirm before
  # simplifying), but children create their own variables.
  with tf.variable_scope(
      py_utils.SanitizeScopeKey(self.params.name),
      auxiliary_name_scope=False):
    for _ in self._children_list:
      # For now each layer is responsible for calling its CreateVariables.
      pass
def _CreateChildrenVariables(self):
  """Create variables for child layers.

  Should be rarely overridden, only in cases when control over the context
  of children InstantiateVariables calls are needed. eg, if children
  variables need to be created inside of a specific context manager.

  There are a few cases of this in the codebase marked as for backwards
  compability. This is only to ensure that variable scopes remain
  compatible through the code migration. New layers should not copy that
  pattern, and instead follow the standard pattern of self.CreateChild()
  in __init__() and self.CreateVariable() in _CreateLayerVariables().

  If you are okay with breaking old checkpoints, you can go ahead and
  delete those functions.
  """
  with tf.variable_scope(
      py_utils.SanitizeScopeKey(self.params.name),
      auxiliary_name_scope=False):
    for child in self._children_list:
      # A variable-free layer must not contain a child that creates
      # variables; fail loudly instead of silently instantiating them.
      child_has_vars = not child._is_variable_free  # pylint: disable=protected-access
      if self._is_variable_free and child_has_vars:
        raise ValueError(
            'Variable free layer %s(%s) child %s(%s) has variables.' %
            (self.params.name, self.params.cls, child.params.name,
             child.params.cls))
      child.InstantiateVariables()
def ScaleGradients(self, var_grads, gradient_adjuster=None):
  """Scales gradients according to training params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.
    gradient_adjuster: if not None, a function that mutates a given
      var_grads.

  Returns:
    A `.NestedMap` containing

    - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
      where gradients have already been scaled.
    - grad_scale: the gradient scale. 0 if gradient updates should be
      skipped for the step. (Optional, only returned in case global norm
      clipping is used.)
  """
  p = self.params

  # Computes gradients' norm and adds their summaries. Note that
  # all_grad_norm may be nan, which may cause grad_scale to be nan.
  for name, vg in var_grads.FlattenItems():
    summary_utils.AddNormSummary(
        py_utils.SanitizeScopeKey(name) + '/' + p.name, vg)
  flatten = py_utils.Flatten(var_grads)
  # Global (all-variable) gradient and variable norms.
  all_grad_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
  all_var_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
  grad_norm_is_nan_or_inf = tf.logical_or(
      tf.is_nan(all_grad_norm), tf.is_inf(all_grad_norm))

  # Optional gradient adjustment. Note that this happens after computing
  # all_grad_norm.
  if gradient_adjuster is not None:
    tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
    var_grads = gradient_adjuster(var_grads)

  # Handles NaN/Inf gradients.
  has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
  # Grad norm can still be inf even if none of the individual grad is inf.
  has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)
  self._AddEvalMetric('has_nan_or_inf', has_nan_or_inf, tf.constant(1.0))

  return_values = py_utils.NestedMap()
  if p.clip_gradient_single_norm_to_value:
    # Currently using both types of clipping simultaneously is unsupported.
    if p.clip_gradient_norm_to_value:
      raise ValueError(
          'Cannot use clip_gradient_single_norm_to_value=%f and '
          'clip_gradient_norm_to_value=%f.' %
          (p.clip_gradient_single_norm_to_value,
           p.clip_gradient_norm_to_value))
    # Per-variable norm clipping; no global grad_scale is produced in this
    # branch, so return_values carries only final_var_grads.
    final_var_grads = py_utils.ApplyGradNormClipping(
        var_grads, p.clip_gradient_single_norm_to_value)
  else:
    # Global-norm scaling: grad_scale is 0 when the step must be skipped
    # (e.g. NaN/Inf gradients).
    grad_scale = self._GetGlobalGradScale(all_grad_norm, has_nan_or_inf)
    self._AddEvalMetric('grad_norm/all', all_grad_norm, tf.constant(1.0))
    self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
    self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
    final_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
    return_values.grad_scale = grad_scale

  return_values.final_var_grads = final_var_grads
  return return_values