def _assign_func(self, *args, **kwargs): f = kwargs.pop("f") if distribution_strategy_context.get_cross_tower_context(): update_device = distribute_lib.get_update_device() if update_device is not None: # We are calling an assign function in an update context. return f(self._v, *args, **kwargs) # We are calling an assign function in cross tower context, wrap it in an # update call. return distribution_strategy_context.get_distribution_strategy().update( self, f, *args, **kwargs) else: assert distribution_strategy_context.get_tower_context() # We are calling an assign function in tower context. # We reduce the value we want to assign/add/sub. More details about how we # handle the different use cases can be found in the _reduce method. # We call the function with the reduced value. if self._aggregation == vs.VariableAggregation.NONE: raise ValueError("You must specify an aggregation method to update a " "a variable in Tower Context.") def merge_fn(strategy, value, *other_args, **other_kwargs): return strategy.update( self, f, strategy.reduce( aggregation=self._aggregation, value=value, destinations=self), *other_args, **other_kwargs) return distribution_strategy_context.get_tower_context().merge_call( merge_fn, *args, **kwargs)
def _assign_func(self, *args, **kwargs): f = kwargs.pop("f") if distribution_strategy_context.get_cross_tower_context(): update_device = distribute_lib.get_update_device() if update_device is not None: # We are calling an assign function in an update context. return f(self._v, *args, **kwargs) # We are calling an assign function in cross tower context, wrap it in an # update call. return distribution_strategy_context.get_distribution_strategy( ).update(self, f, *args, **kwargs) else: assert distribution_strategy_context.get_tower_context() # We are calling an assign function in tower context. # We reduce the value we want to assign/add/sub. More details about how we # handle the different use cases can be found in the _reduce method. # We call the function with the reduced value. if self._aggregation == vs.VariableAggregation.NONE: raise ValueError( "You must specify an aggregation method to update a " "a variable in Tower Context.") def merge_fn(strategy, value, *other_args, **other_kwargs): return strategy.update( self, f, strategy.reduce(aggregation=self._aggregation, value=value, destinations=self), *other_args, **other_kwargs) return distribution_strategy_context.get_tower_context( ).merge_call(merge_fn, *args, **kwargs)
def _assert_in_default_state(t): t.assertIs(distribution_strategy_context._get_default_tower_context(), distribution_strategy_context.get_tower_context()) t.assertIs(None, distribution_strategy_context.get_cross_tower_context()) t.assertIs(distribution_strategy_context._get_default_distribution_strategy(), distribution_strategy_context.get_distribution_strategy()) t.assertFalse(distribution_strategy_context.has_distribution_strategy())
def vq_discrete_bottleneck(x, hparams): """Simple vector quantized discrete bottleneck.""" bottleneck_size = 2**hparams.bottleneck_bits x_shape = commons.shape_list(x) x = tf.reshape(x, [-1, hparams.hidden_size]) x_means_hot, e_loss = vq_nearest_neighbor(x, hparams) if hparams.bottleneck_kind == "mog": loss = hparams.beta * e_loss else: tf.logging.info("Using EMA with beta = {}".format(hparams.beta)) means, ema_means, ema_count = (hparams.means, hparams.ema_means, hparams.ema_count) # Update the ema variables updated_ema_count = commons.assign_moving_average(ema_count, tf.reduce_sum( x_means_hot, axis=0), hparams.decay, zero_debias=False) dw = tf.matmul(x_means_hot, x, transpose_a=True) updated_ema_means = commons.assign_moving_average(ema_means, dw, hparams.decay, zero_debias=False) n = tf.reduce_sum(updated_ema_count, axis=-1, keepdims=True) updated_ema_count = ((updated_ema_count + hparams.epsilon) / (n + bottleneck_size * hparams.epsilon) * n) # pylint: disable=g-no-augmented-assignment updated_ema_means = updated_ema_means / tf.expand_dims( updated_ema_count, axis=-1) # pylint: enable=g-no-augmented-assignment with tf.control_dependencies([e_loss]): # distribution_strategy def update_fn(v, value): return tf.assign(v, value) tower_context = distribution_strategy_context.get_tower_context() if tower_context: def merge_fn(strategy, v, value): value = strategy.reduce(tf.VariableAggregation.MEAN, value, v) return strategy.update(v, update_fn, value) update_means = tower_context.merge_call( merge_fn, means, updated_ema_means) else: strategy = distribution_strategy_context.get_cross_tower_context( ) update_means = strategy.update(means, update_fn, updated_ema_means) with tf.control_dependencies([update_means]): loss = hparams.beta * e_loss discrete = tf.reshape(x_means_hot, x_shape[:-1] + [bottleneck_size]) return discrete, loss
def op(self): # We want cross-tower code that does some var.op.X calls # to work (even if the current device isn't in self.devices), but # other uses of var.op in a cross-tower context to fail. if distribution_strategy_context.get_cross_tower_context(): return DistributedVarOp(self._primary_var.op.name, self._primary_var.op.graph, self._primary_var.op.type) return self.get().op
def op(self): # We want cross-tower code that does some var.op.X calls # to work (even if the current device isn't in self.devices), but # other uses of var.op in a cross-tower context to fail. if distribution_strategy_context.get_cross_tower_context(): return DistributedVarOp(self._primary_var.op.name, self._primary_var.op.graph, self._primary_var.op.type) return self.get().op
def set_non_tensor_output(self, name, output): """Set `output` with `name` to be captured as a non tensor output.""" if distribution_strategy_context.get_cross_tower_context(): self._non_tensor_outputs[name] = output else: def merge_fn(distribution, value): # NOTE(priyag): For non tensor outputs, we simply return all the values # in a list as aggregation doesn't make sense on non tensors. self._non_tensor_outputs[name] = distribution.unwrap(value) distribution_strategy_context.get_tower_context().merge_call( merge_fn, output)
def set_non_tensor_output(self, name, output): """Set `output` with `name` to be captured as a non tensor output.""" if distribution_strategy_context.get_cross_tower_context(): self._non_tensor_outputs[name] = output else: def merge_fn(distribution, value): # NOTE(priyag): For non tensor outputs, we simply return all the values # in a list as aggregation doesn't make sense on non tensors. self._non_tensor_outputs[name] = distribution.unwrap(value) distribution_strategy_context.get_tower_context().merge_call( merge_fn, output)
def merge_fn(dist, s): self.assertIs( distribution_strategy_context._get_default_distribution_strategy(), dist) self.assertIs(None, distribution_strategy_context.get_tower_context()) self.assertIs(dist, distribution_strategy_context.get_cross_tower_context()) self.assertIs(dist, distribution_strategy_context.get_distribution_strategy()) self.assertFalse( distribution_strategy_context.has_distribution_strategy()) return "foo_" + s
def assign(self, *args, **kwargs): if distribution_strategy_context.get_cross_tower_context(): # To preserve the sum across save and restore, we have to divide the # total across all devices when restoring a variable that was summed # when saving. tensor = args[0] if self._aggregation == vs.VariableAggregation.SUM: tensor *= 1. / len(self.devices) return control_flow_ops.group( [_assign_on_device(d, v, tensor) for d, v in six.iteritems(self._index)]) else: _assert_tower_context() return self.get().assign(*args, **kwargs)
def run_fn(): tower_context = distribution_strategy_context.get_tower_context() self.assertTrue(tower_context is not None) self.assertIs(None, distribution_strategy_context.get_cross_tower_context()) self.assertTrue(distribution_strategy_context.has_distribution_strategy()) self.assertIs(dist, distribution_strategy_context.get_distribution_strategy()) self.assertEqual("foo", tower_context.merge_call(None, test_arg="foo")) expected_value = _get_test_variable( "bar", variable_scope.VariableSynchronization.AUTO, variable_scope.VariableAggregation.NONE) self.assertDictEqual(expected_value, variable_scope.variable(1.0, name="bar"))
def _assign_func(self, *args, **kwargs): f = kwargs.pop("f") if distribution_strategy_context.get_cross_tower_context(): update_device = distribute_lib.get_update_device() if update_device is not None: # We are calling an assign function on the mirrored variable in an # update context. v = self.get(device=update_device) return f(v, *args, **kwargs) # We are calling assign on the mirrored variable in cross tower context, # use update to update the variable. strategy = distribution_strategy_context.get_distribution_strategy( ) updates = strategy.update(self, f, *args, **kwargs) grouped = strategy.group(updates) if isinstance(updates, DistributedValues) and updates.is_tensor_like: # Make sure we run all updates. Without this, something like # session.run(mirrored_var.assign*(...)) may only update one tower. index = {} for d in updates.devices: with ops.device(d), ops.control_dependencies([grouped]): index[d] = array_ops.identity(updates.get(d)) return Mirrored(index) else: return grouped else: _assert_tower_context() # We are calling an assign function on the mirrored variable in tower # context. # We reduce the value we want to assign/add/sub. More details about how we # handle the different use cases can be found in the _reduce method. # We call the function on each of the mirrored variables with the reduced # value. if self._aggregation == vs.VariableAggregation.NONE: raise ValueError( "You must specify an aggregation method to update a " "MirroredVariable in Tower Context.") def merge_fn(strategy, value, *other_args, **other_kwargs): return strategy.update( self, f, strategy.reduce(aggregation=self._aggregation, value=value, destinations=self), *other_args, **other_kwargs) return distribution_strategy_context.get_tower_context( ).merge_call(merge_fn, *args, **kwargs)
def assign(self, *args, **kwargs): if distribution_strategy_context.get_cross_tower_context(): # To preserve the sum across save and restore, we have to divide the # total across all devices when restoring a variable that was summed # when saving. tensor = args[0] if self._aggregation == vs.VariableAggregation.SUM: tensor *= 1. / len(self.devices) return control_flow_ops.group( [_assign_on_device(d, v, tensor) for d, v in six.iteritems(self._index)]) else: _assert_tower_context() return self.get().assign(*args, **kwargs)
def testScope(self): _assert_in_default_state(self) dist = _TestStrategy() with dist.scope(): self.assertIs(None, distribution_strategy_context.get_tower_context()) self.assertIs(dist, distribution_strategy_context.get_cross_tower_context()) self.assertTrue(distribution_strategy_context.has_distribution_strategy()) self.assertIs(dist, distribution_strategy_context.get_distribution_strategy()) expected_value = _get_test_variable( "baz", variable_scope.VariableSynchronization.AUTO, variable_scope.VariableAggregation.NONE) self.assertDictEqual(expected_value, variable_scope.variable(1.0, name="baz")) _assert_in_default_state(self)
def set_last_step_output( self, name, output, aggregation=variables_lib.VariableAggregation.NONE): """Set `output` with `name` to be outputted from the last step. Args: name: String, name to identify the output. Doesn't need to match tensor name. output: The tensors that should be outputted with `name`. See below for actual types supported. aggregation: Aggregation method to use to aggregate outputs from multiple towers. Required if `set_last_step_output` is called in a tower context. Optional in cross_tower_context. When present, the outputs from all the towers are aggregated using the current distribution strategy's `reduce` method. Hence, the type of `output` must be what's supported by the corresponding `reduce` method. For e.g. if using MirroredStrategy and aggregation is set, output must be a `PerDevice` value. The aggregation method is also recorded in a dictionary `_last_step_outputs_aggregations` for later interpreting of the outputs as already reduced or not. """ if distribution_strategy_context.get_cross_tower_context(): self._last_step_outputs_aggregations[name] = aggregation if aggregation is variables_lib.VariableAggregation.NONE: self._last_step_outputs[name] = output else: distribution = distribution_strategy_context.get_distribution_strategy( ) self._last_step_outputs[name] = distribution.reduce( aggregation, output, destinations="/device:CPU:0") else: assert aggregation is not variables_lib.VariableAggregation.NONE def merge_fn(distribution, value): self._last_step_outputs[name] = distribution.reduce( aggregation, value, destinations="/device:CPU:0") # Setting this inside the `merge_fn` because all towers share the same # context object, so it's more robust to set it only once (even if all # the towers are trying to set the same value). self._last_step_outputs_aggregations[name] = aggregation distribution_strategy_context.get_tower_context().merge_call( merge_fn, output)
def _assign_func(self, *args, **kwargs): f = kwargs.pop("f") if distribution_strategy_context.get_cross_tower_context(): update_device = distribute_lib.get_update_device() if update_device is not None: # We are calling an assign function on the mirrored variable in an # update context. v = self.get(device=update_device) return f(v, *args, **kwargs) # We are calling assign on the mirrored variable in cross tower context, # use update to update the variable. strategy = distribution_strategy_context.get_distribution_strategy() updates = strategy.update(self, f, *args, **kwargs) grouped = strategy.group(updates) if isinstance(updates, DistributedValues) and updates.is_tensor_like: # Make sure we run all updates. Without this, something like # session.run(mirrored_var.assign*(...)) may only update one tower. index = {} for d in updates.devices: with ops.device(d), ops.control_dependencies([grouped]): index[d] = array_ops.identity(updates.get(d)) return Mirrored(index) else: return grouped else: _assert_tower_context() # We are calling an assign function on the mirrored variable in tower # context. # We reduce the value we want to assign/add/sub. More details about how we # handle the different use cases can be found in the _reduce method. # We call the function on each of the mirrored variables with the reduced # value. if self._aggregation == vs.VariableAggregation.NONE: raise ValueError("You must specify an aggregation method to update a " "MirroredVariable in Tower Context.") def merge_fn(strategy, value, *other_args, **other_kwargs): return strategy.update( self, f, strategy.reduce( aggregation=self._aggregation, value=value, destinations=self), *other_args, **other_kwargs) return distribution_strategy_context.get_tower_context().merge_call( merge_fn, *args, **kwargs)
def set_last_step_output(self, name, output, aggregation=variables_lib.VariableAggregation.NONE): """Set `output` with `name` to be outputted from the last step. Args: name: String, name to identify the output. Doesn't need to match tensor name. output: The tensors that should be outputted with `name`. See below for actual types supported. aggregation: Aggregation method to use to aggregate outputs from multiple towers. Required if `set_last_step_output` is called in a tower context. Optional in cross_tower_context. When present, the outputs from all the towers are aggregated using the current distribution strategy's `reduce` method. Hence, the type of `output` must be what's supported by the corresponding `reduce` method. For e.g. if using MirroredStrategy and aggregation is set, output must be a `PerDevice` value. The aggregation method is also recorded in a dictionary `_last_step_outputs_aggregations` for later interpreting of the outputs as already reduced or not. """ if distribution_strategy_context.get_cross_tower_context(): self._last_step_outputs_aggregations[name] = aggregation if aggregation is variables_lib.VariableAggregation.NONE: self._last_step_outputs[name] = output else: distribution = distribution_strategy_context.get_distribution_strategy() self._last_step_outputs[name] = distribution.reduce( aggregation, output, destinations="/device:CPU:0") else: assert aggregation is not variables_lib.VariableAggregation.NONE def merge_fn(distribution, value): self._last_step_outputs[name] = distribution.reduce( aggregation, value, destinations="/device:CPU:0") # Setting this inside the `merge_fn` because all towers share the same # context object, so it's more robust to set it only once (even if all # the towers are trying to set the same value). self._last_step_outputs_aggregations[name] = aggregation distribution_strategy_context.get_tower_context().merge_call( merge_fn, output)
def assign_moving_average(variable, value, decay, zero_debias=True, name=None): """Compute the moving average of a variable. The moving average of 'variable' updated with 'value' is: variable * decay + value * (1 - decay) The returned Operation sets 'variable' to the newly computed moving average, by performing this subtraction: variable -= (1 - decay) * (variable - value) Since variables that are initialized to a `0` value will be `0` biased, `zero_debias` optionally enables scaling by the mathematically correct debiasing factor of 1 - decay ** num_updates See `ADAM: A Method for Stochastic Optimization` Section 3 for more details (https://arxiv.org/abs/1412.6980). The names of the debias shadow variables, by default, include both the scope they were created in and the scope of the variables they debias. They are also given a uniquifying-suffix. E.g.: ``` with tf.variable_scope('scope1'): with tf.variable_scope('scope2'): var = tf.get_variable('foo') update_1 = tf.assign_moving_average(var, 0.0, 1.0) update_2 = tf.assign_moving_average(var, 0.0, 0.9) # var.name: 'scope1/scope2/foo' # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased' # 'scope1/scope2/scope1/scope2/foo/biased_1' ``` Args: variable: A Variable. value: A tensor with the same shape as 'variable'. decay: A float Tensor or float value. The moving average decay. zero_debias: A python bool. If true, assume the variable is 0-initialized and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in `_zero_debias` for more details. name: Optional name of the returned operation. Returns: A tensor which if evaluated will compute and return the new moving average. """ def update_fn(v, value, decay=decay): decay = ops.convert_to_tensor(1.0 - decay, name="decay") if decay.dtype != v.dtype.base_dtype: decay = math_ops.cast(decay, v.dtype.base_dtype) if zero_debias: update_delta = _zero_debias(v, value, decay) else: update_delta = (v - value) * decay return state_ops.assign_sub(v, update_delta, name=scope) with ops.name_scope(name, "AssignMovingAvg", [variable, value, decay]) as scope: tower_context = distribution_strategy_context.get_tower_context() if tower_context: # In a tower context, we update variable using the mean of value across # towers. def merge_fn(strategy, v, value): value = strategy.reduce( variable_scope.VariableAggregation.MEAN, value, v) return strategy.update(v, update_fn, value) return tower_context.merge_call(merge_fn, variable, value) else: strategy = distribution_strategy_context.get_cross_tower_context() return strategy.update(variable, update_fn, value)
def assign_moving_average(variable, value, decay, zero_debias=True, name=None): """Compute the moving average of a variable. https://github.com/tensorflow/tensorflow/blob/c966b5eed60a570f2121cb84ddb4ece84c413719/tensorflow/python/training/moving_averages.py """ def _zero_debias(unbiased_var, value, decay): """Compute the delta required for a debiased Variable. """ with tf.variable_scope(unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope: with tf.init_scope(): biased_initializer = tf.zeros_initializer( dtype=unbiased_var.dtype)(unbiased_var.get_shape()) local_step_initializer = tf.zeros_initializer() def _maybe_get_unique(name): """Get name for a unique variable, if not `reuse=True`.""" if tf.get_variable_scope().reuse: return name vs_vars = [ x.op.name for x in tf.get_variable_scope().global_variables() ] full_name = tf.get_variable_scope().name + "/" + name if full_name not in vs_vars: return name idx = 1 while full_name + ("_%d" % idx) in vs_vars: idx += 1 return name + ("_%d" % idx) biased_var = tf.get_variable(_maybe_get_unique("biased"), initializer=biased_initializer, trainable=False) local_step = tf.get_variable(_maybe_get_unique("local_step"), shape=[], dtype=unbiased_var.dtype, initializer=local_step_initializer, trainable=False) # Get an update ops for both shadow variables. update_biased = tf.assign_sub(biased_var, (biased_var - value) * decay, name=scope.name) update_local_step = local_step.assign_add(1) # Compute the value of the delta to update the unbiased EMA. Make sure to # use the new values of the biased variable and the local step. with tf.control_dependencies([update_biased, update_local_step]): # This function gets `1 - decay`, so use `1.0 - decay` in the exponent. unbiased_ema_delta = ( unbiased_var - biased_var.read_value() / (1 - tf.pow(1.0 - decay, local_step.read_value()))) return unbiased_ema_delta def update_fn(v, value, decay=decay): decay = tf.convert_to_tensor(1.0 - decay, name="decay") if decay.dtype != v.dtype.base_dtype: decay = tf.cast(decay, v.dtype.base_dtype) if zero_debias: update_delta = _zero_debias(v, value, decay) else: update_delta = (v - value) * decay return tf.assign_sub(v, update_delta, name=scope) with tf.name_scope(name, "AssignMovingAvg", [variable, value, decay]) as scope: tower_context = distribution_strategy_context.get_tower_context() if tower_context: # In a tower context, we update variable using the mean of value across # towers. def merge_fn(strategy, v, value): try: value = strategy.reduce(tf.VariableAggregation.MEAN, value, v) except: pass # Mirrored variables are loaded return strategy.update(v, update_fn, value) return tower_context.merge_call(merge_fn, variable, value) else: strategy = distribution_strategy_context.get_cross_tower_context() return strategy.update(variable, update_fn, value)
def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This is the second part of `minimize()`. It returns an `Operation` that applies gradients. Args: grads_and_vars: List of (gradient, variable) pairs as returned by `compute_gradients()`. global_step: Optional `Variable` to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the `Optimizer` constructor. Returns: An `Operation` that applies the specified gradients. If `global_step` was not None, that operation also increments `global_step`. Raises: TypeError: If `grads_and_vars` is malformed. ValueError: If none of the variables have gradients. RuntimeError: If you should use `_distributed_apply()` instead. """ # This is a default implementation of apply_gradients() that can be shared # by most optimizers. It relies on the subclass implementing the following # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse(). # Handle DistributionStrategy case. if distribution_strategy_context.get_cross_tower_context(): raise RuntimeError("Use `_distributed_apply()` instead of " "`apply_gradients()` in a cross-tower context.") # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by # always calling _distributed_apply(), using the default distribution # as needed. if distribution_strategy_context.has_distribution_strategy(): grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)() return distribution_strategy_context.get_tower_context().merge_call( self._distributed_apply, grads_and_vars, global_step, name) # No DistributionStrategy case. grads_and_vars = tuple(grads_and_vars) # Make sure repeat iteration works. if not grads_and_vars: raise ValueError("No variables provided.") converted_grads_and_vars = [] for g, v in grads_and_vars: if g is not None: try: # Convert the grad to Tensor or IndexedSlices if necessary. g = ops.convert_to_tensor_or_indexed_slices(g) except TypeError: raise TypeError( "Gradient must be convertible to a Tensor" " or IndexedSlices, or None: %s" % g) if not isinstance(g, (ops.Tensor, ops.IndexedSlices)): raise TypeError( "Gradient must be a Tensor, IndexedSlices, or None: %s" % g) p = _get_processor(v) converted_grads_and_vars.append((g, v, p)) converted_grads_and_vars = tuple(converted_grads_and_vars) var_list = [v for g, v, _ in converted_grads_and_vars if g is not None] if not var_list: raise ValueError("No gradients provided for any variable: %s." % ([str(v) for _, _, v in converted_grads_and_vars],)) with ops.init_scope(): self._create_slots(var_list) update_ops = [] with ops.name_scope(name, self._name) as name: self._prepare() for grad, var, processor in converted_grads_and_vars: if grad is None: continue # We colocate all ops created in _apply_dense or _apply_sparse # on the same device as the variable. # TODO(apassos): figure out how to get the variable name here. if context.executing_eagerly() or isinstance( var, resource_variable_ops.ResourceVariable) and not var._in_graph_mode: # pylint: disable=protected-access scope_name = "" else: scope_name = var.op.name with ops.name_scope("update_" + scope_name), ops.colocate_with(var): update_ops.append(processor.update_op(self, grad)) if global_step is None: apply_updates = self._finish(update_ops, name) else: with ops.control_dependencies([self._finish(update_ops, "update")]): with ops.colocate_with(global_step): if isinstance(global_step, resource_variable_ops.ResourceVariable): # TODO(apassos): the implicit read in assign_add is slow; consider # making it less so. apply_updates = resource_variable_ops.assign_add_variable_op( global_step.handle, ops.convert_to_tensor(1, dtype=global_step.dtype), name=name) else: apply_updates = state_ops.assign_add(global_step, 1, name=name) if not context.executing_eagerly(): if isinstance(apply_updates, ops.Tensor): apply_updates = apply_updates.op train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP) if apply_updates not in train_op: train_op.append(apply_updates) return apply_updates
def apply_gradients(self, loss, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This is the second part of `minimize()`. It returns an `Operation` that applies gradients. Args: grads_and_vars: List of (gradient, variable) pairs as returned by `compute_gradients()`. global_step: Optional `Variable` to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the `Optimizer` constructor. Returns: An `Operation` that applies the specified gradients. If `global_step` was not None, that operation also increments `global_step`. Raises: TypeError: If `grads_and_vars` is malformed. ValueError: If none of the variables have gradients. RuntimeError: If you should use `_distributed_apply()` instead. """ # This is a default implementation of apply_gradients() that can be shared # by most optimizers. It relies on the subclass implementing the following # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse(). # Handle DistributionStrategy case. if distribution_strategy_context.get_cross_tower_context(): raise RuntimeError("Use `_distributed_apply()` instead of `apply_gradients()` in a cross-tower context.") # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by # always calling _distributed_apply(), using the default distribution # as needed. if distribution_strategy_context.has_distribution_strategy(): grads_and_vars = optimizer.get_filtered_grad_fn(lambda: grads_and_vars)() return distribution_strategy_context.get_tower_context().merge_call( self._distributed_apply, grads_and_vars, global_step, name ) # No DistributionStrategy case. grads_and_vars = tuple(grads_and_vars) # Make sure repeat iteration works. if not grads_and_vars: raise ValueError("No variables provided.") converted_grads_and_vars = [] for grad, var in grads_and_vars: if grad is not None: try: # Convert the grad to Tensor or IndexedSlices if necessary. grad = ops.convert_to_tensor_or_indexed_slices(grad) except TypeError: raise TypeError("Gradient must be convertible to a Tensor or IndexedSlices, or None: %s" % grad) if not isinstance(grad, (ops.Tensor, ops.IndexedSlices)): raise TypeError("Gradient must be a Tensor, IndexedSlices, or None: %s" % grad) processor = _get_processor(var) converted_grads_and_vars.append((grad, var, processor)) converted_grads_and_vars = tuple(converted_grads_and_vars) var_list = [var for grad, var, _ in converted_grads_and_vars if grad is not None] if not var_list: raise ValueError("No gradients provided for any variable: %s." % ([str(var) for _, var, _ in converted_grads_and_vars],)) with ops.init_scope(): self._create_slots(var_list) update_ops = [] with ops.name_scope(name, self._name) as name: self._prepare() for grad, var, processor in converted_grads_and_vars: if grad is None: continue # We colocate all ops created in _apply_dense or _apply_sparse # on the same device as the variable. # TODO(apassos): figure out how to get the variable name here. if context.executing_eagerly() or isinstance(var, resource_variable_ops.ResourceVariable) and not var._in_graph_mode: scope_name = "" else: scope_name = var.op.name with ops.name_scope("update_" + scope_name), ops.colocate_with(var): update_ops.append(processor.update_op(self, loss, grad, global_step)) if global_step is None: apply_updates = self._finish(update_ops, loss, name) else: with ops.control_dependencies([self._finish(update_ops, loss, "update")]): with ops.colocate_with(global_step): if isinstance(global_step, resource_variable_ops.ResourceVariable): # TODO(apassos): the implicit read in assign_add is slow; consider # making it less so. apply_updates = resource_variable_ops.assign_add_variable_op( resource=global_step.handle, value=ops.convert_to_tensor( value=1, dtype=global_step.dtype ), name=name ) else: apply_updates = state_ops.assign_add( ref=global_step, value=1, name=name ) if not context.executing_eagerly(): if isinstance(apply_updates, ops.Tensor): apply_updates = apply_updates.op train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP) if apply_updates not in train_op: train_op.append(apply_updates) return apply_updates
def assign_moving_average(variable, value, decay, zero_debias=True, name=None): """Compute the moving average of a variable. The moving average of 'variable' updated with 'value' is: variable * decay + value * (1 - decay) The returned Operation sets 'variable' to the newly computed moving average, by performing this subtraction: variable -= (1 - decay) * (variable - value) Since variables that are initialized to a `0` value will be `0` biased, `zero_debias` optionally enables scaling by the mathematically correct debiasing factor of 1 - decay ** num_updates See `ADAM: A Method for Stochastic Optimization` Section 3 for more details (https://arxiv.org/abs/1412.6980). The names of the debias shadow variables, by default, include both the scope they were created in and the scope of the variables they debias. They are also given a uniquifying-suffix. E.g.: ``` with tf.variable_scope('scope1'): with tf.variable_scope('scope2'): var = tf.get_variable('foo') update_1 = tf.assign_moving_average(var, 0.0, 1.0) update_2 = tf.assign_moving_average(var, 0.0, 0.9) # var.name: 'scope1/scope2/foo' # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased' # 'scope1/scope2/scope1/scope2/foo/biased_1' ``` Args: variable: A Variable. value: A tensor with the same shape as 'variable'. decay: A float Tensor or float value. The moving average decay. zero_debias: A python bool. If true, assume the variable is 0-initialized and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in `_zero_debias` for more details. name: Optional name of the returned operation. Returns: A tensor which if evaluated will compute and return the new moving average. """ def update_fn(v, value, decay=decay): decay = ops.convert_to_tensor(1.0 - decay, name="decay") if decay.dtype != v.dtype.base_dtype: decay = math_ops.cast(decay, v.dtype.base_dtype) if zero_debias: update_delta = _zero_debias(v, value, decay) else: update_delta = (v - value) * decay return state_ops.assign_sub(v, update_delta, name=scope) with ops.name_scope(name, "AssignMovingAvg", [variable, value, decay]) as scope: tower_context = distribution_strategy_context.get_tower_context() if tower_context: # In a tower context, we update variable using the mean of value across # towers. def merge_fn(strategy, v, value): value = strategy.reduce( variable_scope.VariableAggregation.MEAN, value, v) return strategy.update(v, update_fn, value) return tower_context.merge_call(merge_fn, variable, value) else: strategy = distribution_strategy_context.get_cross_tower_context() return strategy.update(variable, update_fn, value)
def init_from_checkpoint(ckpt_dir_or_file, assignment_map): """Initializes current variables with tensors loaded from given checkpoint. Note: This overrides default initialization ops of specified variables and redefines dtype. Assignment map supports following syntax: * `'checkpoint_scope_name/': 'scope_name/'` - will load all variables in current `scope_name` from `checkpoint_scope_name` with matching tensor names. * `'checkpoint_scope_name/some_other_variable': 'scope_name/variable_name'` - will initialize `scope_name/variable_name` variable from `checkpoint_scope_name/some_other_variable`. * `'scope_variable_name': variable` - will initialize given `tf.Variable` object with tensor 'scope_variable_name' from the checkpoint. * `'scope_variable_name': list(variable)` - will initialize list of partitioned variables with tensor 'scope_variable_name' from the checkpoint. * `'/': 'scope_name/'` - will load all variables in current `scope_name` from checkpoint's root (e.g. no scope). Supports loading into partitioned variables, which are represented as `'<variable>/part_<part #>'`. Example: ```python # Say, '/tmp/model.ckpt' has the following tensors: # -- name='old_scope_1/var1', shape=[20, 2] # -- name='old_scope_1/var2', shape=[50, 4] # -- name='old_scope_2/var3', shape=[100, 100] # Create new model's variables with tf.variable_scope('new_scope_1'): var1 = tf.get_variable('var1', shape=[20, 2], initializer=tf.zeros_initializer()) with tf.variable_scope('new_scope_2'): var2 = tf.get_variable('var2', shape=[50, 4], initializer=tf.zeros_initializer()) # Partition into 5 variables along the first axis. var3 = tf.get_variable(name='var3', shape=[100, 100], initializer=tf.zeros_initializer(), partitioner=lambda shape, dtype: [5, 1]) # Initialize all variables in `new_scope_1` from `old_scope_1`. init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/': 'new_scope_1'}) # Use names to specify which variables to initialize from checkpoint. init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/var1': 'new_scope_1/var1', 'old_scope_1/var2': 'new_scope_2/var2'}) # Or use tf.Variable objects to identify what to initialize. init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/var1': var1, 'old_scope_1/var2': var2}) # Initialize partitioned variables using variable's name init_from_checkpoint('/tmp/model.ckpt', {'old_scope_2/var3': 'new_scope_2/var3'}) # Or specify the list of tf.Variable objects. init_from_checkpoint('/tmp/model.ckpt', {'old_scope_2/var3': var3._get_variable_list()}) ``` Args: ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint. assignment_map: Dict, where keys are names of the variables in the checkpoint and values are current variables or names of current variables (in default graph). Raises: tf.errors.OpError: If missing checkpoints or tensors in checkpoints. ValueError: If missing variables in current graph. """ if distribution_strategy_context.get_cross_tower_context(): _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map) else: distribution_strategy_context.get_tower_context().merge_call( _init_from_checkpoint, ckpt_dir_or_file, assignment_map)
def _as_graph_element(self): # pylint: disable=protected-access if distribution_strategy_context.get_cross_tower_context(): return self._get_cross_tower() return self.get()._as_graph_element()
def _as_graph_element(self): # pylint: disable=protected-access if distribution_strategy_context.get_cross_tower_context(): return self._get_cross_tower() return self.get()._as_graph_element()
def init_from_checkpoint(ckpt_dir_or_file, assignment_map): """Initializes current variables with tensors loaded from given checkpoint. Note: This overrides default initialization ops of specified variables and redefines dtype. Assignment map supports following syntax: * `'checkpoint_scope_name/': 'scope_name/'` - will load all variables in current `scope_name` from `checkpoint_scope_name` with matching tensor names. * `'checkpoint_scope_name/some_other_variable': 'scope_name/variable_name'` - will initialize `scope_name/variable_name` variable from `checkpoint_scope_name/some_other_variable`. * `'scope_variable_name': variable` - will initialize given `tf.Variable` object with tensor 'scope_variable_name' from the checkpoint. * `'scope_variable_name': list(variable)` - will initialize list of partitioned variables with tensor 'scope_variable_name' from the checkpoint. * `'/': 'scope_name/'` - will load all variables in current `scope_name` from checkpoint's root (e.g. no scope). Supports loading into partitioned variables, which are represented as `'<variable>/part_<part #>'`. Example: ```python # Say, '/tmp/model.ckpt' has the following tensors: # -- name='old_scope_1/var1', shape=[20, 2] # -- name='old_scope_1/var2', shape=[50, 4] # -- name='old_scope_2/var3', shape=[100, 100] # Create new model's variables with tf.variable_scope('new_scope_1'): var1 = tf.get_variable('var1', shape=[20, 2], initializer=tf.zeros_initializer()) with tf.variable_scope('new_scope_2'): var2 = tf.get_variable('var2', shape=[50, 4], initializer=tf.zeros_initializer()) # Partition into 5 variables along the first axis. var3 = tf.get_variable(name='var3', shape=[100, 100], initializer=tf.zeros_initializer(), partitioner=lambda shape, dtype: [5, 1]) # Initialize all variables in `new_scope_1` from `old_scope_1`. init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/': 'new_scope_1'}) # Use names to specify which variables to initialize from checkpoint. init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/var1': 'new_scope_1/var1', 'old_scope_1/var2': 'new_scope_2/var2'}) # Or use tf.Variable objects to identify what to initialize. init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/var1': var1, 'old_scope_1/var2': var2}) # Initialize partitioned variables using variable's name init_from_checkpoint('/tmp/model.ckpt', {'old_scope_2/var3': 'new_scope_2/var3'}) # Or specify the list of tf.Variable objects. init_from_checkpoint('/tmp/model.ckpt', {'old_scope_2/var3': var3._get_variable_list()}) ``` Args: ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint. assignment_map: Dict, where keys are names of the variables in the checkpoint and values are current variables or names of current variables (in default graph). Raises: tf.errors.OpError: If missing checkpoints or tensors in checkpoints. ValueError: If missing variables in current graph. """ if distribution_strategy_context.get_cross_tower_context(): _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map) else: distribution_strategy_context.get_tower_context().merge_call( _init_from_checkpoint, ckpt_dir_or_file, assignment_map)