def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This is the second part of `minimize()`. It returns an `Operation` that applies gradients. Args: grads_and_vars: List of (gradient, variable) pairs as returned by `compute_gradients()`. global_step: Optional `Variable` to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the `Optimizer` constructor. Returns: An `Operation` that applies the specified gradients. If `global_step` was not None, that operation also increments `global_step`. Raises: TypeError: If `grads_and_vars` is malformed. ValueError: If none of the variables have gradients. """ # This is a default implementation of apply_gradients() that can be shared # by most optimizers. It relies on the subclass implementing the following # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse(). grads_and_vars = tuple(grads_and_vars) # Make sure repeat iteration works for g, v in grads_and_vars: if not isinstance(g, (ops.Tensor, ops.IndexedSlices, type(None))): raise TypeError( "Gradient must be a Tensor, IndexedSlices, or None: %s" % g) if not isinstance(v, variables.Variable): raise TypeError( "Variable must be a tf.Variable: %s" % v) if g is not None: self._assert_valid_dtypes([g, v]) var_list = [v for g, v in grads_and_vars if g is not None] if not var_list: raise ValueError("No gradients provided for any variable: %s" % (grads_and_vars,)) with ops.control_dependencies(None): self._create_slots(var_list) update_ops = [] with ops.op_scope([], name, self._name) as name: self._prepare() for grad, var in grads_and_vars: if grad is None: continue # We colocate all ops created in _apply_dense or _apply_sparse # on the same device as the variable. with ops.name_scope("update_" + var.op.name), ops.colocate_with(var): if isinstance(grad, ops.Tensor): update_ops.append(self._apply_dense(grad, var)) else: update_ops.append(self._apply_sparse(grad, var)) if global_step is None: return self._finish(update_ops, name) else: with ops.control_dependencies([self._finish(update_ops, "update")]): with ops.colocate_with(global_step): return state_ops.assign_add(global_step, 1, name=name).op
def testMultiColocationGroups(self): a = constant_op.constant([2.0], name="a") b = constant_op.constant(3.0, name="b") with ops.colocate_with(a.op): with ops.colocate_with(b.op): c = constant_op.constant(4.0) self.assertEqual(set([b"loc:@a", b"loc:@b"]), set(c.op.colocation_groups()))
def _create_variable(self, next_creator, *args, **kwargs): if self._num_replicas_in_sync > 1: aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE) if aggregation not in ( vs.VariableAggregation.NONE, vs.VariableAggregation.SUM, vs.VariableAggregation.MEAN, vs.VariableAggregation.ONLY_FIRST_REPLICA ): raise ValueError("Invalid variable aggregation mode: " + aggregation + " for variable: " + kwargs["name"]) def var_creator(*args, **kwargs): """Create an AggregatingVariable and fix up collections.""" # Record what collections this variable should be added to. collections = kwargs.pop("collections", None) if collections is None: collections = [ops.GraphKeys.GLOBAL_VARIABLES] kwargs["collections"] = [] # Create and wrap the variable. v = next_creator(*args, **kwargs) wrapped = values.AggregatingVariable( self._container_strategy(), v, aggregation) # Add the wrapped variable to the requested collections. # The handling of eager mode and the global step matches # ResourceVariable._init_from_args(). if not context.executing_eagerly(): g = ops.get_default_graph() # If "trainable" is True, next_creator() will add the contained # variable to the TRAINABLE_VARIABLES collection, so we manually # remove it and replace with the wrapper. We can't set "trainable" # to False for next_creator() since that causes functions like # implicit_gradients to skip those variables. if kwargs.get("trainable", True): collections.append(ops.GraphKeys.TRAINABLE_VARIABLES) l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES) if v in l: l.remove(v) g.add_to_collections(collections, wrapped) elif ops.GraphKeys.GLOBAL_STEP in collections: ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped) return wrapped else: var_creator = next_creator if "colocate_with" in kwargs: colocate_with = kwargs["colocate_with"] if isinstance(colocate_with, numpy_dataset.SingleDevice): with ops.device(colocate_with.device): return var_creator(*args, **kwargs) with ops.device(None): with ops.colocate_with(colocate_with): return var_creator(*args, **kwargs) with ops.colocate_with(None, ignore_existing=True): with ops.device(self._variable_device): return var_creator(*args, **kwargs)
def testColocationIgnoreStack(self): a = constant_op.constant([2.0], name="a") b = constant_op.constant(3.0, name="b") with ops.colocate_with(a.op): with ops.colocate_with(b.op, ignore_existing=True): c = constant_op.constant(4.0) self.assertEqual(set([b"loc:@b"]), set(c.op.colocation_groups()))
def _f(): # Note that there is a race condition here, so we do a best effort # updates here. We reset update_in_steps first so that other workers # don't duplicate the updates. Also we update cluster_center_vars # before resetting total_counts to avoid large updates to # cluster_centers_updated based on partially updated # cluster_center_vars. with ops.control_dependencies([ state_ops.assign(update_in_steps, self._mini_batch_steps_per_iteration - 1) ]): with ops.colocate_with( cluster_centers_updated, ignore_existing=True): if self._distance_metric == COSINE_DISTANCE: cluster_centers = nn_impl.l2_normalize( cluster_centers_updated, dim=1) else: cluster_centers = cluster_centers_updated with ops.colocate_with(cluster_centers_var, ignore_existing=True): with ops.control_dependencies( [state_ops.assign(cluster_centers_var, cluster_centers)]): with ops.colocate_with(None, ignore_existing=True): with ops.control_dependencies([ state_ops.assign(total_counts, array_ops.zeros_like(total_counts)) ]): return array_ops.identity(update_in_steps)
def _full_batch_training_op(self, inputs, cluster_idx_list, cluster_centers): """Creates an op for training for full batch case. Args: inputs: list of input Tensors. cluster_idx_list: A vector (or list of vectors). Each element in the vector corresponds to an input row in 'inp' and specifies the cluster id corresponding to the input. cluster_centers: Tensor Ref of cluster centers. Returns: An op for doing an update of mini-batch k-means. """ cluster_sums = [] cluster_counts = [] epsilon = constant_op.constant(1e-6, dtype=inputs[0].dtype) for inp, cluster_idx in zip(inputs, cluster_idx_list): with ops.colocate_with(inp): cluster_sums.append( math_ops.unsorted_segment_sum(inp, cluster_idx, self._num_clusters)) cluster_counts.append( math_ops.unsorted_segment_sum( array_ops.reshape( array_ops.ones( array_ops.reshape(array_ops.shape(inp)[0], [-1])), [-1, 1]), cluster_idx, self._num_clusters)) with ops.colocate_with(cluster_centers): new_clusters_centers = math_ops.add_n(cluster_sums) / (math_ops.cast( math_ops.add_n(cluster_counts), cluster_sums[0].dtype) + epsilon) if self._clusters_l2_normalized(): new_clusters_centers = nn_impl.l2_normalize(new_clusters_centers, dim=1) return state_ops.assign(cluster_centers, new_clusters_centers)
def testColocateWithBeforeCond(self): with ops.Graph().as_default() as g: with self.session(graph=g): a = constant_op.constant([2.0], name="a") b = constant_op.constant([2.0], name="b") def fn(): c = constant_op.constant(3.0) self.assertEqual([b"loc:@a"], c.op.colocation_groups()) return c with ops.colocate_with(a.op): self.assertEquals( cond_v2.cond_v2(constant_op.constant(True), fn, fn).eval(), 3) def fn2(): c = constant_op.constant(3.0) self.assertEqual([b"loc:@a", b"loc:@b"], c.op.colocation_groups()) return c with ops.colocate_with(a.op): with ops.colocate_with(b.op): self.assertEquals( cond_v2.cond_v2(constant_op.constant(True), fn2, fn2).eval(), 3)
def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" stddev = math_ops.sqrt(variance + self.epsilon) # Compute the average mean and standard deviation, as if they were # initialized with this batch's moments. mixed_renorm_mean = (self.renorm_mean + (1. - self.renorm_mean_weight) * mean) mixed_renorm_stddev = (self.renorm_stddev + (1. - self.renorm_stddev_weight) * stddev) # Compute the corrections for batch renorm. r = stddev / mixed_renorm_stddev d = (mean - mixed_renorm_mean) / mixed_renorm_stddev # Ensure the corrections use pre-update moving averages. with ops.control_dependencies([r, d]): mean = array_ops.identity(mean) stddev = array_ops.identity(stddev) rmin, rmax, dmax = [self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']] if rmin is not None: r = math_ops.maximum(r, rmin) if rmax is not None: r = math_ops.minimum(r, rmax) if dmax is not None: d = math_ops.maximum(d, -dmax) d = math_ops.minimum(d, dmax) # When not training, use r=1, d=0, and decay=1 meaning no updates. r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r)) d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d)) decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.) def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" # Update the variables without zero debiasing. The debiasing will be # accomplished by dividing the exponential moving average by the weight. # For example, after a single update, the moving average would be # (1-decay) * value. and the weight will be 1-decay, with their ratio # giving value. # Make sure the weight is not updated until before r and d computation. value = array_ops.identity(value) with ops.control_dependencies([value]): weight_value = array_ops.constant(1., dtype=weight.dtype) new_var = moving_averages.assign_moving_average( var, value, decay, zero_debias=False) new_weight = moving_averages.assign_moving_average( weight, weight_value, decay, zero_debias=False) return new_var / new_weight with ops.colocate_with(self.moving_mean): new_mean = _update_renorm_variable(self.renorm_mean, self.renorm_mean_weight, mean) with ops.colocate_with(self.moving_variance): new_stddev = _update_renorm_variable(self.renorm_stddev, self.renorm_stddev_weight, stddev) # Make sqrt(moving_variance + epsilon) = new_stddev. new_variance = math_ops.square(new_stddev) - self.epsilon return (r, d, new_mean, new_variance)
def batch_norm(input_, dim, name, scale=True, train=True, epsilon=1e-8, decay=.1, axes=[0], bn_lag=DEFAULT_BN_LAG): """Batch normalization.""" # create variables with tf.variable_scope(name): var = variable_on_cpu( "var", [dim], tf.constant_initializer(1.), trainable=False) mean = variable_on_cpu( "mean", [dim], tf.constant_initializer(0.), trainable=False) step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False) if scale: gamma = variable_on_cpu("gamma", [dim], tf.constant_initializer(1.)) beta = variable_on_cpu("beta", [dim], tf.constant_initializer(0.)) # choose the appropriate moments if train: used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm") cur_mean, cur_var = used_mean, used_var if bn_lag > 0.: used_mean -= (1. - bn_lag) * (used_mean - tf.stop_gradient(mean)) used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var)) used_mean /= (1. - bn_lag**(step + 1)) used_var /= (1. - bn_lag**(step + 1)) else: used_mean, used_var = mean, var cur_mean, cur_var = used_mean, used_var # normalize res = (input_ - used_mean) / tf.sqrt(used_var + epsilon) # de-normalize if scale: res *= gamma res += beta # update variables if train: with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]): with ops.colocate_with(mean): new_mean = tf.assign_sub( mean, tf.check_numerics(decay * (mean - cur_mean), "NaN in moving mean.")) with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]): with ops.colocate_with(var): new_var = tf.assign_sub( var, tf.check_numerics(decay * (var - cur_var), "NaN in moving variance.")) with tf.name_scope(name, "IncrementTime", [step]): with ops.colocate_with(step): new_step = tf.assign_add(step, 1.) res += 0. * new_mean * new_var * new_step return res
def testNestedColocateWith(self): a = constant_op.constant([2.0], name="a") with ops.colocate_with(a.op): b = constant_op.constant(3.0) with ops.colocate_with(b.op): c = constant_op.constant(4.0) self.assertEqual([b"loc:@a"], b.op.colocation_groups()) self.assertEqual([b"loc:@a"], c.op.colocation_groups())
def add_variable(self, name, shape, dtype=None, initializer=None, regularizer=None, trainable=True): """Adds a new variable to the layer, or gets an existing one; returns it. Arguments: name: variable name. shape: variable shape. dtype: The type of the variable. Defaults to `self.dtype`. initializer: initializer instance (callable). regularizer: regularizer instance (callable). trainable: whether the variable should be part of the layer's "trainable_variables" (e.g. variables, biases) or "non_trainable_variables" (e.g. BatchNorm mean, stddev). Returns: The created variable. """ if dtype is None: dtype = self.dtype existing_variables = set(tf_variables.global_variables()) self._set_scope(None) with vs.variable_scope(self._scope, reuse=self.built or self._reuse) as scope: with ops.name_scope(scope.original_name_scope): variable = vs.get_variable(name, shape=shape, initializer=initializer, dtype=dtypes.as_dtype(dtype), trainable=trainable and self.trainable) if variable in existing_variables: return variable if regularizer: # To match the behavior of tf.get_variable(), we only # apply regularization if the variable is newly created. if isinstance(variable, tf_variables.PartitionedVariable): for v in variable: with ops.colocate_with(v.op): with ops.name_scope(name + '/Regularizer'): regularization = regularizer(v) if regularization is not None: self.add_loss(regularization) _add_elements_to_collection( regularization, ops.GraphKeys.REGULARIZATION_LOSSES) else: with ops.colocate_with(variable.op): with ops.name_scope(name + '/Regularizer'): regularization = regularizer(variable) if regularization is not None: self.add_loss(regularization) _add_elements_to_collection( regularization, ops.GraphKeys.REGULARIZATION_LOSSES) if trainable: self._trainable_weights.append(variable) else: self._non_trainable_weights.append(variable) return variable
def batch_norm_log_diff(input_, dim, name, train=True, epsilon=1e-8, decay=.1, axes=[0], reuse=None, bn_lag=DEFAULT_BN_LAG): """Batch normalization with corresponding log determinant Jacobian.""" if reuse is None: reuse = not train # create variables with tf.variable_scope(name) as scope: if reuse: scope.reuse_variables() var = variable_on_cpu( "var", [dim], tf.constant_initializer(1.), trainable=False) mean = variable_on_cpu( "mean", [dim], tf.constant_initializer(0.), trainable=False) step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False) # choose the appropriate moments if train: used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm") cur_mean, cur_var = used_mean, used_var if bn_lag > 0.: used_var = stable_var(input_=input_, mean=used_mean, axes=axes) cur_var = used_var used_mean -= (1 - bn_lag) * (used_mean - tf.stop_gradient(mean)) used_mean /= (1. - bn_lag**(step + 1)) used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var)) used_var /= (1. - bn_lag**(step + 1)) else: used_mean, used_var = mean, var cur_mean, cur_var = used_mean, used_var # update variables if train: with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]): with ops.colocate_with(mean): new_mean = tf.assign_sub( mean, tf.check_numerics( decay * (mean - cur_mean), "NaN in moving mean.")) with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]): with ops.colocate_with(var): new_var = tf.assign_sub( var, tf.check_numerics(decay * (var - cur_var), "NaN in moving variance.")) with tf.name_scope(name, "IncrementTime", [step]): with ops.colocate_with(step): new_step = tf.assign_add(step, 1.) used_var += 0. * new_mean * new_var * new_step used_var += epsilon return used_mean, used_var
def _create_variable(self, next_creator, *args, **kwargs): if "colocate_with" in kwargs: with ops.device(None): with ops.colocate_with(kwargs["colocate_with"]): return next_creator(*args, **kwargs) with ops.colocate_with(None, ignore_existing=True): with ops.device(self._variable_device): return next_creator(*args, **kwargs)
def _add_variable( self, name, shape, dtype=None, initializer=None, regularizer=None, trainable=True, variable_getter=vs.get_variable, ): """Adds a new variable to the layer. Arguments: name: variable name. shape: variable shape. dtype: The type of the variable. Defaults to `self.dtype`. initializer: initializer instance (callable). regularizer: regularizer instance (callable). trainable: whether the variable should be part of the layer's "trainable_variables" (e.g. variables, biases) or "non_trainable_variables" (e.g. BatchNorm mean, stddev). variable_getter: The getter to use for TensorFlow variables. Returns: The created variable. """ if dtype is None: dtype = self.dtype existing_variables = set(tf_variables.global_variables()) variable = variable_getter( name, shape=shape, initializer=initializer, dtype=dtype, trainable=trainable and self.trainable ) # TODO(sguada) fix name = variable.op.name if regularizer: if not self._reuse and variable not in existing_variables: # To match the behavior of tf.get_variable(), we only # apply regularization if the variable is newly created. if isinstance(variable, tf_variables.PartitionedVariable): for v in variable: with ops.colocate_with(v.op): with ops.name_scope(name + "/Regularizer"): regularization = regularizer(v) if regularization is not None: self._losses.append(regularization) _add_elements_to_collection(regularization, ops.GraphKeys.REGULARIZATION_LOSSES) else: with ops.colocate_with(variable.op): with ops.name_scope(name + "/Regularizer"): regularization = regularizer(variable) if regularization is not None: self._losses.append(regularization) _add_elements_to_collection(regularization, ops.GraphKeys.REGULARIZATION_LOSSES) if trainable: self._trainable_variables.append(variable) else: self._non_trainable_variables.append(variable) return variable
def maybe_colocate_with(op): """Context to colocate with `op` if `COLOCATE_COV_OPS_WITH_INPUTS`.""" if COLOCATE_COV_OPS_WITH_INPUTS: if isinstance(op, (list, tuple)): with tf_ops.colocate_with(op[0]): yield else: with tf_ops.colocate_with(op): yield else: yield
def _maybe_colocate_with(op, colocate_cov_ops_with_inputs): """Context to colocate with `op` if `colocate_cov_ops_with_inputs`.""" if colocate_cov_ops_with_inputs: if isinstance(op, (list, tuple)): with tf_ops.colocate_with(op[0]): yield else: with tf_ops.colocate_with(op): yield else: yield
def create_axis_ops(sp_input, num_items, update_fn, axis_name): """Creates book-keeping and training ops for a given axis. Args: sp_input: A SparseTensor corresponding to the row or column batch. num_items: An integer, the total number of items of this axis. update_fn: A function that takes one argument (`sp_input`), and that returns a tuple of * new_factors: A flot Tensor of the factor values after update. * update_op: a TensorFlow op which updates the factors. * loss: A float Tensor, the unregularized loss. * reg_loss: A float Tensor, the regularization loss. * sum_weights: A float Tensor, the sum of factor weights. axis_name: A string that specifies the name of the axis. Returns: A tuple consisting of: * reset_processed_items_op: A TensorFlow op, to be run before the beginning of any sweep. It marks all items as not-processed. * axis_train_op: A Tensorflow op, to be run during this axis' sweeps. """ processed_items_init = array_ops.fill(dims=[num_items], value=False) with ops.colocate_with(processed_items_init): processed_items = variable_scope.variable( processed_items_init, collections=[ops.GraphKeys.GLOBAL_VARIABLES], trainable=False, name="processed_" + axis_name) reset_processed_items_op = state_ops.assign( processed_items, processed_items_init, name="reset_processed_" + axis_name) _, update_op, loss, reg, sum_weights = update_fn(sp_input) input_indices = sp_input.indices[:, 0] with ops.control_dependencies([ update_op, state_ops.assign(loss_var, loss + reg), state_ops.assign(rwse_var, math_ops.sqrt(loss / sum_weights))]): with ops.colocate_with(processed_items): update_processed_items = state_ops.scatter_update( processed_items, input_indices, array_ops.ones_like(input_indices, dtype=dtypes.bool), name="update_processed_{}_indices".format(axis_name)) with ops.control_dependencies([update_processed_items]): is_sweep_done = math_ops.reduce_all(processed_items) axis_train_op = control_flow_ops.group( global_step_incr_op, state_ops.assign(is_sweep_done_var, is_sweep_done), state_ops.assign_add( completed_sweeps_var, math_ops.cast(is_sweep_done, dtypes.int32)), name="{}_sweep_train_op".format(axis_name)) return reset_processed_items_op, axis_train_op
def _cached_copy(self, var, name, pass_through=False): """Helper function to create a worker cached copy of a Variable. This assigns the var (either a single Variable or a list of Variables) to local transient cache Variable(s). Note that if var is a list of Variables, the assignment is done sequentially to minimize the memory overheads. Also note that if pass_through is set to True, this does not create new Variables but simply return the input back. Args: var: A Variable or a list of Variables to cache. name: name of cached Variable. pass_through: when set to True, this simply pass through the var back through identity operator and does not actually creates a cache. Returns: Tuple consisting of following three entries: cache: the new transient Variable or list of transient Variables corresponding one-to-one with var. cache_init: op to initialize the Variable or the list of Variables. cache_reset: op to reset the Variable or the list of Variables to some default value. """ if var is None: return None, None, None elif pass_through: cache = var cache_init = control_flow_ops.no_op() cache_reset = control_flow_ops.no_op() elif isinstance(var, variables.Variable): cache = WALSModel._transient_var(name=name) with ops.colocate_with(cache): cache_init = state_ops.assign(cache, var, validate_shape=False) cache_reset = state_ops.assign(cache, 1.0, validate_shape=False) else: assert isinstance(var, list) assert var cache = [ WALSModel._transient_var(name="%s_shard_%d" % (name, i)) for i in xrange(len(var)) ] reset_ops = [] for i, c in enumerate(cache): with ops.colocate_with(c): if i == 0: cache_init = state_ops.assign(c, var[i], validate_shape=False) else: with ops.control_dependencies([cache_init]): cache_init = state_ops.assign(c, var[i], validate_shape=False) reset_ops.append(state_ops.assign(c, 1.0, validate_shape=False)) cache_reset = control_flow_ops.group(*reset_ops) return cache, cache_init, cache_reset
def _GatherGrad(op, grad): """Gradient for Gather op.""" params = op.inputs[0] indices = op.inputs[1] # Colocate shape computations with params and indices with ops.colocate_with(params): params_shape = array_ops.shape(params) with ops.colocate_with(indices): size = array_ops.expand_dims(array_ops.size(indices), 0) values_shape = array_ops.concat(0, [size, params_shape[1:]]) values = array_ops.reshape(grad, values_shape) indices = array_ops.reshape(indices, size) return [ops.IndexedSlices(values, indices, params_shape), None]
def _initialize_variables(self, data, initial_means=None): """Initializes variables. Args: data: a list of Tensors with data, each row is a new example. initial_means: a Tensor with a matrix of means. """ first_shard = data[0] # Initialize means: num_classes X 1 X dimensions. if initial_means is not None: means = array_ops.expand_dims(initial_means, 1) else: # Sample data randomly means = array_ops.expand_dims( _init_clusters_random(data, self._num_classes, self._random_seed), 1) # Initialize covariances. if self._covariance_type == FULL_COVARIANCE: cov = _covariance(first_shard, False) + self._min_var # A matrix per class, num_classes X dimensions X dimensions covs = array_ops.tile( array_ops.expand_dims(cov, 0), [self._num_classes, 1, 1]) elif self._covariance_type == DIAG_COVARIANCE: cov = _covariance(first_shard, True) + self._min_var # A diagonal per row, num_classes X dimensions. covs = array_ops.tile( array_ops.expand_dims(array_ops.diag_part(cov), 0), [self._num_classes, 1]) with ops.colocate_with(self._cluster_centers_initialized): initialized = control_flow_ops.with_dependencies( [means, covs], array_ops.identity(self._cluster_centers_initialized)) self._init_ops = [] with ops.colocate_with(self._means): init_means = state_ops.assign(self._means, means, validate_shape=False) init_means = control_flow_ops.with_dependencies( [init_means], state_ops.assign(self._cluster_centers_initialized, True)) self._init_ops.append(control_flow_ops.cond(initialized, control_flow_ops.no_op, lambda: init_means).op) with ops.colocate_with(self._covs): init_covs = state_ops.assign(self._covs, covs, validate_shape=False) init_covs = control_flow_ops.with_dependencies( [init_covs], state_ops.assign(self._cluster_centers_initialized, True)) self._init_ops.append(control_flow_ops.cond(initialized, control_flow_ops.no_op, lambda: init_covs).op)
def _train_op_fn(loss): """Returns the op to optimize the loss.""" update_op = gbdt_model_main.train(loss, predictions_dict, labels) with ops.control_dependencies( [update_op]), (ops.colocate_with(global_step)): update_op = state_ops.assign_add(global_step, 1).op return update_op
def init_variable(v, init, name="init"): """Initializes variable with "init". This op does the following: if init is a Tensor, v = init if callable(init): v = init(VariableShape(v), v.dtype) Args: v: Variable to initialize init: Tensor to assign to v, Or an object convertible to Tensor e.g. nparray, Or an Initializer that generates a tensor given the shape and type of v. An "Initializer" is a callable that returns a tensor that "v" should be set to. It will be called as init(shape, dtype). name: Optional name for the op. Returns: The operation that initializes v. """ with ops.name_scope(None, v.op.name + "/", [v, init]): with ops.name_scope(name) as scope: with ops.colocate_with(v): if callable(init): assert v.get_shape().is_fully_defined(), "Variable shape unknown." # TODO(mrry): Convert to v.shape when the property and # accessor are reconciled (and all initializers support # tf.TensorShape objects). value = init(v.get_shape().as_list(), v.dtype.base_dtype) value = ops.convert_to_tensor(value, name="value") return gen_state_ops.assign(v, value, name=scope) else: init = ops.convert_to_tensor(init, name="init") return gen_state_ops.assign(v, init, name=scope)
def _l2_normalize_data(cls, inputs): """Normalized the input data.""" output = [] for inp in inputs: with ops.colocate_with(inp): output.append(nn_impl.l2_normalize(inp, dim=1)) return output
def _maybe_colocate_with(op, colocate_gradients_with_ops): """Context to colocate with `op` if `colocate_gradients_with_ops`.""" if colocate_gradients_with_ops: with ops.colocate_with(op): yield else: yield
def _compute_euclidean_distance(cls, inputs, clusters): """Computes Euclidean distance between each input and each cluster center. Args: inputs: list of input Tensors. clusters: cluster Tensor. Returns: list of Tensors, where each element corresponds to each element in inputs. The value is the distance of each row to all the cluster centers. """ output = [] for inp in inputs: with ops.colocate_with(inp, ignore_existing=True): # Computes Euclidean distance. Note the first and third terms are # broadcast additions. squared_distance = ( math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) - 2 * math_ops.matmul(inp, clusters, transpose_b=True) + array_ops.transpose( math_ops.reduce_sum( math_ops.square(clusters), 1, keep_dims=True))) output.append(squared_distance) return output
def build_nccl_all_reduce(input_tensors, red_op, un_op=None): """Build a subgraph that does one full all-reduce, using NCCL. Args: input_tensors: list of T `tf.Tensor` of same-shape and type values to be reduced. red_op: binary elementwise reduction operator. Must be one of {tf.add} un_op: optional unary elementwise Op to apply to fully-reduce values. Returns: list of T `tf.Tensor` of reduced values. Raises: ValueError: red_op not supported. """ if red_op == math_ops.add: output_tensors = nccl_ops.all_sum(input_tensors) else: raise ValueError("red_op not supported by NCCL all-reduce: ", red_op) if un_op: un_op_wrapped = [] for t in output_tensors: with ops.colocate_with(t): un_op_wrapped.append(un_op(t)) output_tensors = un_op_wrapped return output_tensors
def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. Calls tpu_ops.cross_replica_sum() to sum gradient contributions across replicas, and then applies the real optimizer. Args: grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients(). global_step: Optional Variable to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: An `Operation` that applies the gradients. If `global_step` was not None, that operation also increments `global_step`. Raises: ValueError: If the grads_and_vars is malformed. """ summed_grads_and_vars = [] for (grad, var) in grads_and_vars: if grad is None: summed_grads_and_vars.append((grad, var)) else: with ops.colocate_with(grad): summed_grads_and_vars.append((tpu_ops.cross_replica_sum( grad, self._group_assignment), var)) return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
def _SumGrad(op, grad): """Gradient for Sum.""" # Fast path for when reducing to a scalar and ndims is known: adds only # Reshape and Tile ops (and possibly a Shape). input_0_shape = op.inputs[0]._shape_tuple() # pylint: disable=protected-access if input_0_shape is not None: axes = tensor_util.constant_value(op.inputs[1]) if axes is not None: rank = len(input_0_shape) if np.array_equal(axes, np.arange(rank)): # Reduce all dims. grad = array_ops.reshape(grad, [1] * rank) # If shape is not fully defined (but rank is), we use Shape. if None not in input_0_shape: input_shape = input_0_shape else: input_shape = array_ops.shape(op.inputs[0]) return [array_ops.tile(grad, input_shape), None] input_shape = array_ops.shape(op.inputs[0]) # TODO(apassos) remove this once device placement for eager ops makes more # sense. with ops.colocate_with(input_shape): output_shape_kept_dims = math_ops.reduced_shape(input_shape, op.inputs[1]) tile_scaling = _safe_shape_div(input_shape, output_shape_kept_dims) grad = array_ops.reshape(grad, output_shape_kept_dims) return [array_ops.tile(grad, tile_scaling), None]
def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None): """Construct the gather (concentrate and reduce) phase of shuffle all-reduce. Args: input_tensors: list of T @(tf.Tensor} values to be reduced. gather_devices: list of names of devices on which reduction shards should be placed. red_op: the binary reduction Op un_op: optional elementwise unary Op to be applied to fully-reduced values. Returns: list of T `tf.Tensor` which are the fully reduced shards. Raises: ValueError: inputs not well-formed. """ num_source_devices = len(input_tensors) num_gather_devices = len(gather_devices) shape = input_tensors[0].shape if len(shape) != 1: raise ValueError("input_tensors must be 1D") shards_by_source = [] for d in range(0, num_source_devices): with ops.colocate_with(input_tensors[d]): shards_by_source.append( _ragged_split(input_tensors[d], num_gather_devices)) reduced_shards = [] for d in range(0, num_gather_devices): with ops.device(gather_devices[d]): values = [s[d] for s in shards_by_source] red_shard = red_op(values) if un_op: red_shard = un_op(red_shard) reduced_shards.append(red_shard) return reduced_shards
def _flatten_tensors(tensors): """Check tensors for isomorphism and flatten. Args: tensors: list of T `tf.Tensor` which must all have the same shape. Returns: tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors shape: the original shape of each element of input tensors Raises: ValueError: tensors are empty or non-isomorphic or have unknown shape. """ if not tensors: raise ValueError("tensors cannot be empty") shape = tensors[0].shape for tensor in tensors: shape = shape.merge_with(tensor.shape) if not shape.is_fully_defined(): raise ValueError("Tensors must have statically known shape.") if len(shape) != 1: reshaped = [] for t in tensors: with ops.colocate_with(t): reshaped.append(array_ops.reshape(t, [-1])) tensors = reshaped return tensors, shape
def __init__( self, key_dtype=dtypes.int64, value_dtype=dtypes.float32, dim=1, devices=None, partitioner=default_partition_fn, shared_name=None, name="DynamicEmbedding_Variable", initializer=None, trainable=True, checkpoint=True, init_size=0, kv_creator=None, restrict_policy=None, bp_v2=False, ): """Creates an empty `Variable` object. Creates a group of tables placed on devices specified by `devices`, and the device placement mechanism of TensorFlow will be ignored, the type of its keys and values are specified by key_dtype and value_dtype, respectively. The environment variables 'TF_HASHTABLE_INIT_SIZE' can be used to set the inital size of each tables, which can help reduce rehash times. The default initial table size is 8,192 Args: key_dtype: the type of the key tensors. value_dtype: the type of the value tensors. dim: the length of the value array for each key, on GPUs, `dim` should be less or equal to 200. devices: the list of devices holding the tables. One table will be created on each device. By default, `devices` is ['/CPU:0'] and when GPU is available, `devices` is ['/GPU:0'] partitioner: partition function of keys, return the partition index for each key. Example partition func: ```python def default_partition_fn(keys, shard_num): return tf.cast(keys % shard_num, dtype=tf.int32) ``` shared_name: No used. name: A name for the operation (optional). initializer: The value to use if a key is missing in the hash table. which can be a python number, numpy array or `tf.initializer` instances. If initializer is `None` (the default), `0` will be taken. trainable: Bool. If true, the variable will be treated as a trainable. Default is true. checkpoint: if True, the contents of the SparseVariable are saved to and restored from checkpoints. If `shared_name` is empty for a checkpointed table, it is shared using the table node name. init_size: initial size for the Variable and initial size of each hash tables will be int(init_size / N), N is the number of the devices. restrict_policy: a restrict policy to specify the rule to restrict the size of variable. If in training program, the variable is updated by optimizer, then the sparse slot variables in optimizer are also be restricted. bp_v2: By default with `bp_v2=False`, the optimizer will update dynamic embedding values by *setting* (key, value) after `optimizer.apply_gradient`. If one key is used by multiple workers at the same time, only one of them will be seen, while the others are overwritten. By setting `bp_v2=True`, the optimizer will update parameters by *adding delta* instead of *setting*, which solves the race condition problem among workers during backpropagation in large-scale distributed asynchronous training. Returns: A `Variable` object. """ self.key_dtype = key_dtype self.value_dtype = value_dtype self.dim = dim self.bp_v2 = bp_v2 def _get_default_devices(): gpu_list = [ x.name for x in device_lib.list_local_devices() if x.device_type == "GPU" ] return gpu_list[0:1] or [ "/CPU:0", ] devices_ = devices or _get_default_devices() self.devices = (devices_ if isinstance(devices_, list) else [ devices, ]) self.partition_fn = partitioner self.name = name self.shared_name = shared_name or "shared_name.{}".format(name) self.initializer = None self.trainable = trainable self.checkpoint = checkpoint self._tables = data_structures.ListWrapper([]) self._track_trackable(self._tables, 'tables_of_{}'.format(self.name), overwrite=True) self.size_ops = [] self._trainable_store = {} self.kv_creator = kv_creator if kv_creator else de.CuckooHashTableCreator() self.shard_num = len(self.devices) self.init_size = int(init_size) if restrict_policy is not None: if not issubclass(restrict_policy, de.RestrictPolicy): raise TypeError('restrict_policy must be subclass of RestrictPolicy.') self._restrict_policy = restrict_policy(self) else: self._restrict_policy = None valid_dtype_list = [[dtypes.int64, dtypes.float32], [dtypes.int64, dtypes.half], [dtypes.int64, dtypes.int32], [dtypes.int64, dtypes.int8], [dtypes.int64, dtypes.int64], [dtypes.int64, dtypes.float64], [dtypes.int64, dtypes.string], [dtypes.int32, dtypes.float32], [dtypes.int32, dtypes.int32], [dtypes.int32, dtypes.float64], [dtypes.string, dtypes.float32], [dtypes.string, dtypes.half], [dtypes.string, dtypes.int32], [dtypes.string, dtypes.int8], [dtypes.string, dtypes.int64], [dtypes.string, dtypes.float64], [dtypes.string, dtypes.bool]] if "GPU" in self.devices[0].upper(): valid_dtype_list = [ [dtypes.int64, dtypes.float32], [dtypes.int64, dtypes.half], [dtypes.int64, dtypes.int32], [dtypes.int64, dtypes.int8], [dtypes.int64, dtypes.int64], [dtypes.int32, dtypes.float32], ] if is_macos() and is_arm64(): if value_dtype == dtypes.half: raise TypeError(""" float16 value dtype is not supported on macOS with ARM64 architecture. Please try another type. """) if [key_dtype, value_dtype] not in valid_dtype_list: raise TypeError( "key-value dtype ({}-{}) is not support! The valid dtypes are \n{}\n". format(key_dtype, value_dtype, valid_dtype_list)) _initializer = initializer if _initializer is None: _initializer = init_ops.zeros_initializer(dtype=self.value_dtype) static_default_value = self._convert_anything_to_init(_initializer, dim) scope_name = self.name.split("/")[-1] with ops.name_scope(scope_name, "DynamicEmbedding_Variable"): with ops.colocate_with(None, ignore_existing=True): for idx in range(len(self.devices)): with ops.device(self.devices[idx]): mht = None if not issubclass(self.kv_creator.__class__, de.KVCreator): raise TypeError("config should be instance of 'config', but got ", str(type(self.kv_creator))) mht = self.kv_creator.create( key_dtype=self.key_dtype, value_dtype=self.value_dtype, default_value=static_default_value, name=self._make_name(idx), checkpoint=self.checkpoint, init_size=int(self.init_size / self.shard_num), ) self._tables.append(mht)
def _GatherV2Grad(op, grad): """Gradient for GatherV2 op.""" # params can be large, so colocate the shape calculation with it. # # params can be very large for sparse model, array_ops.shape raises # exception on the Windows platform when any dimension is larger than # int32. params_shape is not used in optimizer apply_sparse gradients, # so it's fine to convert it back to int32 regardless of truncation. params = op.inputs[0] with ops.colocate_with(params): params_shape = array_ops.shape(params, out_type=ops.dtypes.int64) params_shape = math_ops.cast(params_shape, dtypes.int32) indices = op.inputs[1] indices_size = array_ops.expand_dims(array_ops.size(indices), 0) axis = op.inputs[2] axis_static = tensor_util.constant_value(axis) batch_dims = int(op.get_attr("batch_dims")) if batch_dims < 0: batch_dims += indices.shape.ndims # For axis 0 gathers, build an appropriately shaped IndexedSlices. if axis_static == 0: if context.executing_eagerly(): params_tail_shape = params_shape.cpu()[1:] else: params_tail_shape = params_shape[1:] values_shape = array_ops.concat([indices_size, params_tail_shape], 0) values = array_ops.reshape(_IndexedSlicesToTensorNoWarning(grad), values_shape) indices = array_ops.reshape(indices, indices_size) params_grad = ops.IndexedSlices(values, indices, params_shape) else: # Handle axis by transposing the axis dimension to be the first non-batch # dimension, compute the gradiend and transpose the result back. outer_shape = params_shape[:axis] inner_shape = params_shape[axis:][1:] values_shape = array_ops.concat([outer_shape, [-1], inner_shape], 0) values_dims = array_ops.size(values_shape) axis_dims = array_ops.size(outer_shape) outer_batches_indices = math_ops.range(batch_dims) batch_axis_indices = math_ops.range(batch_dims, axis_dims) inner_axes_indices = math_ops.range(axis_dims + 1, values_dims) values = array_ops.reshape(_IndexedSlicesToTensorNoWarning(grad), values_shape) # Move values[axis] up to values[batch_dims] transpose_dims = array_ops.concat([ outer_batches_indices, [axis_dims], batch_axis_indices, inner_axes_indices ], 0) values_transpose = array_ops.transpose(values, transpose_dims) params_grad = _BatchGatherGrad(params_shape, values_transpose, indices, batch_dims, params_shape[axis]) # Inverts the above transpose by moving dimension batch_dims back to its # original position. invert_transpose_dims = array_ops.concat([ outer_batches_indices, batch_axis_indices + 1, [batch_dims], inner_axes_indices ], 0) params_grad = array_ops.transpose(params_grad, invert_transpose_dims) return [params_grad, None, None]
def _init_from_args(self, initial_value=None, trainable=True, collections=None, validate_shape=True, caching_device=None, name=None, dtype=None, constraint=None): """Creates a variable. Args: initial_value: A `Tensor`, or Python object convertible to a `Tensor`, which is the initial value for the Variable. The initial value must have a shape specified unless `validate_shape` is set to False. Can also be a callable with no argument that returns the initial value when called. (Note that initializer functions from init_ops.py must first be bound to a shape before being used here.) trainable: If `True`, the default, also adds the variable to the graph collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as the default list of variables to use by the `Optimizer` classes. collections: List of graph collections keys. The new variable is added to these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. validate_shape: Ignored. Provided for compatibility with tf.Variable. caching_device: Optional device string or function describing where the Variable should be cached for reading. Defaults to the Variable's device. If not `None`, caches on another device. Typical use is to cache on the device where the Ops using the Variable reside, to deduplicate copying through `Switch` and other conditional statements. name: Optional name for the variable. Defaults to `'Variable'` and gets uniquified automatically. dtype: If set, initial_value will be converted to the given type. If None, either the datatype will be kept (if initial_value is a Tensor) or float32 will be used (if it is a Python object convertible to a Tensor). constraint: An optional projection function to be applied to the variable after being updated by an `Optimizer` (e.g. used to implement norm constraints or value constraints for layer weights). The function must take as input the unprojected Tensor representing the value of the variable and return the Tensor for the projected value (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. Raises: ValueError: If the initial value is not specified, or does not have a shape and `validate_shape` is `True`. @compatibility(eager) When Eager Execution is enabled, variables are never added to collections. It is not implicitly added to the GLOBAL_VARIABLES or TRAINABLE_VARIABLES collections, and the `collections` argument is ignored. @end_compatibility """ if initial_value is None: raise ValueError("initial_value must be specified.") init_from_fn = callable(initial_value) if collections is None: collections = [ops.GraphKeys.GLOBAL_VARIABLES] if not isinstance(collections, (list, tuple, set)): raise ValueError( "collections argument to Variable constructor must be a list, tuple, " "or set. Got %s of type %s" % (collections, type(collections))) if constraint is not None and not callable(constraint): raise ValueError("The `constraint` argument must be a callable.") self._trainable = trainable if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: collections = list(collections) + [ ops.GraphKeys.TRAINABLE_VARIABLES ] self._save_slice_info = None self._in_graph_mode = context.in_graph_mode() with ops.control_dependencies(None): with ops.name_scope( name, "Variable", [] if init_from_fn else [initial_value]) as name: # pylint: disable=protected-access handle_name = ops._name_from_scope_name(name) if init_from_fn: # Use attr_scope and device(None) to simulate the behavior of # colocate_with when the variable we want to colocate with doesn't # yet exist. if self._in_graph_mode: attr = attr_value_pb2.AttrValue( list=attr_value_pb2.AttrValue.ListValue( s=[compat.as_bytes("loc:@%s" % handle_name)])) with ops.get_default_graph()._attr_scope( {"_class": attr}): with ops.name_scope("Initializer"), ops.device( None): initial_value = ops.convert_to_tensor( initial_value(), name="initial_value", dtype=dtype) self._handle = _eager_safe_variable_handle( shape=initial_value.get_shape(), dtype=initial_value.dtype.base_dtype, shared_name=handle_name, name=name, graph_mode=self._in_graph_mode) self._handle_device = ( self._handle.device if self._in_graph_mode else context.get_default_context().device_name) else: initial_value = initial_value() with ops.name_scope("Initializer"): initial_value = ops.convert_to_tensor( initial_value, name="initial_value", dtype=dtype) self._handle = _eager_safe_variable_handle( shape=initial_value.get_shape(), dtype=initial_value.dtype.base_dtype, shared_name=handle_name, name=name, graph_mode=False, container="") self._handle_device = ( self._handle.device if self._in_graph_mode else context.get_default_context().device_name) # pylint: enable=protected-access # Or get the initial value from a Tensor or Python object. else: with ops.name_scope("Initializer"): initial_value = ops.convert_to_tensor( initial_value, name="initial_value", dtype=dtype) # pylint: disable=protected-access if (self._in_graph_mode and initial_value is not None and initial_value.op._get_control_flow_context() is not None): raise ValueError( "Initializer for variable %s is from inside a control-flow " "construct, such as a loop or conditional. When creating a " "variable inside a loop or conditional, use a lambda as the " "initializer." % name) # pylint: enable=protected-access self._handle = _eager_safe_variable_handle( shape=initial_value.get_shape(), dtype=initial_value.dtype.base_dtype, shared_name=handle_name, name=name, graph_mode=self._in_graph_mode, container="") self._handle_device = ( self._handle.device if self._in_graph_mode else context.get_default_context().device_name) self._initial_value = initial_value if self._in_graph_mode else None self._handle_name = handle_name + ":0" self._dtype = initial_value.dtype.base_dtype self._constraint = constraint if self._in_graph_mode: with ops.name_scope("IsInitialized"): self._is_initialized_op = ( gen_resource_variable_ops.var_is_initialized_op( self._handle)) if initial_value is not None: with ops.name_scope("Assign") as n, ops.colocate_with( self._handle): self._initializer_op = ( gen_resource_variable_ops.assign_variable_op( self._handle, self._build_initializer_expr( initial_value), name=n)) with ops.name_scope("Read"), ops.colocate_with( self._handle): # Manually assign reads to the handle's device to avoid log # messages. with ops.device(self._handle_device): value = self._read_variable_op() self._graph_element = value if caching_device is not None: # Variables may be created in a tf.device() or ops.colocate_with() # context. At the same time, users would expect caching device to # be independent of this context, and/or would not expect the # current device context to be merged with the caching device # spec. Therefore we reset the colocation stack before creating # the cached value. Note that resetting the colocation stack will # also reset the device stack. with ops.colocate_with(None, ignore_existing=True): with ops.device(caching_device): self._cached_value = array_ops.identity( value) else: self._cached_value = None else: gen_resource_variable_ops.assign_variable_op( self._handle, initial_value) self._is_initialized_op = None self._initializer_op = None self._graph_element = None if caching_device: with ops.device(caching_device): self._cached_value = self._read_variable_op() else: self._cached_value = None if context.in_graph_mode(): ops.add_to_collections(collections, self) elif ops.GraphKeys.GLOBAL_STEP in collections: ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self)
def Body(v): with ops.colocate_with(external_t): return v * v
def _init_from_args(self, initial_value=None, trainable=True, collections=None, validate_shape=True, caching_device=None, name=None, dtype=None): """Creates a new variable from arguments. Args: initial_value: A `Tensor`, or Python object convertible to a `Tensor`, which is the initial value for the Variable. The initial value must have a shape specified unless `validate_shape` is set to False. Can also be a callable with no argument that returns the initial value when called. In that case, `dtype` must be specified. (Note that initializer functions from init_ops.py must first be bound to a shape before being used here.) trainable: If `True`, the default, also adds the variable to the graph collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as the default list of variables to use by the `Optimizer` classes. collections: List of graph collections keys. The new variable is added to these collections. Defaults to `[GraphKeys.VARIABLES]`. validate_shape: If `False`, allows the variable to be initialized with a value of unknown shape. If `True`, the default, the shape of `initial_value` must be known. caching_device: Optional device string or function describing where the Variable should be cached for reading. Defaults to the Variable's device. If not `None`, caches on another device. Typical use is to cache on the device where the Ops using the Variable reside, to deduplicate copying through `Switch` and other conditional statements. name: Optional name for the variable. Defaults to `'Variable'` and gets uniquified automatically. dtype: If set, initial_value will be converted to the given type. If None, either the datatype will be kept (if initial_value is a Tensor) or float32 will be used (if it is a Python object convertible to a Tensor). Raises: ValueError: If the initial value is not specified, or does not have a shape and `validate_shape` is `True`. """ if initial_value is None: raise ValueError("initial_value must be specified.") init_from_fn = callable(initial_value) if init_from_fn and dtype is None: raise ValueError( "dtype must also be specified when initial_value is callable.") if collections is None: collections = [ops.GraphKeys.VARIABLES] if not isinstance(collections, (list, tuple, set)): raise ValueError( "collections argument to Variable constructor must be a list, tuple, " "or set. Got %s of type %s" % (collections, type(collections))) if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: collections = list(collections) + [ ops.GraphKeys.TRAINABLE_VARIABLES ] with ops.control_dependencies(None): with ops.name_scope( name, "Variable", [] if init_from_fn else [initial_value]) as name: # Get the initial value from a callable function. The real shape of the # variable will be set later, since under the init_from_fn case, the # shape won't be known until after the function is invoked. if init_from_fn: self._variable = state_ops.variable_op([], dtype.base_dtype, set_shape=False, name=name) with ops.colocate_with(self._variable.op): with ops.name_scope("Initializer"): # Colocate the tensors created by the initial_value() function # with the variable itself. self._initial_value = ops.convert_to_tensor( initial_value(), name="initial_value", dtype=dtype) # Or get the initial value from a Tensor or Python object. else: self._initial_value = ops.convert_to_tensor( initial_value, name="initial_value", dtype=dtype) # In this case, the variable op can't be created until after the # initial_value has been converted to a Tensor with a known type. self._variable = state_ops.variable_op( [], self._initial_value.dtype.base_dtype, set_shape=False, name=name) # Manually overrides the variable's shape with the initial value's. if validate_shape: initial_value_shape = self._initial_value.get_shape() if not initial_value_shape.is_fully_defined(): raise ValueError( "initial_value must have a shape specified: %s" % self._initial_value) self._variable.set_shape(initial_value_shape) # TODO(b/28152992): Remove the below hack modifying the node_def shape # directly once set_shape() handles it. self._variable.op.node_def.attr["shape"].shape.CopyFrom( initial_value_shape.as_proto()) # Assigns initial value. self._initializer_op = state_ops.assign( self._variable, self._initial_value, validate_shape=validate_shape).op # TODO(vrv): Change this class to not take caching_device, but # to take the op to colocate the snapshot with, so we can use # colocation rather than devices. if caching_device is not None: with ops.device(caching_device): self._snapshot = array_ops.identity(self._variable, name="read") else: with ops.colocate_with(self._variable.op): self._snapshot = array_ops.identity(self._variable, name="read") ops.add_to_collections(collections, self) self._caching_device = caching_device self._save_slice_info = None
def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers, total_counts): """Creates an op for training for mini batch case. Args: inputs: list of input Tensors. cluster_idx_list: A vector (or list of vectors). Each element in the vector corresponds to an input row in 'inp' and specifies the cluster id corresponding to the input. cluster_centers: Tensor Ref of cluster centers. total_counts: Tensor Ref of cluster counts. Returns: An op for doing an update of mini-batch k-means. """ update_ops = [] for inp, cluster_idx in zip(inputs, cluster_idx_list): with ops.colocate_with(inp): assert total_counts is not None cluster_idx = array_ops.reshape(cluster_idx, [-1]) # Dedupe the unique ids of cluster_centers being updated so that updates # can be locally aggregated. unique_ids, unique_idx = array_ops.unique(cluster_idx) num_unique_cluster_idx = array_ops.size(unique_ids) # Fetch the old values of counts and cluster_centers. with ops.colocate_with(total_counts): old_counts = array_ops.gather(total_counts, unique_ids) # TODO(agarwal): This colocation seems to run into problems. Fix it. # with ops.colocate_with(cluster_centers): old_cluster_centers = array_ops.gather(cluster_centers, unique_ids) # Locally aggregate the increment to counts. count_updates = math_ops.unsorted_segment_sum( array_ops.ones_like( unique_idx, dtype=total_counts.dtype), unique_idx, num_unique_cluster_idx) # Locally compute the sum of inputs mapped to each id. # For a cluster with old cluster value x, old count n, and with data # d_1,...d_k newly assigned to it, we recompute the new value as # x += (sum_i(d_i) - k * x) / (n + k). # Compute sum_i(d_i), see comment above. cluster_center_updates = math_ops.unsorted_segment_sum( inp, unique_idx, num_unique_cluster_idx) # Shape to enable broadcasting count_updates and learning_rate to inp. # It extends the shape with 1's to match the rank of inp. broadcast_shape = array_ops.concat( [ array_ops.reshape(num_unique_cluster_idx, [1]), array_ops.ones( array_ops.reshape(array_ops.rank(inp) - 1, [1]), dtype=dtypes.int32) ], 0) # Subtract k * x, see comment above. cluster_center_updates -= math_ops.cast( array_ops.reshape(count_updates, broadcast_shape), inp.dtype) * old_cluster_centers learning_rate = math_ops.reciprocal( math_ops.cast(old_counts + count_updates, inp.dtype)) learning_rate = array_ops.reshape(learning_rate, broadcast_shape) # scale by 1 / (n + k), see comment above. cluster_center_updates *= learning_rate # Apply the updates. update_counts = state_ops.scatter_add( total_counts, unique_ids, count_updates) update_cluster_centers = state_ops.scatter_add( cluster_centers, unique_ids, cluster_center_updates) update_ops.extend([update_counts, update_cluster_centers]) return control_flow_ops.group(*update_ops)
def value(self): """A cached operation which reads the value of this variable.""" if self._cached_value is not None: return self._cached_value with ops.colocate_with(None, ignore_existing=True): return self._read_variable_op()
def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This is the second part of `minimize()`. It returns an `Operation` that applies gradients. Args: grads_and_vars: List of (gradient, variable) pairs as returned by `compute_gradients()`. global_step: Optional `Variable` to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the `Optimizer` constructor. Returns: An `Operation` that applies the specified gradients. If `global_step` was not None, that operation also increments `global_step`. Raises: TypeError: If `grads_and_vars` is malformed. ValueError: If none of the variables have gradients. """ # This is a default implementation of apply_gradients() that can be shared # by most optimizers. It relies on the subclass implementing the following # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse(). grads_and_vars = tuple( grads_and_vars) # Make sure repeat iteration works. if not grads_and_vars: raise ValueError("No variables provided.") converted_grads_and_vars = [] for g, v in grads_and_vars: if g is not None: try: # Convert the grad to Tensor or IndexedSlices if necessary. g = ops.convert_to_tensor_or_indexed_slices(g) except TypeError: raise TypeError("Gradient must be convertible to a Tensor" " or IndexedSlices, or None: %s" % g) if not isinstance(g, (ops.Tensor, ops.IndexedSlices)): raise TypeError( "Gradient must be a Tensor, IndexedSlices, or None: %s" % g) p = _get_processor(v) converted_grads_and_vars.append((g, v, p)) converted_grads_and_vars = tuple(converted_grads_and_vars) var_list = [v for g, v, _ in converted_grads_and_vars if g is not None] if not var_list: raise ValueError("No gradients provided for any variable: %s." % ([str(v) for _, _, v in converted_grads_and_vars], )) with ops.init_scope(): self._create_slots([_get_variable_for(v) for v in var_list]) update_ops = [] with ops.name_scope(name, self._name) as name: self._prepare() for grad, var, processor in converted_grads_and_vars: if grad is None: continue # We colocate all ops created in _apply_dense or _apply_sparse # on the same device as the variable. # TODO(apassos): figure out how to get the variable name here. scope_name = "" if context.executing_eagerly() else var.op.name with ops.name_scope("update_" + scope_name), ops.colocate_with(var): update_ops.append(processor.update_op(self, grad)) if global_step is None: apply_updates = self._finish(update_ops, name) else: with ops.control_dependencies( [self._finish(update_ops, "update")]): with ops.colocate_with(global_step): if isinstance(global_step, resource_variable_ops.ResourceVariable): # TODO(apassos): the implicit read in assign_add is slow; consider # making it less so. apply_updates = resource_variable_ops.assign_add_variable_op( global_step.handle, ops.convert_to_tensor(1, dtype=global_step.dtype), name=name) else: apply_updates = state_ops.assign_add(global_step, 1, name=name) if not context.executing_eagerly(): if isinstance(apply_updates, ops.Tensor): apply_updates = apply_updates.op train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP) if apply_updates not in train_op: train_op.append(apply_updates) return apply_updates
def _init_from_args( self, initial_value=None, trainable=None, collections=None, caching_device=None, name=None, dtype=None, constraint=None, synchronization=None, aggregation=None, distribute_strategy=None, shape=None, ): """Creates a variable. Args: initial_value: A `Tensor`, or Python object convertible to a `Tensor`, which is the initial value for the Variable. The initial value must have a shape specified unless `validate_shape` is set to False. Can also be a callable with no argument that returns the initial value when called. (Note that initializer functions from init_ops.py must first be bound to a shape before being used here.) trainable: If `True`, the default, also adds the variable to the graph collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as the default list of variables to use by the `Optimizer` classes. Defaults to `True`, unless `synchronization` is set to `ON_READ`, in which case it defaults to `False`. collections: List of graph collections keys. The new variable is added to these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. caching_device: Optional device string or function describing where the Variable should be cached for reading. Defaults to the Variable's device. If not `None`, caches on another device. Typical use is to cache on the device where the Ops using the Variable reside, to deduplicate copying through `Switch` and other conditional statements. name: Optional name for the variable. Defaults to `'Variable'` and gets uniquified automatically. dtype: If set, initial_value will be converted to the given type. If None, either the datatype will be kept (if initial_value is a Tensor) or float32 will be used (if it is a Python object convertible to a Tensor). constraint: An optional projection function to be applied to the variable after being updated by an `Optimizer` (e.g. used to implement norm constraints or value constraints for layer weights). The function must take as input the unprojected Tensor representing the value of the variable and return the Tensor for the projected value (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. synchronization: Indicates when a distributed a variable will be aggregated. Accepted values are constants defined in the class `tf.VariableSynchronization`. By default the synchronization is set to `AUTO` and the current `DistributionStrategy` chooses when to synchronize. aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class `tf.VariableAggregation`. distribute_strategy: DistributionStrategy under which this variable was created. shape: (optional) The shape of this variable. If None, the shape of `initial_value` will be used. When setting this argument to `tf.TensorShape(None)` (representing an unspecified shape), the variable can be assigned with values of different shapes. Raises: ValueError: If the initial value is not specified, or does not have a shape and `validate_shape` is `True`. @compatibility(eager) When Eager Execution is enabled, variables are never added to collections. It is not implicitly added to the `GLOBAL_VARIABLES` or `TRAINABLE_VARIABLES` collections, and the `collections` argument is ignored. @end_compatibility """ ( synchronization, aggregation, trainable, ) = variables.validate_synchronization_aggregation_trainable( synchronization, aggregation, trainable, name) if initial_value is None: raise ValueError("initial_value must be specified.") init_from_fn = callable(initial_value) if (isinstance(initial_value, ops.Tensor) and hasattr(initial_value, "graph") and initial_value.graph.building_function): raise ValueError( "Tensor-typed variable initializers must either be " "wrapped in an init_scope or callable " "(e.g., `tf.Variable(lambda : " "tf.truncated_normal([10, 40]))`) when building " "functions. Please file a feature request if this " "restriction inconveniences you.") if collections is None: collections = [ops.GraphKeys.GLOBAL_VARIABLES] if not isinstance(collections, (list, tuple, set)): raise ValueError( "collections argument to Variable constructor must be a list, tuple, " "or set. Got %s of type %s" % (collections, type(collections))) if constraint is not None and not callable(constraint): raise ValueError("The `constraint` argument must be a callable.") if isinstance(initial_value, trackable.CheckpointInitialValue): self._maybe_initialize_trackable() self._update_uid = initial_value.checkpoint_position.restore_uid initial_value = initial_value.wrapped_value if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: collections = list(collections) + [ ops.GraphKeys.TRAINABLE_VARIABLES ] with ops.init_scope(): self._in_graph_mode = not context.executing_eagerly() with ops.name_scope( name, "TrainableWrapper", [] if init_from_fn else [initial_value]) as name: # pylint: disable=protected-access handle_name = ops.name_from_scope_name(name) handle_name = handle_name or "TrainableWrapperHandle" if self._in_graph_mode: shared_name = handle_name unique_id = shared_name else: # When in eager mode use a uid for the shared_name, to prevent # accidental sharing. unique_id = "%s_%d" % (handle_name, ops.uid()) shared_name = None # Never shared # Use attr_scope and device(None) to simulate the behavior of # colocate_with when the variable we want to colocate with doesn't # yet exist. device_context_manager = (ops.device if self._in_graph_mode else ops.NullContextmanager) attr = attr_value_pb2.AttrValue( list=attr_value_pb2.AttrValue.ListValue( s=[compat.as_bytes("loc:@%s" % handle_name)])) with ops.get_default_graph()._attr_scope({"_class": attr}): with ops.name_scope("Initializer"), device_context_manager( None): initial_value = ops.convert_to_tensor( initial_value() if init_from_fn else initial_value, name="initial_value", dtype=dtype, ) if shape is None: shape = initial_value.shape handle = resource_variable_ops.eager_safe_variable_handle( initial_value=initial_value, shape=None, # shape, shared_name=shared_name, name=name, graph_mode=self._in_graph_mode, ) # pylint: disable=protected-access if (self._in_graph_mode and initial_value is not None and initial_value.op._get_control_flow_context() is not None): raise ValueError( "Initializer for variable %s is from inside a control-flow " "construct, such as a loop or conditional. When creating a " "variable inside a loop or conditional, use a lambda as the " "initializer." % name) # pylint: enable=protected-access dtype = initial_value.dtype.base_dtype if self._in_graph_mode: with ops.name_scope("IsInitialized"): is_initialized_op = (gen_resource_variable_ops. var_is_initialized_op(handle)) if initial_value is not None: # pylint: disable=g-backslash-continuation with ops.name_scope("Assign") as n, ops.colocate_with( None, ignore_existing=True), ops.device( handle.device): # pylint: disable=protected-access initializer_op = gen_resource_variable_ops.assign_variable_op( handle, variables. _try_guard_against_uninitialized_dependencies( name, initial_value), name=n, ) # pylint: enable=protected-access # pylint: enable=g-backslash-continuation with ops.name_scope("Read"): # Manually assign reads to the handle's device to avoid log # messages. with ops.device(handle.device): with ops.control_dependencies([ gen_resource_variable_ops. assign_variable_op( handle, self.prefetch_values(), name="AssignBeforeInitRead", ) ]): value = gen_resource_variable_ops.read_variable_op( handle, dtype) graph_element = value if caching_device is not None: # Variables may be created in a tf.device() or ops.colocate_with() # context. At the same time, users would expect caching device to # be independent of this context, and/or would not expect the # current device context to be merged with the caching device # spec. Therefore we reset the colocation stack before creating # the cached value. Note that resetting the colocation stack will # also reset the device stack. with ops.colocate_with(None, ignore_existing=True): with ops.device(caching_device): cached_value = array_ops.identity(value) else: cached_value = None else: gen_resource_variable_ops.assign_variable_op( handle, initial_value) is_initialized_op = None initializer_op = None graph_element = None if caching_device: with ops.device(caching_device): with ops.control_dependencies([ gen_resource_variable_ops. assign_variable_op( handle, self.prefetch_values(), name="AssignBeforeInitRead", ) ]): cached_value = ( gen_resource_variable_ops.read_variable_op( handle, dtype)) else: cached_value = None if not context.executing_eagerly(): # Eager variables are only added to collections if they are part of an # eager variable store (otherwise in an interactive session they would # hog memory and cause OOM). This is done in ops/variable_scope.py. ops.add_to_collections(collections, self) elif ops.GraphKeys.GLOBAL_STEP in collections: ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self) initial_value = initial_value if self._in_graph_mode else None super(resource_variable_ops.ResourceVariable, self).__init__( trainable=trainable, shape=shape, dtype=dtype, handle=handle, synchronization=synchronization, constraint=constraint, aggregation=aggregation, distribute_strategy=distribute_strategy, name=name, unique_id=unique_id, handle_name=handle_name, graph_element=graph_element, initial_value=initial_value, initializer_op=initializer_op, is_initialized_op=is_initialized_op, cached_value=cached_value, )
def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" stddev = math_ops.sqrt(variance + self.epsilon) # Compute the average mean and standard deviation, as if they were # initialized with this batch's moments. mixed_renorm_mean = (self.renorm_mean + (1. - self.renorm_mean_weight) * mean) mixed_renorm_stddev = (self.renorm_stddev + (1. - self.renorm_stddev_weight) * stddev) # Compute the corrections for batch renorm. r = stddev / mixed_renorm_stddev d = (mean - mixed_renorm_mean) / mixed_renorm_stddev # Ensure the corrections use pre-update moving averages. with ops.control_dependencies([r, d]): mean = array_ops.identity(mean) stddev = array_ops.identity(stddev) rmin, rmax, dmax = [ self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax'] ] if rmin is not None: r = math_ops.maximum(r, rmin) if rmax is not None: r = math_ops.minimum(r, rmax) if dmax is not None: d = math_ops.maximum(d, -dmax) d = math_ops.minimum(d, dmax) # When not training, use r=1, d=0. r = utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r)) d = utils.smart_cond(training, lambda: d, lambda: array_ops.zeros_like(d)) def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" value = array_ops.identity(value) def _do_update(): # Update the variables without zero debiasing. The debiasing will be # accomplished by dividing the exponential moving average by the weight. # For example, after a single update, the moving average would be # (1-decay) * value. and the weight will be 1-decay, with their ratio # giving the value. # Make sure the weight is not updated until before r and d computation. with ops.control_dependencies([value]): weight_value = array_ops.constant(1., dtype=weight.dtype) new_var = moving_averages.assign_moving_average( var, value, self.renorm_momentum, zero_debias=False) new_weight = moving_averages.assign_moving_average( weight, weight_value, self.renorm_momentum, zero_debias=False) return new_var / new_weight def _fake_update(): return array_ops.identity(var) return utils.smart_cond(training, _do_update, _fake_update) with ops.colocate_with(self.moving_mean): new_mean = _update_renorm_variable(self.renorm_mean, self.renorm_mean_weight, mean) with ops.colocate_with(self.moving_variance): new_stddev = _update_renorm_variable(self.renorm_stddev, self.renorm_stddev_weight, stddev) # Make sqrt(moving_variance + epsilon) = new_stddev. new_variance = math_ops.square(new_stddev) - self.epsilon return (r, d, new_mean, new_variance)
def _embedding_lookup_and_transform(params, ids, partition_strategy="mod", name=None, max_norm=None, transform_fn=None): """Helper function for embedding_lookup and _compute_sampled_logits. This function is a generalization of embedding_lookup that optionally applies a caller-specified transformation to each embedding. This is done through the `transform_fn` argument. If provided, the function is applied to each partitioned tensor of retrieved embeddings, colocated with the embeddings. This function will be called with a single `Tensor` argument of the same type as the `params` tensor and should return a `Tensor`. The shape of the argument will be the same as `params` except for the size of the first dimension. The first dimension of the result's shape must be the same size as the argument's. Args: params: See embedding_lookup. ids: See embedding_lookup. partition_strategy: See embedding_lookup. name: See embedding_lookup. max_norm: See embedding_lookup. transform_fn: An optional function to apply to each retrieved embedding. If max_norm is provided, transform_fn is applied to the norm-limited embeddings. Returns: See embedding_lookup for details. Raises: ValueError: If `params` is empty. """ if params is None: raise ValueError("params must be specified") if isinstance(params, (list, tuple)) and not params: raise ValueError("Need at least one param") if isinstance(params, variables.PartitionedVariable): params = list(params) # Iterate to get the underlying Variables. if not isinstance(params, list): params = [params] with ops.name_scope(name, "embedding_lookup", params + [ids]) as name: np = len(params) # Number of partitions # Preserve the resource variable status to avoid accidental dense reads. if not any( isinstance(p, resource_variable_ops.BaseResourceVariable) for p in params): params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params") ids = ops.convert_to_tensor(ids, name="ids") if np == 1 and (not transform_fn or ids.get_shape().ndims == 1): with ops.colocate_with(params[0]): result = _clip( array_ops.gather(params[0], ids, name=name), ids, max_norm) if transform_fn: result = transform_fn(result) # Make sure the final result does not have colocation constraints on the # params. Similar to the case np > 1 where parallel_dynamic_stitch is # outside the scioe of all with ops.colocate_with(params[p]). return array_ops.identity(result) else: # Flatten the ids. There are two cases where we need to do this. # - There is more than one params tensor. # - There is a transform_fn and ids is not statically known to be 1-D. # We must flatten in this case because transform_fn expects a flat # tensor of embeddings. flat_ids = array_ops.reshape(ids, [-1]) original_indices = math_ops.range(array_ops.size(flat_ids)) # Create p_assignments and set new_ids depending on the strategy. if partition_strategy == "mod": p_assignments = flat_ids % np new_ids = flat_ids // np elif partition_strategy == "div": # Compute num_total_ids as the sum of dim-0 of params, then assign to # partitions based on a constant number of ids per partition. Optimize # if we already know the full shape statically. dim_0_size = tensor_shape.Dimension( tensor_shape.dimension_value(params[0].get_shape()[0])) for p in xrange(1, np): dim_0_size += tensor_shape.Dimension( tensor_shape.dimension_value(params[p].get_shape()[0])) if dim_0_size.value: num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype) else: dim_0_sizes = [] for p in xrange(np): param_p_dim = tensor_shape.dimension_value(params[p].get_shape()[0]) if param_p_dim is not None: dim_0_sizes.append(param_p_dim) else: with ops.colocate_with(params[p]): dim_0_sizes.append(array_ops.shape(params[p])[0]) num_total_ids = math_ops.reduce_sum( math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype)) ids_per_partition = num_total_ids // np extras = num_total_ids % np p_assignments = math_ops.maximum(flat_ids // (ids_per_partition + 1), (flat_ids - extras) // ids_per_partition) # Emulate a conditional using a boolean indicator tensor new_ids = array_ops.where(p_assignments < extras, flat_ids % (ids_per_partition + 1), (flat_ids - extras) % ids_per_partition) else: raise ValueError("Unrecognized partition strategy: " + partition_strategy) # Cast partition assignments to int32 for use in dynamic_partition. # There really should not be more than 2^32 partitions. p_assignments = math_ops.cast(p_assignments, dtypes.int32) # Partition list of ids based on assignments into np separate lists gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np) # Similarly, partition the original indices. pindices = data_flow_ops.dynamic_partition(original_indices, p_assignments, np) # Do np separate lookups, finding embeddings for plist[p] in params[p] partitioned_result = [] for p in xrange(np): pids = gather_ids[p] with ops.device_v2(None): with ops.colocate_with(params[p]): result = array_ops.gather(params[p], pids) if transform_fn: # If transform_fn is provided, the clip_by_norm precedes # the transform and hence must be co-located. See below # for the counterpart if transform_fn is not provided. result = transform_fn(_clip(result, pids, max_norm)) partitioned_result.append(result) # Stitch these back together ret = data_flow_ops.parallel_dynamic_stitch( pindices, partitioned_result, name=name) # Determine the static element shape. if transform_fn is None: element_shape_s = params[0].get_shape()[1:] for p in params[1:]: element_shape_s = element_shape_s.merge_with(p.get_shape()[1:]) else: element_shape_s = ret.get_shape()[1:] # Compute the dynamic element shape. if element_shape_s.is_fully_defined(): element_shape_d = element_shape_s elif transform_fn is None: # It's important that we compute params[0].shape on the right device # to avoid data motion. with ops.colocate_with(params[0]): params_shape = array_ops.shape(params[0]) element_shape_d = params_shape[1:] else: element_shape_d = array_ops.shape(ret)[1:] # Reshape to reverse the flattening of ids. ret = array_ops.reshape( ret, array_ops.concat([array_ops.shape(ids), element_shape_d], 0)) # Normally the reshape is sufficient, but setting shape explicitly # teaches shape inference that params[1:].get_shape() matters # (in the case that transform_fn is None). ret.set_shape(ids.get_shape().concatenate(element_shape_s)) if not transform_fn: # If transform_fn was provided, the clip_by_norm was done above. ret = _clip(ret, ids, max_norm) return ret
def model_fn(): if 'CPU' in compute_device: tower_compute_device = '/device:CPU:0' else: tower_compute_device = ('/device:GPU:%d' % distribution_strategy_context. get_tower_context().tower_id) tower_compute_device = device_util.canonicalize( tower_compute_device) if 'CPU' in variable_device: tower_variable_device = '/device:CPU:0' else: tower_variable_device = ('/device:GPU:%d' % distribution_strategy_context. get_tower_context().tower_id) tower_variable_device = device_util.canonicalize( tower_variable_device) a = constant_op.constant(1.0) b = constant_op.constant(2.0) c = a + b self.assertEqual(a.device, tower_compute_device) self.assertEqual(b.device, tower_compute_device) self.assertEqual(c.device, tower_compute_device) # The device scope is ignored for variables but not for normal ops. with ops.device('/device:GPU:2'): x = variable_scope.get_variable('x', initializer=10.0) x_add = x.assign_add(c) e = a + c self.assertEqual(device_util.canonicalize(x.device), tower_variable_device) self.assertEqual(x_add.device, x.device) self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2')) # The colocate_vars_with can override the distribution's device. with d.colocate_vars_with(x): y = variable_scope.get_variable('y', initializer=20.0) y_add = y.assign_add(x_add) self.assertEqual(device_util.canonicalize(y.device), tower_variable_device) self.assertEqual(y_add.device, y.device) self.assertEqual(y.device, x.device) z = variable_scope.get_variable('z', initializer=10.0) self.assertEqual(device_util.canonicalize(z.device), tower_variable_device) with ops.control_dependencies([y_add]): z_add = z.assign_add(y) with ops.control_dependencies([z_add]): f = z + c self.assertEqual(f.device, tower_compute_device) # The device scope would merge with the default worker device. with ops.device('/CPU:1'): g = e + 1.0 self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1')) # Ths ops.colocate_with will be ignored when defining a variale but not # for a normal tensor. with ops.colocate_with(x): u = variable_scope.get_variable('u', initializer=30.0) h = f + 1.0 self.assertEqual(device_util.canonicalize(u.device), tower_variable_device) self.assertEqual(device_util.canonicalize(x.device), h.device) return y_add, z_add, f
def _assign_singular_vector(self, variable, value): with K.name_scope('AssignSingularVector') as scope: with ops.colocate_with(variable): return state_ops.assign(variable, value, name=scope)
def fn(): with ops.colocate_with(b.op): c = math_ops.add(a, a, name="c") return c
def gradients(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None, stop_gradients=None): """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). `stop_gradients` is a `Tensor` or a list of tensors to be considered constant with respect to all `xs`. These tensors will not be backpropagated through, as though they had been explicitly disconnected using `stop_gradient`. Among other things, this allows computation of partial derivatives as opposed to total derivatives. For example: ```python a = tf.constant(0.) b = 2 * a g = tf.gradients(a + b, [a, b], stop_gradients=[a, b]) ``` Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the total derivatives `tf.gradients(a + b, [a, b])`, which take into account the influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is equivalent to: ```python a = tf.stop_gradient(tf.constant(0.)) b = tf.stop_gradient(2 * a) g = tf.gradients(a + b, [a, b]) ``` `stop_gradients` provides a way of stopping gradient after the graph has already been constructed, as compared to `tf.stop_gradient` which is used during graph construction. When the two approaches are combined, backpropagation stops at both `tf.stop_gradient` nodes and nodes in `stop_gradients`, whichever is encountered first. Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate through. Returns: A list of `sum(dy/dx)` for each x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. RuntimeError: if called in Eager mode. """ if context.in_eager_mode(): raise RuntimeError("tf.gradients not supported in EAGER mode. Use " "functions in tf.contrib.eager.backprop instead.") ys = _AsList(ys) xs = _AsList(xs) stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) with ops.name_scope( name, "gradients", list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [ x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x for x in xs ] xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs, name="x", as_ref=True) grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops) # The approach we take here is as follows: Create a list of all ops in the # subgraph between the ys and xs. Visit these ops in reverse order of ids # to ensure that when we visit an op the gradients w.r.t its outputs have # been collected. Then aggregate these gradients if needed, call the op's # gradient function, and add the generated gradients to the gradients for # its input. # Initialize the pending count for ops in the connected subgraph from ys # to the xs. if len(ys) > 1: ys = [array_ops.identity(y) if y.consumers() else y for y in ys] to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] stop_gradient_ops = [t.op for t in stop_gradients] pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops) # Iterate over the collected ops. # # grads: op => list of gradients received on each output endpoint of the # op. The gradients for each endpoint are initially collected as a list. # When it is time to call the op's gradient function, for each endpoint we # aggregate the list of received gradients into a Add() Operation if there # is more than one. grads = {} # Add the initial gradients for the ys. for y, grad_y in zip(ys, grad_ys): _SetGrad(grads, y, grad_y) # Initialize queue with to_ops. queue = collections.deque() # Add the ops in 'to_ops' into the queue. to_ops_set = set() for op in to_ops: # 'ready' handles the case where one output gradient relies on # another output's gradient. # pylint: disable=protected-access ready = (pending_count[op._id] == 0) if ready and op._id not in to_ops_set: to_ops_set.add(op._id) queue.append(op) # pylint: enable=protected-access if loop_state: loop_exits = loop_state.ProcessUnusedLoopExits( pending_count, to_ops_set) for y in loop_exits: if _IsTrainable(y): _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() with _maybe_colocate_with(op, colocate_gradients_with_ops): if loop_state: loop_state.EnterGradWhileContext(op, before=True) out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method) if loop_state: loop_state.ExitGradWhileContext(op, before=True) grad_fn = None # pylint: disable=protected-access func_call = None is_func_call = ops.get_default_graph()._is_function(op.type) has_out_grads = any( isinstance(g, ops.Tensor) or g for g in out_grads) if has_out_grads and (op._id not in stop_ops): if is_func_call: func_call = ops.get_default_graph()._get_function( op.type) grad_fn = func_call.python_grad_func # pylint: enable=protected-access else: # A grad_fn must be defined, either as a function or as None # for ops that do not have gradients. try: grad_fn = ops.get_gradient_function(op) except LookupError: raise LookupError( "No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type)) if loop_state: loop_state.EnterGradWhileContext(op, before=False) if (grad_fn or is_func_call) and has_out_grads: # NOTE: If _AggregatedGrads didn't compute a value for the i'th # output, it means that the cost does not depend on output[i], # therefore dC/doutput[i] is 0. for i, out_grad in enumerate(out_grads): if (not isinstance(out_grad, ops.Tensor) and not out_grad) and ( (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])): # Only trainable outputs or outputs for a function call that # will use SymbolicGradient get a zero gradient. Gradient # functions should ignore the gradient for other outputs. # TODO(apassos) gradients of resource handles might be an # issue here because of zeros. if loop_state: out_grads[i] = loop_state.ZerosLike(op, i) else: out_grads[ i] = control_flow_ops.ZerosLikeOutsideLoop( op, i) with ops.name_scope(op.name + "_grad"): # pylint: disable=protected-access with ops.get_default_graph()._original_op(op): # pylint: enable=protected-access if grad_fn: # If grad_fn was found, do not use SymbolicGradient even for # functions. in_grads = _MaybeCompile( grad_scope, op, func_call, lambda: grad_fn(op, *out_grads)) else: # For function call ops, we add a 'SymbolicGradient' # node to the graph to compute gradients. in_grads = _MaybeCompile( grad_scope, op, func_call, lambda: _SymGrad(op, out_grads)) in_grads = _AsList(in_grads) _VerifyGeneratedGradients(in_grads, op) if gate_gradients and len( [x for x in in_grads if x is not None]) > 1: with ops.device(None): with ops.colocate_with( None, ignore_existing=True): in_grads = control_flow_ops.tuple( in_grads) _LogOpGradients(op, out_grads, in_grads) else: # If no grad_fn is defined or none of out_grads is available, # just propagate a list of None backwards. in_grads = [None] * len(op.inputs) for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)): if in_grad is not None: if (isinstance(in_grad, ops.Tensor) and t_in.dtype != dtypes.resource): try: in_grad.set_shape(t_in.get_shape()) except ValueError: raise ValueError( "Incompatible shapes between op input and calculated " "input gradient. Forward operation: %s. Input index: %d. " "Original input shape: %s. " "Calculated input gradient shape: %s" % (op.name, i, t_in.shape, in_grad.shape)) _SetGrad(grads, t_in, in_grad) if loop_state: loop_state.ExitGradWhileContext(op, before=False) # Update pending count for the inputs of op and enqueue ready ops. _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state) if loop_state: loop_state.PostProcessing() return [_GetGrad(grads, x) for x in xs]
def _zero_debias(unbiased_var, value, decay): """Compute the delta required for a debiased Variable. All exponential moving averages initialized with Tensors are initialized to 0, and therefore are biased to 0. Variables initialized to 0 and used as EMAs are similarly biased. This function creates the debias updated amount according to a scale factor, as in https://arxiv.org/abs/1412.6980. To demonstrate the bias the results from 0-initialization, take an EMA that was initialized to `0` with decay `b`. After `t` timesteps of seeing the constant `c`, the variable have the following value: ``` EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ... = c*(1 - b^t) ``` To have the true value `c`, we would divide by the scale factor `1 - b^t`. In order to perform debiasing, we use two shadow variables. One keeps track of the biased estimate, and the other keeps track of the number of updates that have occurred. Args: unbiased_var: A Variable representing the current value of the unbiased EMA. value: A Tensor representing the most recent value. decay: A Tensor representing `1-decay` for the EMA. Returns: The amount that the unbiased variable should be updated. Computing this tensor will also update the shadow variables appropriately. """ with variable_scope.variable_scope( unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope: with ops.colocate_with(unbiased_var): with ops.init_scope(): biased_initializer = init_ops.zeros_initializer( dtype=unbiased_var.dtype)(unbiased_var.get_shape()) local_step_initializer = init_ops.zeros_initializer() def _maybe_get_unique(name): """Get name for a unique variable, if not `reuse=True`.""" if variable_scope.get_variable_scope().reuse: return name vs_vars = [x.op.name for x in variable_scope.get_variable_scope().global_variables()] full_name = variable_scope.get_variable_scope().name + "/" + name if full_name not in vs_vars: return name idx = 1 while full_name + ("_%d" % idx) in vs_vars: idx += 1 return name + ("_%d" % idx) biased_var = variable_scope.get_variable( _maybe_get_unique("biased"), initializer=biased_initializer, trainable=False) local_step = variable_scope.get_variable( _maybe_get_unique("local_step"), shape=[], dtype=unbiased_var.dtype, initializer=local_step_initializer, trainable=False) # Get an update ops for both shadow variables. update_biased = state_ops.assign_sub(biased_var, (biased_var - value) * decay, name=scope.name) update_local_step = local_step.assign_add(1) # Compute the value of the delta to update the unbiased EMA. Make sure to # use the new values of the biased variable and the local step. with ops.control_dependencies([update_biased, update_local_step]): # This function gets `1 - decay`, so use `1.0 - decay` in the exponent. unbiased_ema_delta = (unbiased_var - biased_var.read_value() / (1 - math_ops.pow( 1.0 - decay, local_step.read_value()))) return unbiased_ema_delta
def _parse_tensor_or_dict(self, features): if isinstance(features, dict): keys = sorted(features.keys()) with ops.colocate_with(features[keys[0]]): features = array_ops.concat([features[k] for k in keys], 1) return features
def fn2(): with ops.colocate_with(b.op): c = constant_op.constant(3.0) self.assertEqual([b"loc:@a", b"loc:@b"], c.op.colocation_groups()) return c
def _GatherV2Grad(op, grad): """Gradient for GatherV2 op.""" # params can be large, so colocate the shape calculation with it. # # params can be very large for sparse model, array_ops.shape raises # exception on the Windows platform when any dimension is larger than # int32. params_shape is not used in optimizer apply_sparse gradients, # so it's fine to convert it back to int32 regardless of truncation. params = op.inputs[0] with ops.colocate_with(params): params_shape = array_ops.shape(params, out_type=ops.dtypes.int64) params_shape = math_ops.cast(params_shape, dtypes.int32) indices = op.inputs[1] indices_size = array_ops.expand_dims(array_ops.size(indices), 0) axis = op.inputs[2] axis_static = tensor_util.constant_value(axis) # For axis 0 gathers, build an appropriately shaped IndexedSlices. if axis_static == 0: if context.executing_eagerly(): params_tail_shape = params_shape.cpu()[1:] else: params_tail_shape = params_shape[1:] values_shape = array_ops.concat([indices_size, params_tail_shape], 0) with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="Converting sparse IndexedSlices to a dense Tensor.*") values = array_ops.reshape(grad, values_shape) indices = array_ops.reshape(indices, indices_size) return [ops.IndexedSlices(values, indices, params_shape), None, None] outer_shape = params_shape[:axis] outer_dims = array_ops.size(outer_shape) inner_shape = params_shape[axis:][1:] inner_dims = array_ops.size(inner_shape) outer_axes_indices = math_ops.range(outer_dims) inner_axes_indices = math_ops.range(outer_dims + 1, outer_dims + 1 + inner_dims) values_shape = array_ops.concat([outer_shape, indices_size, inner_shape], 0) with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="Converting sparse IndexedSlices to a dense Tensor.*") values = array_ops.reshape(grad, values_shape) indices = array_ops.reshape(indices, indices_size) # We need to sum up every slice `values[..., i, ....]` corresponding to # `params[..., indices[i], ...]`. Since `unsorted_segment_sum` does not # support an axis parameter, we transpose the gather dimension to the front, # then use `unsorted_segment_sum` to build a # [gather_axis, outer_axes, inner_axes] tensor with all the gradients # affecting each index in `gather_axis` summed up. transpose_dims = array_ops.concat( [[outer_dims], outer_axes_indices, inner_axes_indices], 0) values_transpose = array_ops.transpose(values, transpose_dims) num_segments = params_shape[axis] params_grad = math_ops.unsorted_segment_sum(values_transpose, indices, num_segments) # Inverts the above transpose by moving dimension 0 back to its original # position. invert_transpose_dims = array_ops.concat( [outer_axes_indices + 1, [0], inner_axes_indices], 0) params_grad = array_ops.transpose(params_grad, invert_transpose_dims) return [params_grad, None, None]
def restore(self, restored_tensors, unused_restored_shapes): # pylint: disable=protected-access with ops.colocate_with(self.op._table_ref): return gen_lookup_ops._lookup_table_import_v2( self.op._table_ref, restored_tensors[0], restored_tensors[1])
def model_fn(): if num_gpus == 0: last_part_device = 'device:CPU:0' else: replica_id = _get_replica_id_integer() last_part_device = ('device:GPU:%d' % replica_id) a = constant_op.constant(1.0) b = constant_op.constant(2.0) c = a + b self.assertEqual(a.device, worker_device + '/' + last_part_device) self.assertEqual(b.device, worker_device + '/' + last_part_device) self.assertEqual(c.device, worker_device + '/' + last_part_device) # The device scope is ignored for variables but not for normal ops. with ops.device('/job:worker/task:0'): x = variable_scope.get_variable( 'x', initializer=10.0, aggregation=variable_scope.VariableAggregation.SUM) x_add = x.assign_add(c) e = a + c # The variable x is on the task 1 since the device_function has been # called once before the model_fn. self.assertEqual(x.device, '/job:ps/task:1') self.assertEqual(x_add.device, x.device) self.assertEqual( e.device, '/job:worker/replica:0/task:0/%s' % last_part_device) # The colocate_vars_with can override the distribution's device. with d.extended.colocate_vars_with(x): y = variable_scope.get_variable( 'y', initializer=20.0, aggregation=variable_scope.VariableAggregation.SUM) # We add an identity here to avoid complaints about summing # non-distributed values. y_add = y.assign_add(array_ops.identity(x_add)) self.assertEqual(y.device, '/job:ps/task:1') self.assertEqual(y_add.device, y.device) self.assertEqual(y.device, x.device) z = variable_scope.get_variable( 'z', initializer=10.0, aggregation=variable_scope.VariableAggregation.SUM) self.assertEqual(z.device, '/job:ps/task:0') self.assertNotEqual(z.device, x.device) with ops.control_dependencies([y_add]): # We add an identity here to avoid complaints about summing # non-distributed values. z_add = z.assign_add(array_ops.identity(y)) with ops.control_dependencies([z_add]): f = z + c self.assertEqual(f.device, worker_device + '/' + last_part_device) # The device scope would merge with the default worker device. with ops.device('/CPU:1'): g = e + 1.0 self.assertEqual(g.device, worker_device + '/device:CPU:1') # Ths ops.colocate_with will be ignored when defining a variale but not # for a normal tensor. with ops.colocate_with(x): u = variable_scope.get_variable('u', initializer=30.0) v = variable_scope.get_variable('v', initializer=30.0) h = f + 1.0 self.assertIn('/job:ps/', u.device) self.assertIn('/job:ps/', v.device) # u and v are on different parameter servers. self.assertTrue(u.device != x.device or v.device != x.device) self.assertTrue(u.device == x.device or v.device == x.device) # Here h is not on one worker. Note h.device is canonical while x.device # is not but. self.assertIn('/job:ps/', h.device) return y_add, z_add, f
def minimize(self, global_step=None, name=None): """Add operations to train a linear model by minimizing the loss function. Args: global_step: Optional `Variable` to increment by one after the variables have been updated. name: Optional name for the returned operation. Returns: An Operation that updates the variables passed in the constructor. """ # Technically, the op depends on a lot more than the variables, # but we'll keep the list short. with name_scope(name, 'sdca/minimize'): sparse_example_indices = [] sparse_feature_indices = [] sparse_features_values = [] for sf in self._examples['sparse_features']: sparse_example_indices.append(sf.example_indices) sparse_feature_indices.append(sf.feature_indices) # If feature values are missing, sdca assumes a value of 1.0f. if sf.feature_values is not None: sparse_features_values.append(sf.feature_values) # pylint: disable=protected-access example_ids_hashed = gen_sdca_ops.sdca_fprint( internal_convert_to_tensor(self._examples['example_ids'])) # pylint: enable=protected-access example_state_data = self._hashtable.lookup(example_ids_hashed) # Solver returns example_state_update, new delta sparse_feature_weights # and delta dense_feature_weights. sparse_weights = [] sparse_indices = [] # If we have partitioned variables, keep a few dictionaries of Tensors # around that we need for the assign_add after the op call to # gen_sdca_ops.sdca_optimizer(). These are keyed because we may have a # mix of partitioned and un-partitioned variables. num_partitions_by_var = {} p_assignments_by_var = {} gather_ids_by_var = {} for v_num, (w, i) in enumerate( zip(self._slots['unshrinked_sparse_features_weights'], sparse_feature_indices)): # Append the sparse_indices (in full-variable space). sparse_idx = math_ops.cast( array_ops.unique(math_ops.cast(i, dtypes.int32))[0], dtypes.int64) sparse_indices.append(sparse_idx) if isinstance(w, list) or isinstance( w, var_ops.PartitionedVariable): num_partitions = len(w) flat_ids = array_ops.reshape(sparse_idx, [-1]) # We use div partitioning, which is easiest to support downstream. # Compute num_total_ids as the sum of dim-0 of w, then assign # to partitions based on a constant number of ids per partition. # Optimize if we already know the full shape statically. dim_0_size = self._get_first_dimension_size_statically( w, num_partitions) if dim_0_size.value: num_total_ids = constant_op.constant( dim_0_size.value, flat_ids.dtype) else: dim_0_sizes = [] for p in range(num_partitions): if w[p].get_shape()[0].value is not None: dim_0_sizes.append(w[p].get_shape()[0].value) else: with ops.colocate_with(w[p]): dim_0_sizes.append( array_ops.shape(w[p])[0]) num_total_ids = math_ops.reduce_sum( math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype)) ids_per_partition = num_total_ids // num_partitions extras = num_total_ids % num_partitions p_assignments = math_ops.maximum( flat_ids // (ids_per_partition + 1), (flat_ids - extras) // ids_per_partition) # Emulate a conditional using a boolean indicator tensor new_ids = array_ops.where( p_assignments < extras, flat_ids % (ids_per_partition + 1), (flat_ids - extras) % ids_per_partition) # Cast partition assignments to int32 for use in dynamic_partition. # There really should not be more than 2^32 partitions. p_assignments = math_ops.cast(p_assignments, dtypes.int32) # Partition list of ids based on assignments into num_partitions # separate lists. gather_ids = data_flow_ops.dynamic_partition( new_ids, p_assignments, num_partitions) # Add these into the dictionaries for use in the later update. num_partitions_by_var[v_num] = num_partitions p_assignments_by_var[v_num] = p_assignments gather_ids_by_var[v_num] = gather_ids # Gather the weights from each partition. partition_gathered_weights = [] for p in range(num_partitions): with ops.colocate_with(w[p]): partition_gathered_weights.append( array_ops.gather(w[p], gather_ids[p])) # Stitch the weights back together in the same order they were before # we dynamic_partitioned them. condition_indices = data_flow_ops.dynamic_partition( math_ops.range(array_ops.shape(new_ids)[0]), p_assignments, num_partitions) batch_gathered_weights = data_flow_ops.dynamic_stitch( condition_indices, partition_gathered_weights) else: w_as_tensor = internal_convert_to_tensor(w) with ops.device(w_as_tensor.device): batch_gathered_weights = array_ops.gather( w_as_tensor, sparse_idx) sparse_weights.append(batch_gathered_weights) # pylint: disable=protected-access if compat.forward_compatible(year=2018, month=10, day=30): esu, sfw, dfw = gen_sdca_ops.sdca_optimizer_v2( sparse_example_indices, sparse_feature_indices, sparse_features_values, self._convert_n_to_tensor( self._examples['dense_features']), internal_convert_to_tensor( self._examples['example_weights']), internal_convert_to_tensor( self._examples['example_labels']), sparse_indices, sparse_weights, self._convert_n_to_tensor( self._slots['unshrinked_dense_features_weights']), example_state_data, loss_type=self._options['loss_type'], l1=self._options['symmetric_l1_regularization'], l2=self._symmetric_l2_regularization(), num_loss_partitions=self._num_loss_partitions(), num_inner_iterations=1, adaptive=self._adaptive()) else: esu, sfw, dfw = gen_sdca_ops.sdca_optimizer( sparse_example_indices, sparse_feature_indices, sparse_features_values, self._convert_n_to_tensor( self._examples['dense_features']), internal_convert_to_tensor( self._examples['example_weights']), internal_convert_to_tensor( self._examples['example_labels']), sparse_indices, sparse_weights, self._convert_n_to_tensor( self._slots['unshrinked_dense_features_weights']), example_state_data, loss_type=self._options['loss_type'], l1=self._options['symmetric_l1_regularization'], l2=self._symmetric_l2_regularization(), num_loss_partitions=self._num_loss_partitions(), num_inner_iterations=1, adaptative=self._adaptive()) # pylint: enable=protected-access with ops.control_dependencies([esu]): update_ops = [self._hashtable.insert(example_ids_hashed, esu)] # Update the weights before the proximal step. for v_num, (w, i, u) in enumerate( zip(self._slots['unshrinked_sparse_features_weights'], sparse_indices, sfw)): if (isinstance(w, var_ops.PartitionedVariable) or isinstance(w, list)): update_ops += self._get_partitioned_update_ops( v_num, num_partitions_by_var, p_assignments_by_var, gather_ids_by_var, w, u, p_assignments, num_partitions) else: update_ops.append(state_ops.scatter_add(w, i, u)) for w, u in zip( self._slots['unshrinked_dense_features_weights'], dfw): if (isinstance(w, var_ops.PartitionedVariable) or isinstance(w, list)): split_updates = array_ops.split( u, num_or_size_splits=[ v.shape.as_list()[0] for v in w ]) for v, split_update in zip(w, split_updates): update_ops.append( state_ops.assign_add(v, split_update)) else: update_ops.append(state_ops.assign_add(w, u)) if not global_step: return control_flow_ops.group(*update_ops) with ops.control_dependencies(update_ops): return state_ops.assign_add(global_step, 1, name=name).op
def model_fn(): if 'CPU' in compute_device: replica_compute_device = '/device:CPU:0' else: replica_id = _get_replica_id_integer() replica_compute_device = ('/device:GPU:%d' % replica_id) replica_compute_device = device_util.canonicalize( replica_compute_device) if 'CPU' in variable_device: replica_variable_device = '/device:CPU:0' else: replica_id = _get_replica_id_integer() replica_variable_device = ('/device:GPU:%d' % replica_id) replica_variable_device = device_util.canonicalize( replica_variable_device) a = constant_op.constant(1.0) b = constant_op.constant(2.0) c = a + b self.assertEqual(a.device, replica_compute_device) self.assertEqual(b.device, replica_compute_device) self.assertEqual(c.device, replica_compute_device) # The device scope is ignored for variables but not for normal ops. with ops.device('/device:GPU:2'): x = variable_scope.get_variable( 'x', initializer=10.0, aggregation=variable_scope.VariableAggregation.SUM) x_add = x.assign_add(c) e = a + c self.assertEqual(device_util.canonicalize(x.device), replica_variable_device) self.assertEqual(x_add.device, x.device) self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2')) # The colocate_vars_with can override the distribution's device. with d.extended.colocate_vars_with(x): y = variable_scope.get_variable( 'y', initializer=20.0, aggregation=variable_scope.VariableAggregation.SUM) # We add an identity here to avoid complaints about summing # non-distributed values. y_add = y.assign_add(array_ops.identity(x_add)) self.assertEqual(device_util.canonicalize(y.device), replica_variable_device) self.assertEqual(y_add.device, y.device) self.assertEqual(y.device, x.device) z = variable_scope.get_variable( 'z', initializer=10.0, aggregation=variable_scope.VariableAggregation.SUM) self.assertEqual(device_util.canonicalize(z.device), replica_variable_device) with ops.control_dependencies([y_add]): # We add an identity here to avoid complaints about summing # non-distributed values. z_add = z.assign_add(array_ops.identity(y)) with ops.control_dependencies([z_add]): f = z + c self.assertEqual(f.device, replica_compute_device) # The device scope would merge with the default worker device. with ops.device('/CPU:1'): g = e + 1.0 self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1')) # Ths ops.colocate_with will be ignored when defining a variale but not # for a normal tensor. with ops.colocate_with(x): u = variable_scope.get_variable('u', initializer=30.0) h = f + 1.0 self.assertEqual(device_util.canonicalize(u.device), replica_variable_device) self.assertEqual(device_util.canonicalize(x.device), device_util.canonicalize(h.device)) return y_add, z_add, f
def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This contains most of the synchronization implementation and also wraps the apply_gradients() from the real optimizer. Args: grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients(). global_step: Optional Variable to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: train_op: The op to dequeue a token so the replicas can exit this batch and start the next one. This is executed by each replica. Raises: ValueError: If the grads_and_vars is empty. ValueError: If global step is not provided, the staleness cannot be checked. """ if not grads_and_vars: raise ValueError("Must supply at least one variable") if global_step is None: raise ValueError("Global step is required to check staleness") self._global_step = global_step train_ops = [] aggregated_grad = [] var_list = [] # local_anchor op will be placed on this worker task by default. local_anchor = control_flow_ops.no_op() # Colocating local_step variable prevents it being placed on the PS. with ops.colocate_with(local_anchor): self._local_step = variable_scope.variable( initial_value=0, trainable=False, collections=[ops.GraphKeys.LOCAL_VARIABLES], dtype=global_step.dtype.base_dtype, name="sync_rep_local_step") self.local_step_init_op = state_ops.assign(self._local_step, global_step) chief_init_ops = [self.local_step_init_op] self.ready_for_local_init_op = variables.report_uninitialized_variables( variables.global_variables()) with ops.name_scope(None, self._name): for grad, var in grads_and_vars: var_list.append(var) with ops.device(var.device): # Dense gradients. if grad is None: aggregated_grad.append(None) # pass-through. continue elif isinstance(grad, ops.Tensor): grad_accum = data_flow_ops.ConditionalAccumulator( grad.dtype, shape=var.get_shape(), shared_name=var.name + "/grad_accum") train_ops.append( grad_accum.apply_grad(grad, local_step=self._local_step)) aggregated_grad.append( grad_accum.take_grad(self._replicas_to_aggregate)) else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = data_flow_ops.SparseConditionalAccumulator( grad.dtype, shape=(), shared_name=var.name + "/grad_accum") train_ops.append( grad_accum.apply_indexed_slices_grad( grad, local_step=self._local_step)) aggregated_grad.append( grad_accum.take_indexed_slices_grad( self._replicas_to_aggregate)) self._accumulator_list.append((grad_accum, var.device)) aggregated_grads_and_vars = zip(aggregated_grad, var_list) # sync_op will be assigned to the same device as the global step. with ops.device(global_step.device), ops.name_scope(""): update_op = self._opt.apply_gradients( aggregated_grads_and_vars, global_step) # Create token queue. with ops.device(global_step.device), ops.name_scope(""): sync_token_queue = (data_flow_ops.FIFOQueue( -1, global_step.dtype.base_dtype, shapes=(), name="sync_token_q", shared_name="sync_token_q")) self._sync_token_queue = sync_token_queue # dummy_queue is passed to the queue runner. Don't use the real queues # because the queue runner doesn't automatically reopen it once it # closed queues in PS devices. dummy_queue = (data_flow_ops.FIFOQueue( 1, types_pb2.DT_INT32, shapes=(), name="dummy_queue", shared_name="dummy_queue")) with ops.device(global_step.device), ops.name_scope(""): # Replicas have to wait until they can get a token from the token queue. with ops.control_dependencies(train_ops): token = sync_token_queue.dequeue() train_op = state_ops.assign(self._local_step, token) with ops.control_dependencies([update_op]): # Sync_op needs to insert tokens to the token queue at the end of the # step so the replicas can fetch them to start the next step. tokens = array_ops.fill([self._tokens_per_step], global_step) sync_op = sync_token_queue.enqueue_many((tokens, )) if self._variable_averages is not None: with ops.control_dependencies([sync_op ]), ops.name_scope(""): sync_op = self._variable_averages.apply( self._variables_to_average) self._chief_queue_runner = queue_runner.QueueRunner( dummy_queue, [sync_op]) for accum, dev in self._accumulator_list: with ops.device(dev): chief_init_ops.append( accum.set_global_step(global_step, name="SetGlobalStep")) self.chief_init_op = control_flow_ops.group(*(chief_init_ops)) self._gradients_applied = True return train_op
def __init__( self, # pylint: disable=super-init-not-called initial_value=None, trainable=None, caching_device=None, name=None, dtype=None, constraint=None, add_initializers_to=None, lifted_initializer_graph=None, **unused_kwargs): """Creates a variable. Args: initial_value: A `Tensor`, or Python object convertible to a `Tensor`, which is the initial value for the Variable. The initial value must have a shape specified unless `validate_shape` is set to False. Can also be a callable with no argument that returns the initial value when called. (Note that initializer functions from init_ops.py must first be bound to a shape before being used here.) trainable: If `True`, GradientTapes automatically watch uses of this Variable. caching_device: Optional device string or function describing where the Variable should be cached for reading. Defaults to the Variable's device. If not `None`, caches on another device. Typical use is to cache on the device where the Ops using the Variable reside, to deduplicate copying through `Switch` and other conditional statements. name: Optional name for the variable. Defaults to `'Variable'` and gets uniquified automatically. dtype: If set, initial_value will be converted to the given type. If None, either the datatype will be kept (if initial_value is a Tensor) or float32 will be used (if it is a Python object convertible to a Tensor). constraint: An optional projection function to be applied to the variable after being updated by an `Optimizer` (e.g. used to implement norm constraints or value constraints for layer weights). The function must take as input the unprojected Tensor representing the value of the variable and return the Tensor for the projected value (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. add_initializers_to: if not None and not in legacy graph mode, the initializer tensor will be added to this map in addition to adding the assignment to the function. lifted_initializer_graph: FuncGraph to try to lift initializers to. Raises: ValueError: If the initial value is not specified, or does not have a shape and `validate_shape` is `True`. RuntimeError: If called outside of a function definition. """ if context.executing_eagerly(): # If we've been init_scope()d out of the function definition nothing to do # here; we can't really do the capturing or conditional logic. resource_variable_ops.ResourceVariable.__init__( self, initial_value=initial_value, trainable=trainable, caching_device=caching_device, name=name, dtype=dtype, constraint=constraint) return with ops.init_scope(): self._in_graph_mode = not context.executing_eagerly() if initial_value is None: raise ValueError("initial_value must be specified.") init_from_fn = callable(initial_value) if constraint is not None and not callable(constraint): raise ValueError("The `constraint` argument must be a callable.") if isinstance(initial_value, checkpointable.CheckpointInitialValue): self._maybe_initialize_checkpointable() self._update_uid = initial_value.checkpoint_position.restore_uid initial_value = initial_value.wrapped_value if trainable is None: trainable = True self._trainable = trainable self._save_slice_info = None self._initial_value = None self._initializer_op = None self._is_initialized_op = None self._graph_element = None self._cached_value = None # Store the graph key so optimizers know how to only retrieve variables from # this graph. Guaranteed to be the same as the eager graph_key. self._graph_key = ops.get_default_graph()._graph_key # pylint: disable=protected-access with ops.name_scope(name, "Variable", [] if init_from_fn else [initial_value]) as name: # pylint: disable=protected-access with ops.init_scope(): handle_name = ops._name_from_scope_name(name) unique_id = "%s_%d" % (handle_name, ops.uid()) shared_name = context.shared_name(unique_id) with ops.name_scope("Initializer"), ops.device(None): initial_value = ops.convert_to_tensor( initial_value() if init_from_fn else initial_value, name="initial_value", dtype=dtype) with ops.init_scope(): self._handle = resource_variable_ops.eager_safe_variable_handle( initial_value=initial_value, shared_name=shared_name, name=name, graph_mode=self._in_graph_mode) self._shape = initial_value.shape self._unique_id = unique_id self._handle_name = handle_name + ":0" self._dtype = initial_value.dtype.base_dtype self._constraint = constraint assert initial_value is not None if self._in_graph_mode: with ops.init_scope(): outer_graph = ops.get_default_graph() lifted_initializer = lift_to_graph.lift_to_graph( initial_value, outer_graph)[initial_value] with ops.init_scope(): self._initial_value = lifted_initializer with ops.name_scope("IsInitialized"): self._is_initialized_op = ( resource_variable_ops.var_is_initialized_op( self._handle)) if initial_value is not None: with ops.name_scope("Assign") as n, ops.colocate_with( self._handle): self._initializer_op = resource_variable_ops.assign_variable_op( self._handle, lifted_initializer, name=n) with ops.name_scope("Read"), ops.colocate_with( self._handle): # Manually assign reads to the handle's device to avoid log # messages. with ops.device(self._handle.device): value = self._read_variable_op() self._graph_element = value ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self) else: if add_initializers_to is not None: add_initializers_to[self] = initial_value def assign_fn(): with ops.name_scope("Assign") as n, ops.colocate_with( self._handle): resource_variable_ops.assign_variable_op(self._handle, initial_value, name=n) # Returning values to keep tf.cond happy. return ops.convert_to_tensor(1) def not_assign_fn(): return ops.convert_to_tensor(0) # Note: this cond is always guaranteed to run because we're inside a # defun which will insert automatic control dependencies. control_flow_ops.cond( resource_variable_ops.var_is_initialized_op(self._handle), not_assign_fn, assign_fn) # After the handle has been created, set up a way to clean it up when # executing eagerly. We'll hold the only reference to the deleter, so that # when this object is garbage collected the deleter will be too. This # means ResourceVariables can be part of reference cycles without those # cycles being uncollectable. if not self._in_graph_mode: self._handle_deleter = resource_variable_ops.EagerResourceDeleter( handle=self._handle, handle_device=self._handle.device) self._cached_shape_as_list = None
def _create_variable(self, next_creator, **kwargs): """Implements StrategyExtendedV2._create_variable. Creates a `Variable` or a `ShardedVariable`. A `ShardedVariable` will be created if satisfying all the following criteria: 1. `self._variable_partitioner` results in more than one partition on the first axis. 2. variable's rank is greater than 0. 3. variable is not colocated with another variable. Otherwise a `Variable` will be created. Args: next_creator: See `variable_scope.variable_creator_scope`; the next creator in the chain. **kwargs: Passed through to the next creator. Returns: A `Variable` or `ShardedVariable`. """ var_creator = self._create_var_creator(next_creator, **kwargs) if "colocate_with" in kwargs: # Never partition colocated_with variables. colocate_with = kwargs["colocate_with"] # Clear the variable scope to avoid possible conflicts between device # scope and colocation scope. with ops.device(None): with ops.colocate_with(colocate_with): var = var_creator(**kwargs) logging.debug( "Creating variable (name:%s, shape:%r) that colocates with %s", var.name, var.shape, kwargs["colocate_with"].name) return var if self._variable_partitioner is None: return self._create_variable_round_robin(var_creator, **kwargs) name = kwargs.get("name", None) dtype = kwargs.get("dtype", None) shape = kwargs.get("shape", None) initial_value = kwargs.get("initial_value", None) if initial_value is None: # If we are loading, next_creator will return an UninitializedVariable v = next_creator(**kwargs) if not isinstance(v, resource_variable_ops.UninitializedVariable): raise ValueError( "It looks like you are using `ParameterServerStrategy` with a " "`variable_partitioner`, and trying to create a variable without " "specifying `initial_value`. This is not allowed. Please specify the " "`initial_value`.") elif shape is None or dtype is None: raise ValueError( "It looks like you are trying to load a `SavedModel` using " "`tf.saved_model.load` within a `ParameterServerStrategy` scope, " "but the `SavedModel` is missing shape or dtype information.") else: def initializer(shape, dtype, **kwargs): if "partition_shape" in kwargs: shape = kwargs["partition_shape"] return array_ops.zeros(shape, dtype) initial_value = functools.partial(initializer, shape=shape, dtype=dtype) # Two cases where initial_value can be a callable: # 1. initial_value is passed as a callable, e.g, an `initializer` class. # 2. restoring from checkpoint, initial_value is a # "CheckpointInitialValueCallable". init_from_fn = callable(initial_value) if init_from_fn and (shape is None or dtype is None): init_from_fn = False initial_value = initial_value() if not init_from_fn: # The initial_value is created on coordinator, it will need to be sent to # ps for variable initialization, which can be inefficient and can # potentially hit the 2GB limit on protobuf serialization. initial_value = ops.convert_to_tensor(initial_value, dtype=dtype) dtype = initial_value.dtype shape = initial_value.shape else: shape = tensor_shape.as_shape(shape) if shape.rank == 0: # Skip partitioning rank-0 variable. return self._create_variable_round_robin(var_creator, **kwargs) num_partitions = self._variable_partitioner(shape=shape, dtype=dtype) if not num_partitions or num_partitions[0] == 0 or any( v != 1 for v in num_partitions[1:]): raise ValueError( "variable_partitioner must return a list/tuple whose elements are 1" " besides the first element (non-zero), got: %r" % num_partitions) if num_partitions[0] == 1: # no partition return self._create_variable_round_robin(var_creator, **kwargs) # Use "div" partition strategy to partition the variable. num_partitions = min(num_partitions[0], shape[0]) base = shape[0] // num_partitions extra = shape[0] % num_partitions # An example: num_partitions=4, shape[0]=10, partitions: [3, 3, 2, 2] # offsets: [0, 3, 6, 8, 10] offsets = [] for i in range(num_partitions): if i == 0: offsets.append(0) else: prev_shard_size = base + (1 if i - 1 < extra else 0) offsets.append(offsets[i - 1] + prev_shard_size) offsets.append(shape[0]) def init_shard_fn(shard_index): if not init_from_fn: logging.log_if( logging.WARN, _INEFFICIENT_INIT_WARNING % name, shard_index == 0 and shape.num_elements() > _LARGE_VARIABLE_NUM_ELEMENTS) return initial_value[offsets[shard_index]:offsets[shard_index + 1]] partition_shape = (offsets[shard_index + 1] - offsets[shard_index],) + shape[1:] partition_offset = (offsets[shard_index],) + (0,) * len(shape[1:]) arg_spec = tf_inspect.getfullargspec(initial_value) if ("shard_info" not in arg_spec.args and "shard_info" not in arg_spec.kwonlyargs): try: value = initial_value( partition_shape=partition_shape, partition_offset=partition_offset) except (TypeError, ValueError): # TypeError: Initializer doesn't accept kwargs # ValueError: Initializer doesn't accept partition kwargs # In both cases we go ahead creating the full value and then slice. value = initial_value() if value.shape == partition_shape: # Initializer supports partition: value is the partition value. return value else: # Initializer doesn't support partition: value is the full value # and needs to be sliced to get the partition value. logging.log_if( logging.WARN, _INEFFICIENT_INIT_WARNING % name, shard_index == 0 and shape.num_elements() > _LARGE_VARIABLE_NUM_ELEMENTS) return value[offsets[shard_index]:offsets[shard_index + 1]] else: # For compatibility with `CheckpointInitialValueCallable`. return initial_value( shard_info=trackable.ShardInfo( shape=tensor_shape.as_shape(partition_shape), offset=partition_offset)) var_list = [] for i in range(num_partitions): kwargs["shape"] = (offsets[i + 1] - offsets[i],) + shape[1:] kwargs["initial_value"] = lambda: init_shard_fn(i) if name is not None: kwargs["name"] = "{}/part_{}".format(name, i) var_list.append(self._create_variable_round_robin(var_creator, **kwargs)) result = sharded_variable.ShardedVariable(var_list) return result
def _stitch(values, indices): if len(values) == 1: return values[0] with ops.colocate_with(indices[0], ignore_existing=True): all_values = data_flow_ops.dynamic_stitch(indices, values) return all_values
def __init__(self, initial_value=None, trainable=None, caching_device=None, name=None, dtype=None, constraint=None, add_initializers_to=None, lifted_initializer_graph=None, synchronization=None, aggregation=None, shape=None, **unused_kwargs): """Creates a variable. Args: initial_value: A `Tensor`, or Python object convertible to a `Tensor`, which is the initial value for the Variable. The initial value must have a shape specified unless `validate_shape` is set to False. Can also be a callable with no argument that returns the initial value when called. (Note that initializer functions from init_ops.py must first be bound to a shape before being used here.) trainable: If `True`, GradientTapes automatically watch uses of this Variable. caching_device: Optional device string or function describing where the Variable should be cached for reading. Defaults to the Variable's device. If not `None`, caches on another device. Typical use is to cache on the device where the Ops using the Variable reside, to deduplicate copying through `Switch` and other conditional statements. name: Optional name for the variable. Defaults to `'Variable'` and gets uniquified automatically. dtype: If set, initial_value will be converted to the given type. If None, either the datatype will be kept (if initial_value is a Tensor) or float32 will be used (if it is a Python object convertible to a Tensor). constraint: An optional projection function to be applied to the variable after being updated by an `Optimizer` (e.g. used to implement norm constraints or value constraints for layer weights). The function must take as input the unprojected Tensor representing the value of the variable and return the Tensor for the projected value (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. add_initializers_to: if not None and not in legacy graph mode, the initializer tensor will be added to this map in addition to adding the assignment to the function. lifted_initializer_graph: FuncGraph to try to lift initializers to. synchronization: Indicates when a distributed a variable will be aggregated. Accepted values are constants defined in the class `tf.VariableSynchronization`. By default the synchronization is set to `AUTO` and the current `DistributionStrategy` chooses when to synchronize. aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class `tf.VariableAggregation`. shape: (optional) The shape of this variable. If None, the shape of `initial_value` will be used. When setting this argument to `tf.TensorShape(None)` (representing an unspecified shape), the variable can be assigned with values of different shapes. Raises: ValueError: If the initial value is not specified, or does not have a shape and `validate_shape` is `True`. RuntimeError: If called outside of a function definition. """ if not ops.inside_function(): # If we've been init_scope()d out of the function definition nothing to do # here; we can't really do the capturing or conditional logic. resource_variable_ops.ResourceVariable.__init__( self, initial_value=initial_value, trainable=trainable, caching_device=caching_device, name=name, dtype=dtype, constraint=constraint) return if initial_value is None: raise ValueError("initial_value must be specified.") init_from_fn = callable(initial_value) if constraint is not None and not callable(constraint): raise ValueError("The `constraint` argument must be a callable.") if isinstance(initial_value, trackable.CheckpointInitialValue): self._maybe_initialize_trackable() self._update_uid = initial_value.checkpoint_position.restore_uid initial_value = initial_value.wrapped_value with ops.name_scope(name, "Variable", [] if init_from_fn else [initial_value]) as scope_name: with ops.name_scope("Initializer"), ops.device(None): initial_value = ops.convert_to_tensor( initial_value() if init_from_fn else initial_value, name="initial_value", dtype=dtype) assert initial_value is not None # Don't use `shape or initial_value.shape` since TensorShape has # overridden `__bool__`. if shape is None: shape = initial_value.shape # Use the constructor for UninitializedVariable to start. Outside the name # scope so we don't double up the prefix. super(UnliftedInitializerVariable, self).__init__( trainable=trainable, caching_device=caching_device, name=name, shape=shape, dtype=initial_value.dtype, constraint=constraint, synchronization=synchronization, aggregation=aggregation, extra_handle_data=initial_value, **unused_kwargs) with ops.name_scope(scope_name): if self._in_graph_mode: with ops.init_scope(): outer_graph = ops.get_default_graph() func_graph = ops.get_default_graph() function_placeholders = ( func_graph.inputs + func_graph.internal_captures) placeholder_ops = set( [tensor.op for tensor in function_placeholders]) lifted_initializer = lift_to_graph.lift_to_graph( [initial_value], outer_graph, disallowed_placeholders=placeholder_ops)[initial_value] with ops.init_scope(): self._initial_value = lifted_initializer with ops.name_scope("IsInitialized"): self._is_initialized_op = ( resource_variable_ops.var_is_initialized_op(self._handle)) if initial_value is not None: with ops.name_scope("Assign") as n, ops.colocate_with(self._handle): self._initializer_op = resource_variable_ops.assign_variable_op( self._handle, lifted_initializer, name=n) else: if add_initializers_to is not None: add_initializers_to[self] = initial_value def assign_fn(): with ops.name_scope("Assign") as n, ops.colocate_with(self._handle): resource_variable_ops.assign_variable_op( self._handle, initial_value, name=n) # Returning values to keep tf.cond happy. return ops.convert_to_tensor(1) def not_assign_fn(): return ops.convert_to_tensor(0) # Note: this cond is always guaranteed to run because we're inside a # defun which will insert automatic control dependencies. control_flow_ops.cond( resource_variable_ops.var_is_initialized_op(self._handle), not_assign_fn, assign_fn)
def pack(self, grouped_grads_and_vars): """Pack tensors.""" self.grouped_grads_and_vars = grouped_grads_and_vars self.all_tower_shapes = [] self.all_tower_sizes = [] device_grad_packs = [] for tower_grads_and_vars in grouped_grads_and_vars: with ops.colocate_with(tower_grads_and_vars[0][0]): # Flatten all the grads. flat_grads = [ array_ops.reshape(g, [-1]) for g, _ in tower_grads_and_vars ] # Remember the original shape of all the grads. tower_shapes = [ array_ops.shape(g) for g, _ in tower_grads_and_vars ] # Remember the original sizes of all the grads. tower_sizes = [ array_ops.size(g) for g, _ in tower_grads_and_vars ] # Concat all the flat grads into a big flat tensor. concat_grads = array_ops.concat(flat_grads, 0) # Split the big tensor into num_splits packs. In cases where the # total size is not divisible num_splits, the last pack gets # more elements. # TODO(zhengxq): it is also possible to optimize away all the concat # as well. num_splits = self.num_packs # The array_ops.size function will sometimes remove static shapes. So if # all gradient shapes are defined, we use another method to get the # total size. # TODO(yuefengz): move this logic to array_ops.size. if all([ g.shape.is_fully_defined() for g, _ in tower_grads_and_vars ]): total_grad_size = sum([ g.shape.num_elements() for g, _ in tower_grads_and_vars ]) else: total_grad_size = array_ops.size(concat_grads) split_size = total_grad_size // num_splits split_size_last = total_grad_size - split_size * (num_splits - 1) split_sizes = [split_size] * (num_splits - 1) + [ split_size_last ] grad_packs = array_ops.split(concat_grads, split_sizes) # Ready to aggregate the repacked gradients, with fake variables. # TODO(zhengxq): It is hacky to have to use fake variables. # We should remove the need for variables in # aggregate_gradients_using*. device_grad_packs.append(zip(grad_packs, [None] * num_splits)) self.all_tower_shapes.append(tower_shapes) self.all_tower_sizes.append(tower_sizes) return device_grad_packs