def _logits_cumulative(self, inputs, stop_gradient):
  """Compute the logits of the cumulative densities at `inputs`.

  Args:
    inputs: Points at which the cumulative densities are evaluated; expected
      to be a `Tensor` of shape `(channels, 1, batch)`.
    stop_gradient: Boolean. When true, every density-model parameter (matrix,
      bias, factor) is wrapped in `array_ops.stop_gradient`, so the output's
      gradient with respect to those parameters is cut while the gradient
      with respect to `inputs` is left untouched.

  Returns:
    A `Tensor` with the same shape as `inputs` holding the logits of the
    cumulative densities at the given points.
  """

  def _param(tensor):
    # Optionally detach a model parameter from the gradient graph.
    return array_ops.stop_gradient(tensor) if stop_gradient else tensor

  num_layers = len(self.filters) + 1
  num_factors = len(self._factors)

  result = inputs
  for layer in range(num_layers):
    result = math_ops.matmul(_param(self._matrices[layer]), result)
    result += _param(self._biases[layer])
    # Only layers that have a factor get the gated tanh term.
    if layer < num_factors:
      result += _param(self._factors[layer]) * math_ops.tanh(result)
  return result
def _run_test(self, x_, use_deferred_shape=False, **kwargs):
  """Check `du.fill_triangular` values and gradients against the reference.

  Args:
    x_: Array-like input; converted with `np.asarray`.
    use_deferred_shape: If true, the placeholder is created without a static
      shape, and the result is expected to have no static shape either.
    **kwargs: Forwarded to both `du.fill_triangular` and the reference
      `self._fill_triangular`.
  """
  x_ = np.asarray(x_)
  with self.cached_session() as sess:
    if use_deferred_shape:
      placeholder_shape = None
    else:
      placeholder_shape = x_.shape
    x_pl = array_ops.placeholder_with_default(x_, shape=placeholder_shape)

    # Build a term whose value is zero but whose gradient wrt `x_pl` is
    # `x_pl - 1`. Adding it to `x_pl` makes the gradient of the input equal to
    # its value, so every gradient entry can be matched to its location
    # (without it the gradient would simply be `ones_like(x)`).
    detached_shift = array_ops.stop_gradient(x_pl - 1.)
    scaled = x_pl * detached_shift
    detached_product = array_ops.stop_gradient(x_pl * (x_pl - 1.))
    zero_term = scaled - detached_product
    x = x_pl + zero_term

    actual = du.fill_triangular(x, **kwargs)
    grad_actual = gradients_impl.gradients(actual, x_pl)[0]
    fetched = sess.run([actual, grad_actual], feed_dict={x_pl: x_})
    actual_, grad_actual_ = fetched

  expected = self._fill_triangular(x_, **kwargs)
  if not use_deferred_shape:
    self.assertAllEqual(expected.shape, actual.shape)
  else:
    self.assertEqual(None, actual.shape)
  self.assertAllClose(expected, actual_, rtol=1e-8, atol=1e-9)
  self.assertAllClose(x_, grad_actual_, rtol=1e-8, atol=1e-9)
def _create_value(self):
  """Build and return the value Tensor selected by `self._value_type`.

  A `MeanValue` yields the distribution mean; a `SampleValue` yields a sample
  of shape `self._value_type.shape`.

  Returns:
    The value `Tensor`. It is wrapped in `array_ops.stop_gradient` when the
    value type requests it, or when it is a sample from a distribution that
    is not both continuous and fully reparameterized.

  Raises:
    TypeError: if `self._value_type` is neither a `MeanValue` nor a
      `SampleValue`.
  """
  if isinstance(self._value_type, MeanValue):
    value_tensor = self._dist.mean()
  elif isinstance(self._value_type, SampleValue):
    value_tensor = self._dist.sample(self._value_type.shape)
  else:
    # Format the offending value into the message; passing it as a second
    # exception argument would leave the "%s" unexpanded.
    raise TypeError(
        "Unrecognized Distribution Value Type: %s" % (self._value_type,))

  if self._value_type.stop_gradient:
    # stop_gradient is being enforced by the value type.
    return array_ops.stop_gradient(value_tensor)

  if isinstance(self._value_type, MeanValue):
    return value_tensor  # Using pathwise-derivative for this one.

  if self._dist.is_continuous and (
      self._dist.reparameterization_type
      is distribution.FULLY_REPARAMETERIZED):
    return value_tensor  # Using pathwise-derivative for this one.
  else:
    # Some variant of score function estimation will be needed. Stop the
    # gradient on the sampler so none leaks through it by accident.
    return array_ops.stop_gradient(value_tensor)
def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None):
  """Estimates the largest singular value in the weight tensor.

  Args:
    w_tensor: The weight matrix whose spectral norm should be computed.
    power_iteration_rounds: The number of iterations of the power method to
      perform. Must be at least 1. A higher number yields a better
      approximation.
    name: An optional scope name.

  Returns:
    The largest singular value (the spectral norm) of w.

  Raises:
    ValueError: if `power_iteration_rounds` is less than 1.
  """
  # At least one round is needed: the right singular vector `v` is only
  # produced inside the power-iteration loop.
  if power_iteration_rounds < 1:
    raise ValueError(
        'power_iteration_rounds must be at least 1, got %s.' %
        (power_iteration_rounds,))

  with variable_scope.variable_scope(name, 'spectral_norm'):
    # The paper says to flatten convnet kernel weights from
    # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's Conv2D
    # kernel weight shape is (KH, KW, C_in, C_out), so it should be reshaped to
    # (KH * KW * C_in, C_out), and similarly for other layers that put output
    # channels as last dimension.
    # n.b. this means that w here is equivalent to w.T in the paper.
    w = array_ops.reshape(w_tensor, (-1, w_tensor.get_shape()[-1]))

    # Persisted approximation of first left singular vector of matrix `w`.
    u_var = variable_scope.get_variable(
        _PERSISTED_U_VARIABLE_SUFFIX,
        shape=(w.shape[0], 1),
        dtype=w.dtype,
        initializer=init_ops.random_normal_initializer(),
        trainable=False)
    u = u_var

    # Use power iteration method to approximate spectral norm.
    for _ in range(power_iteration_rounds):
      # `v` approximates the first right singular vector of matrix `w`.
      v = nn.l2_normalize(math_ops.matmul(array_ops.transpose(w), u))
      u = nn.l2_normalize(math_ops.matmul(w, v))

    # Update persisted approximation.
    with ops.control_dependencies([u_var.assign(u, name='update_u')]):
      u = array_ops.identity(u)

    # The singular vectors are treated as constants when differentiating.
    u = array_ops.stop_gradient(u)
    v = array_ops.stop_gradient(v)

    # Largest singular value of `w`.
    spectral_norm = math_ops.matmul(
        math_ops.matmul(array_ops.transpose(u), w), v)
    spectral_norm.shape.assert_is_fully_defined()
    spectral_norm.shape.assert_is_compatible_with([1, 1])

    return spectral_norm[0][0]
def _MakeGraph(rng, stop_gradients=()):
  """Build a small four-node graph `a -> b -> c -> d` of random affine maps.

  Args:
    rng: Random source providing `rand(k, k)`.
    stop_gradients: Collection of node names ("a", "b", "c", "d") whose
      tensors are wrapped in `array_ops.stop_gradient`.

  Returns:
    A dict mapping "a", "b", "c" and "d" to the corresponding tensors.
  """

  def _random_affine(inputs, size=3):
    # Sum of random-matrix products of the inputs, plus a random offset.
    mixed = sum(math_ops.matmul(rng.rand(size, size), t) for t in inputs)
    return ops.convert_to_tensor(mixed + rng.rand(size, size))

  def _node(label, inputs):
    tensor = _random_affine(inputs)
    if label in stop_gradients:
      tensor = array_ops.stop_gradient(tensor)
    return tensor

  a = _node("a", [])
  b = _node("b", [a])
  c = _node("c", [a, b])
  d = _node("d", [b, c])
  return {"a": a, "b": b, "c": c, "d": d}
def _statistics(x, axes):
  """Compute the mean and the mean of squares of `x` over `axes`.

  Modified from the implementation of `tf.nn.moments`.

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which the statistics are reduced.

  Returns:
    Two `Tensor` objects: the mean and the mean square, both with the reduced
    axes squeezed out and cast back to float16 if `x` was float16.
  """
  # float16 has too little dynamic range to accumulate the statistics, so
  # half-precision input is processed in float32 and cast back at the end.
  is_half = x.dtype == dtypes.float16
  values = math_ops.cast(x, dtypes.float32) if is_half else x

  # Centre on the (gradient-stopped) true mean, keeping dims for broadcasting.
  center = array_ops.stop_gradient(
      math_ops.reduce_mean(values, axes, keepdims=True))
  centered_mean = math_ops.reduce_mean(values - center, axes, keepdims=True)
  mean_kept = centered_mean + center
  square_mean_kept = math_ops.reduce_mean(
      math_ops.square(values), axes, keepdims=True)

  mean = array_ops.squeeze(mean_kept, axes)
  mean_squared = array_ops.squeeze(square_mean_kept, axes)

  if not is_half:
    return (mean, mean_squared)
  return (math_ops.cast(mean, dtypes.float16),
          math_ops.cast(mean_squared, dtypes.float16))
def surrogate_loss(sample_losses, stochastic_tensors=None,
                   name="SurrogateLoss"):
  """Surrogate loss for stochastic graphs.

  Calls `.loss(...)` on each `StochasticTensor` upstream of `sample_losses`,
  passing it the (gradient-stopped) losses it influenced, and adds the
  non-`None` results to the sample losses.

  Note that currently `surrogate_loss` does not work with `StochasticTensor`s
  instantiated in `while_loop`s or other control structures.

  Args:
    sample_losses: a list or tuple of final losses. Each loss should be per
      example in the batch (and possibly per sample); that is, it should have
      dimensionality of 1 or greater. All losses should have the same shape.
    stochastic_tensors: a list of `StochasticTensor`s to add loss terms for.
      If None, defaults to all `StochasticTensor`s in the graph upstream of
      the `Tensor`s in `sample_losses`.
    name: the name with which to prepend created ops.

  Returns:
    `Tensor` loss: the sum of `sample_losses` and the loss terms returned by
    the `StochasticTensor`s.

  Raises:
    TypeError: if `sample_losses` is not a list or tuple, or if its elements
      are not `Tensor`s.
    ValueError: if any loss in `sample_losses` does not have dimensionality 1
      or greater.
  """
  with ops.op_scope(sample_losses, name):
    if not isinstance(sample_losses, (list, tuple)):
      raise TypeError("sample_losses must be a list or tuple")

    detached_losses = []
    for loss in sample_losses:
      if not isinstance(loss, ops.Tensor):
        raise TypeError("loss is not a Tensor: %s" % loss)
      rank = loss.get_shape().ndims
      if rank is None or rank < 1:
        raise ValueError(
            "loss must have dimensionality 1 or greater: %s" % loss)
      detached_losses.append(array_ops.stop_gradient(loss))

    dependencies = _stochastic_dependencies_map(
        detached_losses, stochastic_tensors=stochastic_tensors)
    if not dependencies:
      logging.warn(
          "No collection of Stochastic Tensors found for current graph.")
      return math_ops.add_n(sample_losses)

    # Start from the sample losses and append one surrogate term per
    # stochastic node that provides one.
    terms = [ops.convert_to_tensor(loss) for loss in sample_losses]
    for node, influenced_losses in dependencies.items():
      node_term = node.loss(list(influenced_losses))
      if node_term is None:
        continue
      terms.append(node_term)
    return math_ops.add_n(terms)
def score_function(stochastic_tensor, value, loss, baseline=None,
                   name="ScoreFunction"):
  """Score function estimator.

  Builds the integrand of the score function with an optional baseline:
  `p.log_prob(value) * (loss - baseline)`. The advantage `(loss - baseline)`
  is wrapped in `stop_gradient`.

  Args:
    stochastic_tensor: `StochasticTensor` p(x).
    value: `Tensor` x. Samples from p(x).
    loss: `Tensor`.
    baseline: `Tensor` broadcastable to `loss`, or None for no baseline.
    name: name to prepend ops with.

  Returns:
    `Tensor` `p.log_prob(x) * (loss - b)`. Taking the gradient yields the
    score function estimator.
  """
  with ops.name_scope(name, values=[value, loss, baseline]):
    value_t = ops.convert_to_tensor(value)
    loss_t = ops.convert_to_tensor(loss)
    if baseline is None:
      raw_advantage = loss_t
    else:
      raw_advantage = loss_t - ops.convert_to_tensor(baseline)
    # The advantage acts as a constant weight on the log-probability.
    fixed_advantage = array_ops.stop_gradient(raw_advantage)
    log_prob = stochastic_tensor.distribution.log_prob(value_t)
    return log_prob * fixed_advantage
def _tree_train_op_fn(loss):
  """Returns the op to optimize the loss.

  When `dnn_to_tree_distillation_param` is set, a weighted DNN-to-tree
  distillation loss (computed against gradient-stopped DNN logits) is added
  to `loss` before training the GBDT model. The returned op increments
  `global_step` after the model's training update has run.
  """
  if dnn_to_tree_distillation_param:
    distill_weight, distill_fn = dnn_to_tree_distillation_param
    example_weights = head_lib._weight_tensor(  # pylint: disable=protected-access
        features, head.weight_column_name)
    # The DNN logits are targets here, so no gradient flows back into them.
    frozen_dnn_logits = array_ops.stop_gradient(dnn_logits)

    if distill_fn is None:
      # Default to a loss_fn similar to the head loss_fn previously used for
      # multi_class_head.
      if head.logits_dimension == 1:
        num_classes = 2
      else:
        num_classes = head.logits_dimension
      distill_fn = (
          distillation_loss.create_dnn_to_tree_cross_entropy_loss_fn(
              num_classes))

    distill_loss = distill_weight * distill_fn(
        frozen_dnn_logits, tree_logits, example_weights)
    summary.scalar("dnn_to_tree_distillation_loss", distill_loss)
    loss += distill_loss

  train_update = gbdt_model.train(loss, predictions_dict, labels)
  with ops.control_dependencies([train_update]):
    with ops.colocate_with(global_step):
      step_increment = state_ops.assign_add(global_step, 1).op
  return step_increment
def additional_score_function_losses(sample_losses, name=None):
  """Collect score-function loss terms for stochastic nodes in the graph.

  Each loss is validated and gradient-stopped, the stochastic dependencies of
  the losses are looked up, and `score_function(...)` is called on every
  stochastic node with the losses it influenced.

  Args:
    sample_losses: a list or tuple of `Tensor` losses, each a scalar or a
      batch-length vector (known rank of at most 1).
    name: optional name scope for the created ops.

  Returns:
    A list with one identity-wrapped `Tensor` per stochastic node whose
    `score_function` returned a non-`None` value; an empty list if no
    stochastic dependencies were found.

  Raises:
    TypeError: if `sample_losses` is not a list or tuple, or if its elements
      are not `Tensor`s.
    ValueError: if a loss has unknown rank or rank greater than 1.
  """
  with ops.op_scope(sample_losses, name, "SampleLosses"):
    if not isinstance(sample_losses, (list, tuple)):
      raise TypeError("sample_losses must be a list or tuple")

    detached_losses = []
    for loss in sample_losses:
      if not isinstance(loss, ops.Tensor):
        raise TypeError("loss is not a Tensor: %s" % loss)
      rank = loss.get_shape().ndims
      if rank is None or rank > 1:
        raise ValueError(
            "loss must be a scalar or batch-length vector loss: %s" % loss)
      detached_losses.append(array_ops.stop_gradient(loss))

    dependencies = _stochastic_dependencies_map(detached_losses)
    if not dependencies:
      logging.warn(
          "No collection of Stochastic Tensors found for current graph.")
      return []

    # Walk every stochastic dependency and keep the terms that exist.
    collected = []
    for node, influenced_losses in dependencies.items():
      node_term = node.score_function(list(influenced_losses))
      if node_term is None:
        continue
      with ops.name_scope("ScoreFunction_%s" % node.name):
        collected.append(array_ops.identity(node_term))
    return collected
def resample_at_rate(inputs, rates, scope=None, seed=None, back_prop=False):
  """Given `inputs` tensors, stochastically resamples each at a given rate.

  For example, if the inputs are `[[a1, a2], [b1, b2]]` and the rates tensor
  contains `[3, 1]`, then the return value may look like `[[a1, a2, a1, a1],
  [b1, b2, b1, b1]]`. However, many other outputs are possible, since this is
  stochastic -- averaged over many repeated calls, each set of inputs should
  appear in the output `rate` times the number of invocations.

  Args:
    inputs: A list of tensors, each of which has a shape of `[batch_size, ...]`
    rates: A tensor of shape `[batch_size]` containing the resampling rates
      for each input.
    scope: Scope for the op.
    seed: Random seed to use.
    back_prop: Whether to allow back-propagation through this op.

  Returns:
    Selections from the input tensors.
  """
  with ops.name_scope(scope, default_name='resample_at_rate',
                      values=list(inputs) + [rates]):
    rates_t = ops.convert_to_tensor(rates, name='rates')

    def _draw_counts():
      return random_ops.random_poisson(rates_t, (), rates_t.dtype, seed=seed)

    def _no_counts():
      return array_ops.zeros(shape=[0], dtype=rates_t.dtype)

    # random_poisson does not support rates of size 0 (b/36076216), so an
    # empty batch takes the zero-length branch instead.
    has_rates = array_ops.shape(rates_t)[0] > 0
    raw_counts = control_flow_ops.cond(has_rates, _draw_counts, _no_counts)
    counts = math_ops.cast(raw_counts, dtypes.int32)

    picks = _repeat_range(counts)
    if not back_prop:
      picks = array_ops.stop_gradient(picks)
    return [array_ops.gather(tensor, picks) for tensor in inputs]
def _logspace_mean(log_values):
  """Evaluate `Log[E[values]]` in a stable manner.

  Args:
    log_values: `Tensor` holding `Log[values]`.

  Returns:
    `Tensor` of same `dtype` as `log_values`, reduced across dim 0.
    `Log[Mean[values]]`.
  """
  # Centre on `_sample_max(log_values)` (by its name, the max over the sample
  # dimension -- confirm against the helper) so the exponentiated terms stay
  # small. The offset is added back below and cancels from the result, so
  # stopping its gradient does not change the answer; it only avoids
  # unnecessary computation.
  offset = array_ops.stop_gradient(_sample_max(log_values))

  # Log[ E[exp{log_values - offset}] ] + offset
  #   = Log[E[values]] - offset + offset
  #   = Log[E[values]]
  shifted = math_ops.exp(log_values - offset)
  return math_ops.log(_sample_mean(shifted)) + offset
def _AvgPoolGradGrad(op, grad):
  """Gradient function for the average-pool gradient op.

  Args:
    op: The op being differentiated; its `ksize`, `strides`, `padding` and
      `data_format` attributes are reused.
    grad: Incoming gradient `Tensor`.

  Returns:
    A pair: the gradient-stopped first input of `op`, and `grad` average-pooled
    with the same attributes as `op`.
  """
  first_input_fixed = array_ops.stop_gradient(op.inputs[0])
  ksize = op.get_attr("ksize")
  strides = op.get_attr("strides")
  padding = op.get_attr("padding")
  data_format = op.get_attr("data_format")
  pooled_grad = gen_nn_ops._avg_pool(
      grad, ksize, strides, padding, data_format=data_format)
  return (first_input_fixed, pooled_grad)
def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
  """Beam-search step: pick the top `beam_size` continuations and embed them.

  Args:
    prev: Previous decoder output; projected with `output_projection` when
      one is set.
    i: Step number. From step 2 on, the running beam log-probabilities are
      added before the top-k selection.
    log_beam_probs: List of per-step best log-probabilities; this step's
      values are appended.
    beam_path: List of per-step parent-hypothesis indices; appended to.
    beam_symbols: List of per-step chosen symbols; appended to.

  Returns:
    The embeddings of the chosen symbols reshaped to
    `[beam_size, embedding_size]`, gradient-stopped unless `update_embedding`
    is set.
  """
  logits = prev
  if output_projection is not None:
    logits = nn_ops.xw_plus_b(
        logits, output_projection[0], output_projection[1])

  log_probs = tf.log(tf.nn.softmax(logits))
  if i > 1:
    # Combine with the scores accumulated so far and flatten the beam and
    # symbol axes so that top_k ranks every (hypothesis, symbol) pair.
    accumulated = log_probs + log_beam_probs[-1]
    log_probs = tf.reshape(accumulated, [-1, beam_size * num_symbols])

  top_scores, top_indices = tf.nn.top_k(log_probs, beam_size)
  flat_indices = tf.stop_gradient(
      tf.squeeze(tf.reshape(top_indices, [-1, 1])))
  top_scores = tf.stop_gradient(tf.reshape(top_scores, [-1, 1]))

  word_ids = flat_indices % num_symbols  # Which word in vocabulary.
  parent_ids = flat_indices // num_symbols  # Which hypothesis it came from.

  beam_symbols.append(word_ids)
  beam_path.append(parent_ids)
  log_beam_probs.append(top_scores)

  # Note that gradients will not propagate through the second parameter of
  # embedding_lookup.
  embedded = embedding_ops.embedding_lookup(embedding, word_ids)
  embedded = tf.reshape(embedded, [beam_size, embedding_size])
  if update_embedding:
    return embedded
  return array_ops.stop_gradient(embedded)
def extract_argmax_and_embed(prev, _):
  """Loop_function that extracts the symbol from prev and embeds it.

  Args:
    prev: Previous output; projected with `output_projection` when one is
      set.
    _: Unused step index.

  Returns:
    The embedding of the gradient-stopped argmax symbol along axis 1.
  """
  logits = prev
  if output_projection is not None:
    logits = nn_ops.xw_plus_b(
        logits, output_projection[0], output_projection[1])
  best_ids = math_ops.argmax(logits, 1)
  fixed_ids = array_ops.stop_gradient(best_ids)
  return embedding_ops.embedding_lookup(embedding, fixed_ids)
def softmax_cross_entropy(
    onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.

  `weights` acts as a coefficient for the loss. If a scalar is provided,
  then the loss is simply scaled by the given value. If `weights` is a
  tensor of shape `[batch_size]`, then the loss weights apply to each
  corresponding sample.

  If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes:
      new_onehot_labels = onehot_labels * (1 - label_smoothing)
                          + label_smoothing / num_classes

  Args:
    onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
    logits: `[batch_size, num_classes]` logits outputs of the network.
    weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
      broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
    label_smoothing: If greater than 0 then smooth the labels.
    scope: the scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
    `NONE`, this has shape `[batch_size]`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `logits` doesn't match that of
      `onehot_labels` or if the shape of `weights` is invalid or if `weights`
      is None.  Also if `onehot_labels` or `logits` is None.
  """
  if onehot_labels is None:
    raise ValueError("onehot_labels must not be None.")
  if logits is None:
    raise ValueError("logits must not be None.")

  with ops.name_scope(scope, "softmax_cross_entropy_loss",
                      (logits, onehot_labels, weights)) as loss_scope:
    logits_t = ops.convert_to_tensor(logits)
    labels_t = math_ops.cast(onehot_labels, logits_t.dtype)
    logits_t.get_shape().assert_is_compatible_with(labels_t.get_shape())

    if label_smoothing > 0:
      num_classes = math_ops.cast(
          array_ops.shape(labels_t)[1], logits_t.dtype)
      kept_mass = 1.0 - label_smoothing
      spread_mass = label_smoothing / num_classes
      labels_t = labels_t * kept_mass + spread_mass

    # Labels are targets; no gradient should flow into them.
    labels_t = array_ops.stop_gradient(
        labels_t, name="labels_stop_gradient")
    per_example = nn.softmax_cross_entropy_with_logits_v2(
        labels=labels_t, logits=logits_t, name="xentropy")
    return compute_weighted_loss(
        per_example, weights, loss_scope, loss_collection,
        reduction=reduction)
def _create_value(self):
  """Build and return the value Tensor selected by `self._value_type`.

  * `MeanValue`: the distribution mean.
  * `SampleValue`: `n` samples.
  * `SampleAndReshapeValue`: `n` samples with the first two dimensions merged
    into one (for `n == 1` the leading dimension is squeezed out instead).

  Returns:
    The value `Tensor`. It is wrapped in `array_ops.stop_gradient` when the
    value type requests it, or when it is a sample from a distribution that
    is not a reparameterized `ContinuousDistribution`.

  Raises:
    TypeError: if `self._value_type` is not one of the recognized types.
  """
  if isinstance(self._value_type, MeanValue):
    value_tensor = self._dist.mean()
  elif isinstance(self._value_type, SampleValue):
    value_tensor = self._dist.sample(self._value_type.n)
  elif isinstance(self._value_type, SampleAndReshapeValue):
    if self._value_type.n == 1:
      value_tensor = array_ops.squeeze(self._dist.sample(1), [0])
    else:
      samples = self._dist.sample(self._value_type.n)
      samples_shape = array_ops.shape(samples)
      samples_static_shape = samples.get_shape()
      # Merge the sample dimension into the batch dimension.
      new_batch_size = samples_shape[0] * samples_shape[1]
      value_tensor = array_ops.reshape(
          samples, array_ops.concat(0, ([new_batch_size], samples_shape[2:])))
      if samples_static_shape.ndims is not None:
        # Update the static shape for shape inference purposes.
        shape_list = samples_static_shape.as_list()
        new_shape = tensor_shape.vector(
            shape_list[0] * shape_list[1]
            if shape_list[0] is not None and shape_list[1] is not None
            else None)
        new_shape = new_shape.concatenate(samples_static_shape[2:])
        value_tensor.set_shape(new_shape)
  else:
    # Format the offending value into the message; passing it as a second
    # exception argument would leave the "%s" unexpanded.
    raise TypeError(
        "Unrecognized Distribution Value Type: %s" % (self._value_type,))

  stop_gradient = self._value_type.stop_gradient

  if stop_gradient:
    # stop_gradient is being enforced by the value type.
    return array_ops.stop_gradient(value_tensor)

  if isinstance(self._value_type, MeanValue):
    return value_tensor  # Using pathwise-derivative for this one.

  if (isinstance(self._dist, distributions.ContinuousDistribution)
      and self._dist.is_reparameterized):
    return value_tensor  # Using pathwise-derivative for this one.
  else:
    # Some variant of score function estimation will be needed. Stop the
    # gradient on the sampler so none leaks through it by accident.
    return array_ops.stop_gradient(value_tensor)
def _rev_layer_backward(ys, grad_ys, f, g, f_vars, f_side_input, g_vars,
                        g_side_input):
  """Backprop for 1 layer.

  The inputs are reconstructed from the outputs as `x2 = y2 - g(y1)` and
  `x1 = y1 - f(x2)`, and the gradients are computed from those recomputed
  activations.

  Args:
    ys: Pair `(y1, y2)` of layer outputs.
    grad_ys: Pair `(grad_y1, grad_y2)` of gradients wrt the outputs.
    f: Callable applied to `x2` (and to `f_side_input` when it is non-empty).
    g: Callable applied to `y1` (and to `g_side_input` when it is non-empty).
    f_vars: Variables of `f`; concatenated with `f_side_input` using `+`, so
      presumably a list -- confirm against the caller.
    f_side_input: Iterable of side-input tensors for `f` (may be empty).
    g_vars: Variables of `g`; same remark as for `f_vars`.
    g_side_input: Iterable of side-input tensors for `g` (may be empty).

  Returns:
    A nested tuple
    `((x1, x2), (grad_x1, grad_x2), (grad_f_vars, grad_f_side),
    (grad_g_vars, grad_g_side))` whose tensors have all passed through
    `control_flow_ops.tuple`.
  """
  y1, y2 = ys
  grad_y1, grad_y2 = grad_ys

  # Reconstruct intermediates and inputs (x1, x2)
  # stop_gradients required on fn inputs to prevent infinite recursion into this
  # grad function on the calls to gradients.
  y1_stop = array_ops.stop_gradient(y1)
  g_side_input = [array_ops.stop_gradient(t) for t in g_side_input]
  # g is only handed the side inputs when there are any.
  gy1 = g(y1_stop, g_side_input) if g_side_input else g(y1_stop)

  x2 = y2 - gy1
  x2_stop = array_ops.stop_gradient(x2)
  f_side_input = [array_ops.stop_gradient(t) for t in f_side_input]
  # Same convention as for g: side inputs are passed only when non-empty.
  fx2 = f(x2_stop, f_side_input) if f_side_input else f(x2_stop)

  x1 = y1 - fx2

  # Compute gradients wrt to inputs
  # dL/dy2 * dG(y1)/y1
  grad_gy1_y2 = gradients_impl.gradients(gy1, y1_stop, grad_y2)[0]
  grad_x1 = grad_y1 + grad_gy1_y2
  # x2 influences the loss directly through y2 and through f(x2), which feeds
  # y1 and, via g(y1), y2 again.
  grad_x2 = (
      gradients_impl.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 +
      gradients_impl.gradients(fx2, x2_stop, grad_gy1_y2)[0])

  # Compute gradients wrt to vars and side inputs in f and g
  # Each gradients() call differentiates wrt vars followed by side inputs, so
  # the result is split at len(vars).
  grads1 = gradients_impl.gradients(gy1, g_vars + g_side_input, grad_y2)
  grad_g_vars, grad_g_side = grads1[:len(g_vars)], grads1[len(g_vars):]
  grads2 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_y1)
  grad_f_y1, grad_f_side1 = grads2[:len(f_vars)], grads2[len(f_vars):]
  grads3 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_gy1_y2)
  grad_f_y2, grad_f_side2 = grads3[:len(f_vars)], grads3[len(f_vars):]
  # f receives gradient along two paths; the two contributions are combined.
  grad_f_vars = _acc_grads(grad_f_y1, grad_f_y2)
  grad_f_side = _acc_grads(grad_f_side1, grad_f_side2)

  # Put returns in a tuple to ensure a constant memory budget (i.e. don't want
  # the subsequent layer to start computing and consuming memory based on a
  # subset of these values).
  outputs = ((x1, x2), (grad_x1, grad_x2), (grad_f_vars, grad_f_side),
             (grad_g_vars, grad_g_side))
  tupled = control_flow_ops.tuple(nest.flatten(outputs))
  return nest.pack_sequence_as(outputs, tupled)
def evaluate(self):
  """Evaluate the loss function on the targets.

  Returns:
    The result of `self._evaluate` applied to the gradient-stopped targets.

  Raises:
    Exception: if `self.targets` is None.
  """
  if self.targets is None:
    raise Exception("Cannot evaluate losses with unspecified targets.")
  # The targets are treated as constants; only the inputs are back-propped
  # through.
  fixed_targets = array_ops.stop_gradient(self.targets)
  return self._evaluate(fixed_targets)
def rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                scope=None):
  """RNN decoder for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    cell: RNNCell defining the cell function and size.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for
      decoding, but also for training to emulate
      http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    scope: VariableScope for the created subgraph; defaults to "rnn_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x cell.output_size] containing generated outputs.
    states: A list holding `initial_state` followed by the state after each
      time-step, so its length is len(decoder_inputs) + 1. Each item is a
      2D Tensor of shape [batch_size x cell.state_size].
      (Note that in some cases, like basic RNN cell or GRU cell, outputs and
       states can be the same. They are different for LSTM cells though.)
  """
  with vs.variable_scope(scope or "rnn_decoder"):
    states = [initial_state]
    outputs = []
    prev = None
    feed_previous = loop_function is not None
    # enumerate() gives the step index and input together and avoids relying
    # on the Python-2-only `xrange`.
    for step, step_input in enumerate(decoder_inputs):
      if feed_previous and prev is not None:
        with vs.variable_scope("loop_function", reuse=True):
          # We do not propagate gradients over the loop function.
          step_input = array_ops.stop_gradient(loop_function(prev, step))
      if step > 0:
        # Every step after the first shares the cell's variables.
        vs.get_variable_scope().reuse_variables()
      output, new_state = cell(step_input, states[-1])
      outputs.append(output)
      states.append(new_state)
      if feed_previous:
        prev = array_ops.stop_gradient(output)
  return outputs, states
def loop_function(prev, _):
  """Embed the argmax symbol of the previous output.

  Args:
    prev: Previous output; projected with `output_projection` when one is
      set.
    _: Unused step index.

  Returns:
    The embedding of the argmax symbol along axis 1, gradient-stopped unless
    `update_embedding` is set.
  """
  logits = prev
  if output_projection is not None:
    logits = nn_ops.xw_plus_b(
        logits, output_projection[0], output_projection[1])
  best_ids = math_ops.argmax(logits, 1)
  # Note that gradients will not propagate through the second parameter of
  # embedding_lookup.
  embedded = embedding_ops.embedding_lookup(embedding, best_ids)
  if update_embedding:
    return embedded
  return array_ops.stop_gradient(embedded)
def surrogate_losses(sample_losses, name="SurrogateLosses"):
  """Compute surrogate losses for StochasticTensors in the graph.

  Calls `surrogate_loss` on each `StochasticTensor` found upstream of
  `sample_losses`, passing it the (gradient-stopped) losses that that
  `StochasticTensor` influenced.

  Note that currently `surrogate_losses` does not work with
  `StochasticTensor`s instantiated in `while_loop`s or other control
  structures.

  Args:
    sample_losses: a list or tuple of final losses. Each loss should be per
      example in the batch (and possibly per sample); that is, it should have
      dimensionality of 1 or greater. All losses should have the same shape.
    name: the name with which to prepend created ops.

  Returns:
    A list of surrogate losses; empty if no stochastic dependencies were
    found.

  Raises:
    TypeError: if `sample_losses` is not a list or tuple, or if its elements
      are not `Tensor`s.
    ValueError: if any loss in `sample_losses` does not have dimensionality 1
      or greater.
  """
  with ops.op_scope(sample_losses, name):
    if not isinstance(sample_losses, (list, tuple)):
      raise TypeError("sample_losses must be a list or tuple")

    detached_losses = []
    for loss in sample_losses:
      if not isinstance(loss, ops.Tensor):
        raise TypeError("loss is not a Tensor: %s" % loss)
      rank = loss.get_shape().ndims
      if rank is None or rank < 1:
        raise ValueError(
            "loss must have dimensionality 1 or greater: %s" % loss)
      detached_losses.append(array_ops.stop_gradient(loss))

    dependencies = _stochastic_dependencies_map(detached_losses)
    if not dependencies:
      logging.warn(
          "No collection of Stochastic Tensors found for current graph.")
      return []

    # Walk every stochastic dependency and keep the terms that exist.
    collected = []
    for node, influenced_losses in dependencies.items():
      node_term = node.surrogate_loss(list(influenced_losses))
      if node_term is None:
        continue
      with ops.name_scope("SurrogateLoss_%s" % node.name):
        collected.append(array_ops.identity(node_term))
    return collected
def _run_test(self, x_, use_deferred_shape=False, **kwargs):
  """Check that `fill_triangular_inverse` undoes `fill_triangular`.

  Args:
    x_: Array-like input; converted with `np.asarray`.
    use_deferred_shape: If true, the placeholder is created without a static
      shape, and the round-tripped result is expected to have no static shape.
    **kwargs: Forwarded to both `du.fill_triangular` and
      `du.fill_triangular_inverse`.
  """
  x_ = np.asarray(x_)
  with self.cached_session() as sess:
    if use_deferred_shape:
      placeholder_shape = None
    else:
      placeholder_shape = x_.shape
    x_pl = array_ops.placeholder_with_default(x_, shape=placeholder_shape)

    # A term that evaluates to zero, so the fed value is unchanged.
    detached_shift = array_ops.stop_gradient(x_pl - 1.)
    scaled = x_pl * detached_shift
    detached_product = array_ops.stop_gradient(x_pl * (x_pl - 1.))
    zero_term = scaled - detached_product
    x = x_pl + zero_term

    filled = du.fill_triangular(x, **kwargs)
    round_trip = du.fill_triangular_inverse(filled, **kwargs)
    round_trip_ = sess.run(round_trip, feed_dict={x_pl: x_})

  if not use_deferred_shape:
    self.assertAllEqual(x_.shape, round_trip.shape)
  else:
    self.assertEqual(None, round_trip.shape)
  self.assertAllEqual(x_, round_trip_)
def testScanGradientWithPartStopGradient(self):
  """Gradients of a scan can be built and run when one output is stopped."""
  a = variables.Variable(0.0, name="a")
  b = variables.Variable(0.0, name="b")

  def _emit_variables(elem_, input_):
    # The scan body ignores its arguments and returns the two variables.
    return (a, b)

  scanned_a, scanned_b = functional_ops.scan(
      _emit_variables, array_ops.zeros(5), initializer=(0., 0.))
  # Only the first scan output contributes gradient.
  loss = scanned_a + array_ops.stop_gradient(scanned_b)
  grad = gradients_impl.gradients(ys=[loss], xs=[a, b])
  with self.test_session(use_gpu=True):
    self.evaluate(variables.global_variables_initializer())
    self.evaluate(grad)
def moments(x, axes, shift=None, name=None, keep_dims=False):
  """Calculate the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
  and variance of a vector.

  Note: for numerical stability, when shift=None, the true mean
  would be computed and used as shift.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters
     with shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` in which case the true mean of the data
      is used as shift. A shift close to the true mean provides the most
      numerically stable results.
    name: Name used to scope the operations that compute the moments.
    keep_dims: produce moments with the same dimensionality as the input.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.name_scope(name, "moments", [x, axes, shift]):
    # float16 has too little dynamic range to collect sufficient statistics,
    # so half-precision input is processed in float32 and the results are
    # cast back at the end.
    is_half = x.dtype == dtypes.float16
    y = math_ops.cast(x, dtypes.float32) if is_half else x

    if shift is not None:
      shift = math_ops.cast(shift, y.dtype)
    else:
      # Use the true mean as shift, keeping dims for proper broadcasting.
      shift = array_ops.stop_gradient(
          math_ops.reduce_mean(y, axes, keep_dims=True))

    counts, mean_ss, var_ss, shift = sufficient_statistics(
        y, axes, shift=shift, keep_dims=keep_dims, name=name)
    # Reshape shift as needed.
    shift = array_ops.reshape(shift, array_ops.shape(mean_ss))
    shift.set_shape(mean_ss.get_shape())

    with ops.control_dependencies([counts, mean_ss, var_ss]):
      mean, variance = normalize_moments(
          counts, mean_ss, var_ss, shift, name=name)
      if not is_half:
        return (mean, variance)
      return (math_ops.cast(mean, dtypes.float16),
              math_ops.cast(variance, dtypes.float16))
def loop_function(prev, _):
  """Embed a symbol drawn by `batch_sample_with_temperature`.

  Args:
    prev: Previous output; projected with `output_projection` when one is
      set.
    _: Unused step index.

  Returns:
    The embedding of the drawn symbol, gradient-stopped unless
    `update_embedding` is set.
  """
  logits = prev
  if output_projection is not None:
    logits = nn_ops.xw_plus_b(
        logits, output_projection[0], output_projection[1])
  sampled_ids = batch_sample_with_temperature(logits)
  embedded = embedding_ops.embedding_lookup(embedding, sampled_ids)
  if update_embedding:
    return embedded
  return array_ops.stop_gradient(embedded)
def _hessian_vector_product(ys, xs, v):
  """Multiply the Hessian of `ys` wrt `xs` by `v`.

  This is an efficient construction that uses a backprop-like approach
  to compute the product between the Hessian and another vector. The
  Hessian is usually too large to be explicitly computed or even
  represented, but this method allows us to at least multiply by it
  for the same big-O cost as backprop.

  Implicit Hessian-vector products are the main practical, scalable way
  of using second derivatives with neural networks. They allow us to
  do things like construct Krylov subspaces and approximate conjugate
  gradient descent.

  Example: if `y` = 1/2 `x`^T A `x`, the Hessian is 1/2 (A + A.T), so
  `hessian_vector_product(y, x, v)` evaluates to 1/2 (A + A.T) `v`.

  Args:
    ys: A scalar value, or a tensor or list of tensors to be summed to
        yield a scalar.
    xs: A list of tensors that we should construct the Hessian over.
    v: A list of tensors, with the same shapes as xs, that we want to
       multiply by the Hessian.

  Returns:
    The result of `gradients(elemwise_products, xs)`: the product between
    the Hessian and `v`, presumably one tensor per entry of `xs`.

  Raises:
    ValueError: `xs` and `v` have different length.
  """
  # Validate the input.
  num_xs = len(xs)
  if len(v) != num_xs:
    raise ValueError("xs and v must have the same length.")

  # First backprop.
  first_grads = gradients(ys, xs)
  assert len(first_grads) == num_xs

  # Weight each available gradient by its (constant) vector entry; `xs`
  # entries with no gradient are skipped.
  weighted = []
  for grad_elem, v_elem in zip(first_grads, v):
    if grad_elem is None:
      continue
    weighted.append(
        math_ops.multiply(grad_elem, array_ops.stop_gradient(v_elem)))

  # Second backprop.
  return gradients(weighted, xs)
def evaluate_on_sample(self, seed=None):
    """Evaluates the log probability on a random sample.

    Args:
      seed: int or None. Random seed for this draw from the distribution.
        When None, `self._default_seed` is used.

    Returns:
      Log probability of sampled targets, summed across examples.
    """
    draw_seed = self._default_seed if seed is None else seed
    # The sampled targets are treated as constants: only the inputs receive
    # gradients.
    targets = array_ops.stop_gradient(self.sample(draw_seed))
    return self._evaluate(targets)
def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
    """Returns the gradients for the 3 inputs of BatchNorm.

    Args:
      grad_y: A `Tensor` of 4 dimensions for gradient for y.
      x: A `Tensor` of 4 dimensions for x.
      scale: A `Tensor` of 1 dimension for scaling.
      epsilon: A small float number added to the variance of x.
      data_format: The data format for input. Either b"NHWC" or b"NCHW".

    Returns:
      A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
      for x, grad_scale the gradient for scale, and grad_offset the gradient
      for offset.
    """
    channels_last = data_format == b"NHWC"
    keep_dims = not channels_last
    reduce_axis = [0, 1, 2] if channels_last else [0, 2, 3]
    if not channels_last:
        # Channels sit on axis 1, so scale must be reshaped to broadcast.
        scale = array_ops.reshape(scale, [1, array_ops.size(scale), 1, 1])

    mean_grad_y = math_ops.reduce_mean(
        grad_y, reduce_axis, keep_dims=keep_dims)
    mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
    squared_dev = math_ops.squared_difference(
        x, array_ops.stop_gradient(mean_x))
    var_x = math_ops.reduce_mean(squared_dev, reduce_axis, keep_dims=keep_dims)

    grad_y_offset = grad_y - mean_grad_y
    x_offset = x - mean_x
    mean = math_ops.reduce_mean(
        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)

    # grad_x = scale / std * (grad_y_offset - mean * x_offset / var)
    scaled_inv_std = scale * math_ops.rsqrt(var_x + epsilon)
    correction = math_ops.reciprocal(var_x + epsilon) * mean * x_offset
    grad_x = scaled_inv_std * (grad_y_offset - correction)

    inv_std = math_ops.rsqrt(var_x + epsilon)
    grad_scale = inv_std * math_ops.reduce_sum(
        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
    if data_format == b"NCHW":
        grad_scale = array_ops.squeeze(grad_scale)

    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    return grad_x, grad_scale, grad_offset
def _force_data_dependency(first_compute, then_compute):
    """Force all of `then_compute` to depend on all of `first_compute`.

    Uses a dummy data dependency, which is useful when running on TPUs
    because XLA ignores control dependencies. Only supports float arguments.

    Args:
      first_compute: `list<Tensor>`. These will be made to run before the
        `Tensor`s `then_compute`. `None` entries are skipped.
      then_compute: `list<Tensor>`. These will run after all the `Tensor`s in
        `first_compute`. `None` entries are passed through as `None`.

    Returns:
      `list<Tensor>`, same length as `then_compute`.

    Raises:
      ValueError: if ranks are unknown or types are not floating.
    """

    def _first_element(x):
        # Slice out element [0, 0, ..., 0] and reshape it to a scalar.
        rank = x.get_shape().ndims
        if rank is None:
            raise ValueError("Rank of Tensor %s must be known" % x)
        begin = framework_ops.convert_to_tensor([0] * rank, dtype=dtypes.int32)
        size = framework_ops.convert_to_tensor([1] * rank, dtype=dtypes.int32)
        return array_ops.reshape(array_ops.slice(x, begin, size), [])

    leading_scalars = []
    for tensor in first_compute:
        if tensor is not None:
            leading_scalars.append(_first_element(tensor))
    first_compute_sum = math_ops.add_n(leading_scalars)

    dtype = first_compute_sum.dtype
    if not dtype.is_floating:
        raise ValueError("_force_data_dependency only supports floating dtypes.")

    # Scale the sum by the dtype's `tiny` value and cut its gradient, then add
    # it to every output so each one has a data edge back to `first_compute`.
    epsilon = np.finfo(dtype.as_numpy_dtype).tiny
    zero = array_ops.stop_gradient(epsilon * first_compute_sum)

    dependent = []
    for tensor in then_compute:
        if tensor is None:
            dependent.append(None)
        else:
            dependent.append(array_ops.identity(tensor) + zero)
    return dependent
def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
    """Clips input while leaving gradient unaltered.

    The result is `x + stop_gradient(clip(x) - x)`: its value is the clipped
    `x`, while the only differentiable path is the identity through `x`.
    """
    scope_values = [x, clip_value_min, clip_value_max]
    with ops.name_scope(name, "clip_by_value_preserve_grad", scope_values):
        clipped = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
        correction = array_ops.stop_gradient(clipped - x)
        return x + correction
def testStopGradient(self):
    """Checks that the gradient through `stop_gradient(argmax(x))` is None."""
    grad_fn = backprop.gradients_function(
        lambda x: array_ops.stop_gradient(math_ops.argmax(x)))
    first_grad = grad_fn([0.0])[0]
    self.assertAllEqual(first_grad, None)
def call(self, inputs, training=None):
    """Apply batch normalization to `inputs`.

    Args:
      inputs: Input tensor.
      training: Python boolean, boolean tensor, or None. When None, the value
        of `K.learning_phase()` is used.

    Returns:
      The normalized tensor. Its static shape is set to the shape of the
      (possibly virtually-batched) input, and it is reshaped back to the
      original layout when `self.virtual_batch_size` is set.
    """
    if training is None:
        training = K.learning_phase()

    if self.virtual_batch_size is not None:
        # Virtual batches (aka ghost batches) can be simulated by reshaping the
        # Tensor and reusing the existing batch norm implementation
        original_shape = [-1] + inputs.shape.as_list()[1:]
        expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

        # Will cause errors if virtual_batch_size does not divide the batch size
        inputs = array_ops.reshape(inputs, expanded_shape)

        def undo_virtual_batching(outputs):
            outputs = array_ops.reshape(outputs, original_shape)
            return outputs

    if self.fused:
        outputs = self._fused_batch_norm(inputs, training=training)
        if self.virtual_batch_size is not None:
            # Currently never reaches here since fused_batch_norm does not support
            # virtual batching
            outputs = undo_virtual_batching(outputs)
        return outputs

    # Compute the axes along which to reduce the mean / variance
    input_shape = inputs.shape
    ndims = len(input_shape)
    reduction_axes = [i for i in range(ndims) if i not in self.axis]
    if self.virtual_batch_size is not None:
        del reduction_axes[1]  # Do not reduce along virtual batch dim

    # Broadcasting only necessary for single-axis batch norm where the axis is
    # not the last dimension
    broadcast_shape = [1] * ndims
    broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value

    def _broadcast(v):
        # Reshape `v` to `broadcast_shape` unless it is None, already has
        # `ndims` dimensions, or the reduction covers every axis but the last.
        if (v is not None and
                len(v.shape) != ndims and
                reduction_axes != list(range(ndims - 1))):
            return array_ops.reshape(v, broadcast_shape)
        return v

    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

    def _compose_transforms(scale, offset, then_scale, then_offset):
        # Fold (then_scale, then_offset) into (scale, offset) so that the
        # result applies the first transform followed by the second.
        if then_scale is not None:
            scale *= then_scale
            offset *= then_scale
        if then_offset is not None:
            offset += then_offset
        return (scale, offset)

    # Determine a boolean value for `training`: could be True, False, or None.
    training_value = tf_utils.constant_value(training)
    if training_value is not False:
        if self.adjustment:
            adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
            # Adjust only during training.
            adj_scale = tf_utils.smart_cond(
                training,
                lambda: adj_scale,
                lambda: array_ops.ones_like(adj_scale))
            adj_bias = tf_utils.smart_cond(
                training,
                lambda: adj_bias,
                lambda: array_ops.zeros_like(adj_bias))
            scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

        # Some of the computations here are not necessary when training==False
        # but not a constant. However, this makes the code simpler.
        keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
        mean, variance = self._moments(
            math_ops.cast(inputs, self._param_dtype),
            reduction_axes,
            keep_dims=keep_dims)

        moving_mean = self.moving_mean
        moving_variance = self.moving_variance

        # Use batch statistics when training, moving statistics otherwise.
        mean = tf_utils.smart_cond(
            training,
            lambda: mean,
            lambda: ops.convert_to_tensor(moving_mean))
        variance = tf_utils.smart_cond(
            training,
            lambda: variance,
            lambda: ops.convert_to_tensor(moving_variance))

        if self.virtual_batch_size is not None:
            # This isn't strictly correct since in ghost batch norm, you are
            # supposed to sequentially update the moving_mean and moving_variance
            # with each sub-batch. However, since the moving statistics are only
            # used during evaluation, it is more efficient to just update in one
            # step and should not make a significant difference in the result.
            new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
            new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True)
        else:
            new_mean, new_variance = mean, variance

        if self.renorm:
            r, d, new_mean, new_variance = self._renorm_correction_and_moments(
                new_mean, new_variance, training)
            # When training, the normalized values (say, x) will be transformed as
            # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
            # = x * (r * gamma) + (d * gamma + beta) with renorm.
            r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
            d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
            scale, offset = _compose_transforms(r, d, scale, offset)

        if distribution_strategy_context.in_cross_replica_context():
            strategy = distribution_strategy_context.get_strategy()

            def _do_update(var, value):
                """Compute the updates for mean and variance."""
                return strategy.extended.update(
                    var,
                    self._assign_moving_average, (value, self.momentum),
                    group=False)

            # We need to unwrap the moving_mean or moving_variance in the case of
            # training being false to match the output of true_fn and false_fn
            # in the smart cond.
            def mean_update():
                true_branch = lambda: _do_update(self.moving_mean, new_mean)
                false_branch = lambda: strategy.unwrap(self.moving_mean)
                return tf_utils.smart_cond(training, true_branch, false_branch)

            def variance_update():
                return tf_utils.smart_cond(
                    training,
                    lambda: _do_update(self.moving_variance, new_variance),
                    lambda: strategy.unwrap(self.moving_variance))
        else:

            def _do_update(var, value):
                """Compute the updates for mean and variance."""
                return self._assign_moving_average(var, value, self.momentum)

            def mean_update():
                true_branch = lambda: _do_update(self.moving_mean, new_mean)
                false_branch = lambda: self.moving_mean
                return tf_utils.smart_cond(training, true_branch, false_branch)

            def variance_update():
                true_branch = lambda: _do_update(self.moving_variance, new_variance)
                false_branch = lambda: self.moving_variance
                return tf_utils.smart_cond(training, true_branch, false_branch)

        # The update callables (not their results) are registered on the layer.
        self.add_update(mean_update, inputs=True)
        self.add_update(variance_update, inputs=True)

    else:
        mean, variance = self.moving_mean, self.moving_variance

    # Cast statistics and parameters to the input dtype before normalizing.
    mean = math_ops.cast(mean, inputs.dtype)
    variance = math_ops.cast(variance, inputs.dtype)
    if offset is not None:
        offset = math_ops.cast(offset, inputs.dtype)
    if scale is not None:
        scale = math_ops.cast(scale, inputs.dtype)
    # TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing
    # math in float16 hurts validation accuracy of popular models like resnet.
    outputs = nn.batch_normalization(inputs,
                                     _broadcast(mean),
                                     _broadcast(variance),
                                     offset,
                                     scale,
                                     self.epsilon)
    # If some components of the shape got lost due to adjustments, fix that.
    outputs.set_shape(input_shape)

    if self.virtual_batch_size is not None:
        outputs = undo_virtual_batching(outputs)

    return outputs
def loop_function(i, prev, state, log_beam_probs, beam_path, beam_symbols,
                  path_lengthes, is_finished_beam):
    """Run one beam-search step and produce the next decoder input and state.

    Relies on `output_projection`, `beam_size`, `batch_size`, `num_symbols`,
    `embedding`, `embedding_size`, `update_embedding`, `EOS_ID` and the
    helpers `tile_from_beam_to_vocab` / `divide_index_by_batch` from the
    enclosing scope.

    Args:
      i: Step index; `i == 1` is handled as the first step and initialises
        the per-beam bookkeeping.
      prev: Previous decoder output; projected with `output_projection`.
      state: Previous decoder state.
      log_beam_probs: List; the selected top-k scores are appended to it.
      beam_path: List; the parent beam index of each selection is appended.
      beam_symbols: List; the selected symbol ids are appended.
      path_lengthes: Per-beam path lengths (replaced on the first step).
      is_finished_beam: Per-beam finished flags (replaced on the first step).

    Returns:
      A tuple `(emb_prev, beam_state, path_lengthes, is_finished_beam)`.
    """
    output_size = prev.get_shape().as_list()[-1]
    state_size = state.get_shape().as_list()[-1]

    if i == 1:
        # TODO: branch `state` as well, not only `prev`.
        probs = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
        probs = tf.log(tf.nn.softmax(probs))
        best_probs, indices = tf.nn.top_k(probs, beam_size)

        # initialize length and EOS flags for each beam.
        path_lengthes = tf.fill([batch_size, beam_size], 1.0)
        is_finished_beam = tf.fill([batch_size, beam_size], False)

        # expand previous states to beams. (e.g. batch_size=beam_size=2: [a, b] -> [a, a, b, b])
        prev = tf.gather(
            prev,
            tf.tile(tf.expand_dims(tf.range(batch_size), dim=1), [1, beam_size]))
        # prev: [batch, beam, hidden] -> [batch * beam, hidden]
        prev = tf.reshape(prev, [-1, output_size])
        state = tf.gather(
            state,
            tf.tile(tf.expand_dims(tf.range(batch_size), dim=1), [1, beam_size]))
        state = tf.reshape(state, [-1, state_size])
    else:
        probs = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
        probs = tf.log(tf.nn.softmax(probs))
        probs = tf.reshape(probs, [-1, beam_size * num_symbols])
        # divide probs by the length of each beam (length penalty) and select top-k.
        pl = tf.reshape(tile_from_beam_to_vocab(path_lengthes),
                        [-1, beam_size * num_symbols])
        best_probs, indices = tf.nn.top_k(probs / pl, beam_size)

    # Split each selected index into the symbol id and the index of the beam
    # it extends.
    symbols = indices % num_symbols
    beam_parent = indices // num_symbols

    # Record this step in the caller-owned lists.
    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # A beam is finished if its parent was finished or it just emitted EOS.
    is_finished_beam = tf.logical_or(
        tf.gather_nd(is_finished_beam, divide_index_by_batch(beam_parent)),
        tf.equal(symbols, tf.constant(EOS_ID)))
    # Inherit the parent's length and add one for beams still running.
    path_lengthes = tf.gather_nd(path_lengthes, divide_index_by_batch(beam_parent))
    path_lengthes += tf.to_float(tf.logical_not(is_finished_beam))

    # Pick the state of each selected parent beam.
    beam_state = tf.gather_nd(
        tf.reshape(state, [batch_size, beam_size, state_size]),
        divide_index_by_batch(beam_parent))
    emb_prev = embedding_ops.embedding_lookup(embedding, symbols)

    # [batch, beam, embedding] -> [batch * beam, embedding]
    beam_state = tf.reshape(beam_state, [-1, state_size])
    emb_prev = tf.reshape(emb_prev, [-1, embedding_size])
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev, beam_state, path_lengthes, is_finished_beam
def score_function_with_advantage(dist_tensor, value, loss):
    """Return `log_prob(value)` scaled by a gradient-stopped advantage.

    The advantage comes from the enclosing scope's `advantage_fn`, called
    with `dist_tensor` and `loss`; its gradient is blocked so that only the
    log-probability term is differentiated.
    """
    with ops.name_scope(name, values=[value, loss]):
        fixed_advantage = array_ops.stop_gradient(
            advantage_fn(dist_tensor, loss))
        log_prob = dist_tensor.distribution.log_prob(value)
        return log_prob * fixed_advantage
def __init__(self, **kwargs):
    """Build the model graph, session, and influence-related ops.

    Required keyword arguments: `batch_size`, `data_sets`, `model_name`,
    `num_classes`, `initial_learning_rate`.
    Optional keyword arguments: `train_dir` (default 'output'), `log_dir`
    (default 'log', accepted but unused here), `mini_batch` (default True),
    `damping` (default 0.0).

    Side effects: seeds NumPy and TensorFlow with 0, creates `train_dir` if
    missing, opens a `tf.Session`, registers it with Keras, and runs the
    global variable initializer.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    # --- Configuration ---------------------------------------------------
    self.batch_size = kwargs.pop('batch_size')
    self.data_sets = kwargs.pop('data_sets')
    self.train_dir = kwargs.pop('train_dir', 'output')
    kwargs.pop('log_dir', 'log')  # Accepted for compatibility; not used.
    self.model_name = kwargs.pop('model_name')
    self.num_classes = kwargs.pop('num_classes')
    self.initial_learning_rate = kwargs.pop('initial_learning_rate')
    # Support for a `keep_probs` option is currently disabled:
    # if 'keep_probs' in kwargs: self.keep_probs = kwargs.pop('keep_probs')
    # else: self.keep_probs = None
    self.mini_batch = kwargs.pop('mini_batch', True)
    self.damping = kwargs.pop('damping', 0.0)

    if not os.path.exists(self.train_dir):
        os.makedirs(self.train_dir)

    # --- Session -----------------------------------------------------------
    session_config = tf.ConfigProto()
    self.sess = tf.Session(config=session_config)
    K.set_session(self.sess)

    # --- Inputs --------------------------------------------------------------
    self.input_placeholder, self.labels_placeholder = self.placeholder_inputs()
    self.num_train_examples = self.data_sets.train.labels.shape[0]
    self.num_test_examples = self.data_sets.test.labels.shape[0]

    # --- Inference and training -------------------------------------------
    # The keep_probs / inference_needs_labels variants are disabled:
    # if self.keep_probs is not None:
    #     self.keep_probs_placeholder = tf.placeholder(tf.float32, shape=(2))
    #     self.logits = self.inference(self.input_placeholder, self.keep_probs_placeholder)
    # elif hasattr(self, 'inference_needs_labels'):
    #     self.logits = self.inference(self.input_placeholder, self.labels_placeholder)
    # else:
    self.logits = self.inference(self.input_placeholder)

    self.total_loss, self.loss_no_reg, self.indiv_loss_no_reg = self.loss(
        self.logits, self.labels_placeholder)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.learning_rate = tf.Variable(
        self.initial_learning_rate, name='learning_rate', trainable=False)
    # self.learning_rate_placeholder = tf.placeholder(tf.float32)
    # self.update_learning_rate_op = tf.assign(self.learning_rate, self.learning_rate_placeholder)

    # NOTE(review): `train_op` is assigned twice in a row, so the op built by
    # `get_train_op` is discarded and only the SGD op is kept. Confirm whether
    # the second assignment was meant to target a separate attribute.
    self.train_op = self.get_train_op(
        self.total_loss, self.global_step, self.learning_rate)
    self.train_op = self.get_train_sgd_op(
        self.total_loss, self.global_step, self.learning_rate)
    self.accuracy_op = self.get_accuracy_op(
        self.logits, self.labels_placeholder)
    self.preds = self.predictions(self.logits)

    # --- Misc ----------------------------------------------------------------
    self.saver = tf.train.Saver()

    # --- Gradients and Hessians ---------------------------------------------
    self.params = self.get_all_params()
    self.grad_total_loss_op = tf.gradients(self.total_loss, self.params)
    self.grad_loss_no_reg_op = tf.gradients(self.loss_no_reg, self.params)

    def _placeholders_like_params():
        # One float32 placeholder per parameter, with the parameter's shape.
        return [
            tf.placeholder(tf.float32, shape=param.get_shape())
            for param in self.params
        ]

    self.v_placeholder = _placeholders_like_params()
    self.u_placeholder = _placeholders_like_params()

    self.hessian_vector = hessian_vector_product(
        self.total_loss, self.params, self.v_placeholder)

    self.grad_loss_wrt_input_op = tf.gradients(
        self.total_loss, self.input_placeholder)

    # Because tf.gradients auto accumulates, we probably don't need the add_n
    # (or even reduce_sum)
    influence_terms = [
        tf.reduce_sum(tf.multiply(grad, array_ops.stop_gradient(vec)))
        for grad, vec in zip(self.grad_total_loss_op, self.v_placeholder)
    ]
    self.influence_op = tf.add_n(influence_terms)

    self.grad_influence_wrt_input_op = tf.gradients(
        self.influence_op, self.input_placeholder)

    # --- Checkpointing and feed dicts ---------------------------------------
    self.checkpoint_file = os.path.join(
        self.train_dir, "%s-checkpoint" % self.model_name)

    self.all_train_feed_dict = self.fill_feed_dict_with_all_ex(
        self.data_sets.train)
    self.all_test_feed_dict = self.fill_feed_dict_with_all_ex(
        self.data_sets.test)

    init_op = tf.global_variables_initializer()
    self.sess.run(init_op)

    self.vec_to_list = self.get_vec_to_list_fn()
def softmax_cross_entropy(
        onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
        loss_collection=ops.GraphKeys.LOSSES,
        reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
    """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits_v2.

    `weights` acts as a coefficient for the loss. A scalar simply scales the
    loss; a tensor of shape `[batch_size]` weights each sample individually.

    If `label_smoothing` is nonzero, the labels are smoothed towards
    1/num_classes:

        new_onehot_labels = onehot_labels * (1 - label_smoothing)
                            + label_smoothing / num_classes

    `onehot_labels` and `logits` must have the same shape, e.g.
    `[batch_size, num_classes]`. The shape of `weights` must be broadcastable
    to the loss, whose shape is decided by the shape of `logits`: for
    `[batch_size, num_classes]` logits the loss has shape `[batch_size]`.

    Args:
      onehot_labels: One-hot-encoded labels.
      logits: Logits outputs of the network.
      weights: Optional `Tensor` that is broadcastable to loss.
      label_smoothing: If greater than 0 then smooth the labels.
      scope: the scope for the operations performed in computing the loss.
      loss_collection: collection to which the loss will be added.
      reduction: Type of reduction to apply to loss.

    Returns:
      Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
      `NONE`, this has shape `[batch_size]`; otherwise, it is scalar.

    Raises:
      ValueError: If the shape of `logits` doesn't match that of
        `onehot_labels` or if the shape of `weights` is invalid or if
        `weights` is None. Also if `onehot_labels` or `logits` is None.

    @compatibility(eager)
    The `loss_collection` argument is ignored when executing eagerly. Consider
    holding on to the return value or collecting losses via a `tf.keras.Model`.
    @end_compatibility
    """
    if onehot_labels is None:
        raise ValueError("onehot_labels must not be None.")
    if logits is None:
        raise ValueError("logits must not be None.")

    scope_values = (logits, onehot_labels, weights)
    with ops.name_scope(
            scope, "softmax_cross_entropy_loss", scope_values) as scope:
        logits = ops.convert_to_tensor(logits)
        onehot_labels = math_ops.cast(onehot_labels, logits.dtype)
        logits.get_shape().assert_is_compatible_with(onehot_labels.get_shape())

        if label_smoothing > 0:
            num_classes = math_ops.cast(
                array_ops.shape(onehot_labels)[-1], logits.dtype)
            keep_weight = 1.0 - label_smoothing
            uniform_weight = label_smoothing / num_classes
            onehot_labels = onehot_labels * keep_weight + uniform_weight

        # Labels are targets only: no gradient flows into them.
        onehot_labels = array_ops.stop_gradient(
            onehot_labels, name="labels_stop_gradient")
        per_example_loss = nn.softmax_cross_entropy_with_logits_v2(
            labels=onehot_labels, logits=logits, name="xentropy")

        return compute_weighted_loss(
            per_example_loss, weights, scope, loss_collection,
            reduction=reduction)
def argmax(x):
    """Return `math_ops.argmax(x)` with its gradient stopped."""
    return array_ops.stop_gradient(math_ops.argmax(x))
def surrogate_loss(sample_losses,
                   stochastic_tensors=None,
                   name="SurrogateLoss"):
    """Surrogate loss for stochastic graphs.

    For every `StochasticTensor` upstream of `sample_losses`, its `loss`
    method is called with the (gradient-stopped) sum of the losses it
    influences, and any returned term is added to the total.

    Note that currently `surrogate_loss` does not work with
    `StochasticTensor`s instantiated in `while_loop`s or other control
    structures.

    Args:
      sample_losses: a list or tuple of final losses. Each loss should be per
        example in the batch (and possibly per sample); that is, it should
        have dimensionality of 1 or greater. All losses should have the same
        shape.
      stochastic_tensors: a list of `StochasticTensor`s to add loss terms for.
        If None, defaults to all `StochasticTensor`s in the graph upstream of
        the `Tensor`s in `sample_losses`.
      name: the name with which to prepend created ops.

    Returns:
      `Tensor` loss, which is the sum of `sample_losses` and the
      `loss_fn`s returned by the `StochasticTensor`s.

    Raises:
      TypeError: if `sample_losses` is not a list or tuple, or if its elements
        are not `Tensor`s.
      ValueError: if any loss in `sample_losses` does not have dimensionality
        1 or greater.
    """
    with ops.name_scope(name, values=sample_losses):
        if not isinstance(sample_losses, (list, tuple)):
            raise TypeError("sample_losses must be a list or tuple")
        for loss in sample_losses:
            if not isinstance(loss, ops.Tensor):
                raise TypeError("loss is not a Tensor: %s" % loss)
            ndims = loss.get_shape().ndims
            if ndims is None or ndims < 1:
                raise ValueError(
                    "loss must have dimensionality 1 or greater: %s" % loss)

        stoch_dependencies_map = _stochastic_dependencies_map(
            sample_losses, stochastic_tensors=stochastic_tensors)
        if not stoch_dependencies_map:
            logging.warn(
                "No collection of Stochastic Tensors found for current graph.")
            return math_ops.add_n(sample_losses)

        # Start from the sample losses and add one surrogate term per
        # stochastic dependency that provides one.
        loss_terms = [ops.convert_to_tensor(loss) for loss in sample_losses]
        for stoch_node, dependent_losses in stoch_dependencies_map.items():
            dependent_losses = list(dependent_losses)
            loss_names = ", ".join([loss.name for loss in dependent_losses])
            logging.info("Losses influenced by StochasticTensor %s: [%s]",
                         stoch_node.name, loss_names)

            # Sum up the downstream losses for this ST
            influenced_loss = _add_n_or_sum(dependent_losses)

            # Compute surrogate loss term
            loss_term = stoch_node.loss(
                array_ops.stop_gradient(influenced_loss))
            if loss_term is not None:
                loss_terms.append(loss_term)

        return _add_n_or_sum(loss_terms)
def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
                   resampling_temperature, partition_strategy):
    """A helper function for rank_sampled_softmax_loss.

    This computes, for each i in `sampled_values`,

        log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))

    where w_i, b_i are the weight and bias of the i-th class, respectively,
    and j ranges over the rows of `inputs`. For efficiency, the computation is
    rearranged to

        log(sum_j exp(w_i * (x_j / resampling_temperature))) +
            b_i / resampling_temperature.

    This translates to the following batched computation using tensorflow
    ops:

        reduce_logsumexp(matmul(embeddings,
                         transpose(inputs / resampling_temperature))) +
            biases / resampling_temperature

    The first term is computed inside the `transform_fn` passed to
    `embedding_ops._embedding_lookup_and_transform` so that it is colocated
    with the embeddings. The second term, not the bottleneck, is computed at
    the worker.

    Args:
      weights: From `rank_sampled_softmax_loss`.
      biases: From `rank_sampled_softmax_loss`.
      inputs: From `rank_sampled_softmax_loss`.
      sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler`
        function.
      num_resampled: An `int`. This many values are selected from
        `sampled_values` using the adaptive resampling algorithm. The caller
        must ensure that `num_resampled` is less than the size of
        `sampled_values`.
      resampling_temperature: A scalar `Tensor` with the temperature parameter
        for the adaptive resampling algorithm.
      partition_strategy: From `rank_sampled_softmax_loss`.

    Returns:
      A tuple of (`resampled_candidates`, `true_expected_count`,
      `resampled_expected_count`), similar to `sampled_values` but sampled
      down to `num_resampled` values.
    """
    # This code supports passing a Tensor for num_resampled, but since it is
    # only called with an int, that's what we specify in the arg list. If this
    # function is ever externalized, we should change the doc to support
    # Tensor.

    candidates, true_expected_count, sampled_expected_count = sampled_values
    # None of the sampler outputs should receive gradients.
    candidates = math_ops.cast(
        array_ops.stop_gradient(candidates), dtypes.int64)
    true_expected_count = array_ops.stop_gradient(true_expected_count)
    sampled_expected_count = array_ops.stop_gradient(sampled_expected_count)

    reweighted_inputs = inputs / resampling_temperature

    def logsumexp_logit(embeddings):
        products = math_ops.matmul(
            embeddings, reweighted_inputs, transpose_b=True)
        return math_ops.reduce_logsumexp(products, axis=1, keep_dims=False)

    # Calling this protected form of embedding_lookup allows co-locating
    # the logsumexp computation with the partitioned weights, which yields
    # a large speedup in practice.
    sampled_logits = embedding_ops._embedding_lookup_and_transform(  # pylint: disable=protected-access
        weights, candidates, partition_strategy, transform_fn=logsumexp_logit)
    candidate_biases = array_ops.reshape(
        embedding_ops.embedding_lookup(biases, candidates, partition_strategy),
        [-1])
    sampled_logits += candidate_biases / resampling_temperature

    # Keep the `num_resampled` candidates with the largest logits.
    _, top_indices = nn.top_k(sampled_logits, k=num_resampled, sorted=False)
    resampled = array_ops.gather(candidates, indices=top_indices)
    resampled_expected_count = array_ops.gather(
        sampled_expected_count, indices=top_indices)

    return resampled, true_expected_count, resampled_expected_count
def call(self, inputs, training=None, use_moving_statistics=True):
    """Apply batch normalization, with a choice of inference statistics.

    :param inputs: input features
    :param training: boolean or boolean Tensor (with shape []) which
      determines the current training phase
    :param use_moving_statistics: boolean or boolean Tensor (with shape [])
      which selects the statistics used when not training:
        * when training is True (or the Tensor evaluates to True), the mean
          and variance are computed from `inputs`;
        * when training is False and use_moving_statistics is True, the
          moving statistics (`self.moving_mean` / `self.moving_variance`,
          whose updates are registered through `self.add_update`) are used;
        * when training is False and use_moving_statistics is False, the raw
          statistics (`self.mean` / `self.variance`) are used. The op that
          advances them is added to the 'UPDATE_BN_OPS' collection.
      An op that resets `self.mean`, `self.variance` and `self.n_updates` to
      zero between inferences is added to the 'RESET_OPS' collection.
    :return: the normalized tensor, reshaped back to the original layout when
      virtual batching is active
    """
    in_eager_mode = context.executing_eagerly()
    if self.virtual_batch_size is not None:
        # Virtual batches (aka ghost batches) can be simulated by reshaping the
        # Tensor and reusing the existing batch norm implementation
        original_shape = [-1] + inputs.shape.as_list()[1:]
        expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

        # Will cause errors if virtual_batch_size does not divide the batch size
        inputs = array_ops.reshape(inputs, expanded_shape)

        def undo_virtual_batching(outputs):
            outputs = array_ops.reshape(outputs, original_shape)
            return outputs

    if self.fused:
        outputs = self._fused_batch_norm(
            inputs,
            training=training,
            use_moving_statistics=use_moving_statistics)
        if self.virtual_batch_size is not None:
            # Currently never reaches here since fused_batch_norm does not support
            # virtual batching
            outputs = undo_virtual_batching(outputs)
        return outputs

    # Compute the axes along which to reduce the mean / variance
    input_shape = inputs.get_shape()
    ndims = len(input_shape)
    reduction_axes = [i for i in range(ndims) if i not in self.axis]
    if self.virtual_batch_size is not None:
        del reduction_axes[1]  # Do not reduce along virtual batch dim

    # Broadcasting only necessary for single-axis batch norm where the axis is
    # not the last dimension
    broadcast_shape = [1] * ndims
    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value

    def _broadcast(v):
        # Reshape `v` to `broadcast_shape` unless it is None, already has
        # `ndims` dimensions, or the reduction covers every axis but the last.
        if (v is not None and
                len(v.get_shape()) != ndims and
                reduction_axes != list(range(ndims - 1))):
            return array_ops.reshape(v, broadcast_shape)
        return v

    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

    def _compose_transforms(scale, offset, then_scale, then_offset):
        # Fold (then_scale, then_offset) into (scale, offset).
        if then_scale is not None:
            scale *= then_scale
            offset *= then_scale
        if then_offset is not None:
            offset += then_offset
        return (scale, offset)

    # Determine a boolean value for `training`: could be True, False, or None.
    training_value = tf_utils.constant_value(training)
    if training_value is not False:
        if self.adjustment:
            adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
            # Adjust only during training.
            adj_scale = tf_utils.smart_cond(
                training,
                lambda: adj_scale,
                lambda: array_ops.ones_like(adj_scale))
            adj_bias = tf_utils.smart_cond(
                training,
                lambda: adj_bias,
                lambda: array_ops.zeros_like(adj_bias))
            scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

        # Some of the computations here are not necessary when training==False
        # but not a constant. However, this makes the code simpler.
        keep_dims = self.virtual_batch_size is not None or len(
            self.axis) > 1

        # mean and variance of the current batch
        mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)

        # Batch statistics when training; otherwise moving or raw statistics
        # depending on `use_moving_statistics`.
        mean = tf_utils.smart_cond(
            training,
            lambda: mean,
            lambda: tf_utils.smart_cond(use_moving_statistics,
                                        lambda: self.moving_mean,
                                        lambda: self.mean))
        variance = tf_utils.smart_cond(
            training,
            lambda: variance,
            lambda: tf_utils.smart_cond(
                use_moving_statistics,
                lambda: self.moving_variance,
                lambda: self.variance))

        if self.renorm:
            r, d, new_mean, new_variance = self._renorm_correction_and_moments(
                mean, variance, training)
            # When training, the normalized values (say, x) will be transformed as
            # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
            # = x * (r * gamma) + (d * gamma + beta) with renorm.
            r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
            d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
            scale, offset = _compose_transforms(r, d, scale, offset)
        else:
            new_mean, new_variance = mean, variance

        if self.virtual_batch_size is not None:
            # This isn't strictly correct since in ghost batch norm, you are
            # supposed to sequentially update the moving_mean and moving_variance
            # with each sub-batch. However, since the moving statistics are only
            # used during evaluation, it is more efficient to just update in one
            # step and should not make a significant difference in the result.
            new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
            new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True)

        def _do_update(var, value):
            # No moving-average update for a non-trainable layer in eager mode.
            if in_eager_mode and not self.trainable:
                return

            return self._assign_moving_average(var, value, self.momentum)

        moving_mean_update = tf_utils.smart_cond(
            training,
            lambda: _do_update(self.moving_mean, new_mean),
            lambda: self.moving_mean)
        moving_variance_update = tf_utils.smart_cond(
            training,
            lambda: _do_update(self.moving_variance, new_variance),
            lambda: self.moving_variance)
        if not context.executing_eagerly():
            self.add_update(moving_mean_update, inputs=True)
            self.add_update(moving_variance_update, inputs=True)

        # NOTE(review): the original indentation of this file was lost; the
        # statements below are placed at the training-branch level (not under
        # the `if not context.executing_eagerly()` above) — confirm.
        mean_update = self._update_statistics(self.mean, mean, self.n_updates)
        variance_update = self._update_statistics(self.variance, variance,
                                                  self.n_updates)
        with ops.control_dependencies([mean_update, variance_update]):
            # update n_updates only after updating self.mean and self.variance
            update_n_updates = state_ops.assign_add(self.n_updates, 1.)
            ops.add_to_collection('UPDATE_BN_OPS', update_n_updates)

        # Ops that zero the raw statistics and the update counter, grouped
        # behind a single no-op.
        reset_mean = state_ops.assign(self.mean, array_ops.zeros_like(self.mean))
        reset_variance = state_ops.assign(
            self.variance, array_ops.zeros_like(self.variance))
        reset_n_updates = state_ops.assign(self.n_updates, 0.)
        with ops.control_dependencies(
                [reset_mean, reset_variance, reset_n_updates]):
            reset_bn = gen_control_flow_ops.no_op("ResetBatchNormStats")
            ops.add_to_collection('RESET_OPS', reset_bn)

    else:  # training == False
        mean = tf_utils.smart_cond(use_moving_statistics,
                                   lambda: self.moving_mean,
                                   lambda: self.mean)
        variance = tf_utils.smart_cond(use_moving_statistics,
                                       lambda: self.moving_variance,
                                       lambda: self.variance)

    mean = math_ops.cast(mean, inputs.dtype)
    variance = math_ops.cast(variance, inputs.dtype)
    if offset is not None:
        offset = math_ops.cast(offset, inputs.dtype)
    outputs = nn.batch_normalization(inputs,
                                     _broadcast(mean),
                                     _broadcast(variance),
                                     offset,
                                     scale,
                                     self.epsilon)
    # If some components of the shape got lost due to adjustments, fix that.
    outputs.set_shape(input_shape)

    if self.virtual_batch_size is not None:
        outputs = undo_virtual_batching(outputs)

    return outputs
def dense_to_csr_sparse_matrix(dense):
    """Convert `dense` to a CSR sparse matrix using its nonzero locations.

    The locations are the indices where `abs(dense) > 0`; they are wrapped in
    `stop_gradient` before being passed to the conversion op.
    """
    dense_tensor = ops.convert_to_tensor(dense)
    nonzero_mask = math_ops.abs(dense_tensor) > 0
    nonzero_indices = array_ops.stop_gradient(array_ops.where(nonzero_mask))
    return sparse_csr_matrix_ops.dense_to_csr_sparse_matrix(
        dense_tensor, nonzero_indices)
def call(self, inputs, training=False):
  """Applies batch normalization to `inputs`.

  When `self.fused` is set the work is delegated to
  `self._fused_batch_norm`. Otherwise moments are computed over every axis
  except `self.axis`, the optional renorm correction is folded into the
  scale/offset, moving-average updates are registered with
  `self.add_update`, and the result of `nn.batch_normalization` is returned.

  Args:
    inputs: Tensor to normalize. Its static shape (`inputs.get_shape()`) is
      used for the rank and for the size along `self.axis`.
    training: Python boolean or tensor selecting batch statistics (true) or
      the moving statistics (false). Its constant value, if any, is resolved
      with `utils.constant_value`.

  Returns:
    The normalized tensor.
  """
  if self.fused:
    return self._fused_batch_norm(inputs, training=training)

  # First, compute the axes along which to reduce the mean / variance,
  # as well as the broadcast shape to be used for all parameters.
  input_shape = inputs.get_shape()
  ndim = len(input_shape)
  reduction_axes = list(range(len(input_shape)))
  del reduction_axes[self.axis]
  broadcast_shape = [1] * len(input_shape)
  broadcast_shape[self.axis] = input_shape[self.axis].value

  # Determines whether broadcasting is needed.
  # (It is skipped only when the reduction axes are exactly all but the last
  # axis, i.e. the normalized axis is the last one.)
  needs_broadcasting = (sorted(reduction_axes) != list(range(ndim))[:-1])

  scale, offset = self.gamma, self.beta

  # Determine a boolean value for `training`: could be True, False, or None.
  training_value = utils.constant_value(training)
  if training_value is not False:
    # Some of the computations here are not necessary when training==False
    # but not a constant. However, this makes the code simpler.
    mean, variance = nn.moments(inputs, reduction_axes)
    mean = _smart_select(training,
                         lambda: mean,
                         lambda: self.moving_mean)
    variance = _smart_select(training,
                             lambda: variance,
                             lambda: self.moving_variance)

    if self.renorm:
      r, d, new_mean, new_variance = self._renorm_correction_and_moments(
          mean, variance, training)
      # When training, the normalized values (say, x) will be transformed as
      # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
      # = x * (r * gamma) + (d * gamma + beta) with renorm.
      scale = array_ops.stop_gradient(r, name='renorm_r')
      offset = array_ops.stop_gradient(d, name='renorm_d')
      if self.gamma is not None:
        scale *= self.gamma
        offset *= self.gamma
      if self.beta is not None:
        offset += self.beta
    else:
      new_mean, new_variance = mean, variance

    # Update moving averages when training, and prevent updates otherwise.
    # (A decay of 1. is selected when not training.)
    decay = _smart_select(training, lambda: self.momentum, lambda: 1.)
    mean_update = moving_averages.assign_moving_average(
        self.moving_mean, new_mean, decay, zero_debias=False)
    variance_update = moving_averages.assign_moving_average(
        self.moving_variance, new_variance, decay, zero_debias=False)

    self.add_update(mean_update, inputs=inputs)
    self.add_update(variance_update, inputs=inputs)

  else:
    # `training` is statically False: use the stored moving statistics.
    mean, variance = self.moving_mean, self.moving_variance

  def _broadcast(v):
    # Reshapes `v` to `broadcast_shape` when needed; passes `None` through.
    if needs_broadcasting and v is not None:
      # In this case we must explicitly broadcast all parameters.
      return array_ops.reshape(v, broadcast_shape)
    return v

  return nn.batch_normalization(inputs,
                                _broadcast(mean),
                                _broadcast(variance),
                                _broadcast(offset),
                                _broadcast(scale),
                                self.epsilon)
def testStopGradient(self):
  """Checks that `gradients` returns None through `stop_gradient`."""
  with ops.Graph().as_default():
    inp = constant(1.0, shape=[100, 32], name="in")
    out = array_ops.stop_gradient(inp)
    igrad = gradients.gradients(out, inp)[0]
    # Use the test-case assertion rather than a bare `assert`: it is not
    # stripped under `python -O` and reports the offending value on failure.
    self.assertIsNone(igrad)
def _pairwise_comparison(sorted_labels,
                         sorted_logits,
                         sorted_weights,
                         lambda_weight=None):
  """Builds the pairwise label, logit and weight tensors for a sorted list.

  For every ordered pair of items (i, j) within a list:

    * `pairwise_labels`  is 1 when l_i > l_j and 0 otherwise.
    * `pairwise_logits`  is s_i - s_j.
    * `pairwise_weights` is 0 unless l_i > l_j and both labels are valid; in
      that case it is w_i times either |l_i - l_j| (when `lambda_weight` is
      None) or the value given by `lambda_weight.pair_weights`.

  The item-wise `sorted_weights` are applied along i only, so the weighting
  is not symmetric in (i, j) unless the weights are constant within a list.
  The returned weights are wrapped in `stop_gradient`.

  Args:
    sorted_labels: A `Tensor` with shape [batch_size, list_size] of sorted
      labels.
    sorted_logits: A `Tensor` with shape [batch_size, list_size] of sorted
      logits.
    sorted_weights: A `Tensor` with shape [batch_size, list_size] of sorted
      item-wise weights.
    lambda_weight: A `_LambdaWeight` object, or None.

  Returns:
    A tuple of (pairwise_labels, pairwise_logits, pairwise_weights), each of
    shape [batch_size, list_size, list_size].
  """

  def _all_pairs_difference(values):
    # Entry [b, i, j] of the result is values[b, i] - values[b, j].
    return array_ops.expand_dims(values, 2) - array_ops.expand_dims(values, 1)

  label_gap = _all_pairs_difference(sorted_labels)
  pairwise_logits = _all_pairs_difference(sorted_logits)
  pairwise_labels = math_ops.to_float(math_ops.greater(label_gap, 0))

  # A pair counts only when both of its items carry a valid label.
  label_ok = utils.is_label_valid(sorted_labels)
  pair_ok = math_ops.logical_and(
      array_ops.expand_dims(label_ok, 2), array_ops.expand_dims(label_ok, 1))

  # Start from the l_i > l_j indicator restricted to valid pairs, then scale
  # by the weight of item i.
  weights = pairwise_labels * math_ops.to_float(pair_ok)
  weights = weights * array_ops.expand_dims(sorted_weights, 2)

  if lambda_weight is None:
    weights = weights * math_ops.abs(label_gap)
  else:
    weights = weights * lambda_weight.pair_weights(sorted_labels)

  weights = array_ops.stop_gradient(weights, name='weights_stop_gradient')
  return pairwise_labels, pairwise_logits, weights
def __init__(self, rnn_cell, seq_inputs, initial_state):
  """Wraps `rnn_cell` as a flattened, `Defun`-based single-step function.

  Builds `FlatCellStep` (a `function.Defun` over the flattened inputs and
  state), a copy of it with its `_extra_inputs` cleared, and `CellStep`, which
  adapts the `(theta, state, inputs)` argument order to that function. The
  results are stored on `self`.

  Args:
    rnn_cell: Callable as `rnn_cell(inputs_t, state0)` returning
      `(outputs_t, state1)`; must also expose `zero_state` and `output_size`.
    seq_inputs: Input sequence; its `dtype` is read and index 0 is gathered
      from each element of the structure as a per-step template.
    initial_state: Structure of tensors used as the state template. Must not
      be None.
  """
  assert initial_state is not None

  # TODO(drpng): Dtype needs to be configurable.
  input_dtypes = [seq_inputs.dtype] + _GetDTypesFromStructure(initial_state)
  # See _index.
  # Template for a single time step of the inputs (element 0 along the first
  # axis), detached from the gradient.
  like_inputs_t = nest.map_structure(
      lambda x: array_ops.stop_gradient(array_ops.gather(x, 0)), seq_inputs)
  input_structure = (like_inputs_t, initial_state)

  @function.Defun(*input_dtypes)
  def FlatCellStep(*flat_inputs):
    """The flattened version of `rnn_cell`."""
    inputs_t, state0 = nest.pack_sequence_as(input_structure, flat_inputs)
    _SetShapeFromTemplate(state0, initial_state)
    _SetShapeFromTemplate(inputs_t, like_inputs_t)
    outputs_t, state1 = rnn_cell(inputs_t, state0)
    state_list = nest.flatten(state1)
    self._output_shape = outputs_t.shape

    # Find out whether the output is the very same tensor as one of the
    # flattened state entries.
    if outputs_t in state_list:
      output_index_in_state = state_list.index(outputs_t)
    else:
      output_index_in_state = None

    if output_index_in_state is None:
      self._prepend_output = True
      self._output_state_idx = 0
      return [outputs_t] + state_list
    else:
      self._output_state_idx = output_index_in_state
      self._prepend_output = False
      # To save memory, we don't return the output separately
      # from the state list, since we know it's the same.
      return state_list

  def _ToPureFunction(func):
    # NOTE: This forces the creation of the function.
    if func.captured_inputs:
      # Work on a shallow copy so that `func` itself keeps its extra inputs.
      pure_func = copy.copy(func)
      # pylint: disable=protected-access
      pure_func._extra_inputs = []
      return pure_func
    return func

  pure_flat_cell_step = _ToPureFunction(FlatCellStep)

  def CellStep(theta, extended_state0, inputs_t):
    """Performs one time step on structured inputs.

    The purpose of this function is to turn the parameters into flattened
    versions, and to resolve the parameter order difference between
    `Recurrent` and `RNNCell`.

    In the event the cell returns a transformed output that is not aliased
    within its state, the `extended_state0` also contains the output as its
    first element.

    Args:
      theta: Weights required for the computation. A structure of tensors.
      extended_state0: the state0, and possibly the output at the previous
        time step. A structure of tensors.
      inputs_t: the inputs at time t.

    Returns:
      A pair of the next state (inclusive of the output), and an empty list
      (unused `extras`).
      The next state is congruent to state0.
    """
    extended_state0_flat = nest.flatten(extended_state0)
    state0_flat = self.MaybeRemoveOutputFromState(extended_state0_flat)
    full_inputs = [inputs_t] + state0_flat + theta
    # Note that the thetas are additional inputs appended as extra
    # parameters.
    cell_out = pure_flat_cell_step(*full_inputs)
    return cell_out, []

  self._cell_step = CellStep
  self._theta = FlatCellStep.captured_inputs
  self._zero_state = rnn_cell.zero_state
  self._state_template = initial_state
  self._output_size = rnn_cell.output_size
def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
  """Clips `x` to a range without routing gradients through the clip.

  The result is computed as `x + stop_gradient(clip(x) - x)`: its value
  equals the clipped `x`, while the correction term is excluded from
  gradient computation.

  Args:
    x: Tensor to clip.
    clip_value_min: Lower bound passed to `clip_ops.clip_by_value`.
    clip_value_max: Upper bound passed to `clip_ops.clip_by_value`.
    name: Optional name for the enclosing name scope.

  Returns:
    A tensor whose value is `x` clipped to the given range.
  """
  scope_values = [x, clip_value_min, clip_value_max]
  with ops.name_scope(name, "clip_by_value_preserve_grad", scope_values):
    clipped = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
    correction = array_ops.stop_gradient(clipped - x)
    return x + correction
def call(self, inputs, training=False):
  """Applies batch normalization, optionally with virtual batches.

  When `self.virtual_batch_size` is set, `inputs` is first reshaped to
  `[virtual_batch_size, -1, ...]` and the output is reshaped back at the end.
  When `self.fused` is set the work is delegated to `self._fused_batch_norm`.
  Otherwise moments are computed over the axes not listed in `self.axis`,
  optional adjustment and renorm transforms are composed into scale/offset,
  moving statistics are updated, and `nn.batch_normalization` is applied.

  Args:
    inputs: Tensor to normalize.
    training: Python boolean or tensor selecting batch statistics (true) or
      the moving statistics (false).

  Returns:
    The normalized tensor, reshaped to the original layout when virtual
    batching is in use.
  """
  in_eager_mode = context.executing_eagerly()
  if self.virtual_batch_size is not None:
    # Virtual batches (aka ghost batches) can be simulated by reshaping the
    # Tensor and reusing the existing batch norm implementation
    original_shape = [-1] + inputs.shape.as_list()[1:]
    expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

    # Will cause errors if virtual_batch_size does not divide the batch size
    inputs = array_ops.reshape(inputs, expanded_shape)

    def undo_virtual_batching(outputs):
      # Restores the layout the inputs had before the reshape above.
      outputs = array_ops.reshape(outputs, original_shape)
      return outputs

  if self.fused:
    outputs = self._fused_batch_norm(inputs, training=training)
    if self.virtual_batch_size is not None:
      # Currently never reaches here since fused_batch_norm does not support
      # virtual batching
      return undo_virtual_batching(outputs)
    return outputs

  # Compute the axes along which to reduce the mean / variance
  # (`self.axis` is a collection of axes here, as it is tested with `in`).
  input_shape = inputs.get_shape()
  ndims = len(input_shape)
  reduction_axes = [i for i in range(ndims) if i not in self.axis]
  if self.virtual_batch_size is not None:
    del reduction_axes[1]     # Do not reduce along virtual batch dim

  # Broadcasting only necessary for single-axis batch norm where the axis is
  # not the last dimension
  broadcast_shape = [1] * ndims
  broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
  def _broadcast(v):
    # Reshapes `v` to `broadcast_shape` unless it is None, already has
    # `ndims` dimensions, or the reduction covers all but the last axis.
    if (v is not None and
        len(v.get_shape()) != ndims and
        reduction_axes != list(range(ndims - 1))):
      return array_ops.reshape(v, broadcast_shape)
    return v

  scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

  def _compose_transforms(scale, offset, then_scale, then_offset):
    # Composes x -> x * scale + offset with a following
    # y -> y * then_scale + then_offset; either of the latter may be None.
    if then_scale is not None:
      scale *= then_scale
      offset *= then_scale
    if then_offset is not None:
      offset += then_offset
    return (scale, offset)

  # Determine a boolean value for `training`: could be True, False, or None.
  training_value = utils.constant_value(training)
  if training_value is not False:
    if self.adjustment:
      adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
      # Adjust only during training.
      adj_scale = utils.smart_cond(training,
                                   lambda: adj_scale,
                                   lambda: array_ops.ones_like(adj_scale))
      adj_bias = utils.smart_cond(training,
                                  lambda: adj_bias,
                                  lambda: array_ops.zeros_like(adj_bias))
      scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

    # Some of the computations here are not necessary when training==False
    # but not a constant. However, this makes the code simpler.
    keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
    mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)

    moving_mean = self.moving_mean
    moving_variance = self.moving_variance

    mean = utils.smart_cond(training,
                            lambda: mean,
                            lambda: moving_mean)
    variance = utils.smart_cond(training,
                                lambda: variance,
                                lambda: moving_variance)

    if self.renorm:
      r, d, new_mean, new_variance = self._renorm_correction_and_moments(
          mean, variance, training)
      # When training, the normalized values (say, x) will be transformed as
      # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
      # = x * (r * gamma) + (d * gamma + beta) with renorm.
      r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
      d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
      scale, offset = _compose_transforms(r, d, scale, offset)
    else:
      new_mean, new_variance = mean, variance

    if self.virtual_batch_size is not None:
      # This isn't strictly correct since in ghost batch norm, you are
      # supposed to sequentially update the moving_mean and moving_variance
      # with each sub-batch. However, since the moving statistics are only
      # used during evaluation, it is more efficient to just update in one
      # step and should not make a significant difference in the result.
      new_mean = math_ops.reduce_mean(new_mean, axis=1, keep_dims=True)
      new_variance = math_ops.reduce_mean(new_variance, axis=1, keep_dims=True)

    def _do_update(var, value):
      # Returns None (no update) in eager mode when the layer is not
      # trainable; otherwise returns the moving-average assignment.
      if in_eager_mode and not self.trainable:
        return

      return moving_averages.assign_moving_average(
          var, value, self.momentum, zero_debias=False)

    mean_update = utils.smart_cond(
        training,
        lambda: _do_update(self.moving_mean, new_mean),
        lambda: self.moving_mean)
    variance_update = utils.smart_cond(
        training,
        lambda: _do_update(self.moving_variance, new_variance),
        lambda: self.moving_variance)
    # Updates are only registered with the layer when not executing eagerly.
    if not context.executing_eagerly():
      self.add_update(mean_update, inputs=inputs)
      self.add_update(variance_update, inputs=inputs)

  else:
    # `training` is statically False: use the stored moving statistics.
    mean, variance = self.moving_mean, self.moving_variance

  outputs = nn.batch_normalization(inputs,
                                   _broadcast(mean),
                                   _broadcast(variance),
                                   offset,
                                   scale,
                                   self.epsilon)
  # If some components of the shape got lost due to adjustments, fix that.
  outputs.set_shape(input_shape)

  if self.virtual_batch_size is not None:
    return undo_virtual_batching(outputs)

  return outputs
def call(self, inputs, training=False):
  """Applies batch normalization, optionally split into virtual batches.

  When `self.num_virtual_batches > 1`, `inputs` is reshaped and transposed so
  that the virtual-batch dimension is merged into the normalized axis; the
  inverse transformation is applied to the output. When `self.fused` is set
  the work is delegated to `self._fused_batch_norm`. Otherwise moments are
  computed over every axis except `self.axis`, the optional renorm correction
  is folded into scale/offset, moving averages are updated, and
  `nn.batch_normalization` is applied.

  Args:
    inputs: Tensor to normalize.
    training: Python boolean or tensor selecting batch statistics (true) or
      the moving statistics (false).

  Returns:
    The normalized tensor, in the original layout of `inputs`.
  """
  if self.num_virtual_batches > 1:
    # Virtual batches (aka ghost batches) can be simulated by using some
    # reshape/transpose tricks on top of base batch normalization.
    original_shape = [-1] + inputs.shape.as_list()[1:]
    expanded_shape = [-1, self.num_virtual_batches] + original_shape[1:]

    # Will cause errors if num_virtual_batches does not divide the batch size
    inputs = array_ops.reshape(inputs, expanded_shape)

    ndims = len(expanded_shape)
    if self.axis < 0:
      axis = ndims + self.axis
    else:
      axis = self.axis + 1      # Account for the added dimension

    # Permute the num_virtual_batch dimension (dim 1) to be adjacent to axis
    # TODO(b/66257056): when multi-axis batch normalization is implemented,
    # this permutation trick and the combined_dim reshape are no longer
    # necessary and can be reworked to simply use broadcasting.
    permutation = ([0] + list(range(2, axis)) + [1, axis] +
                   list(range(axis + 1, ndims)))
    # Sorting (permuted_index, position) pairs and taking the positions
    # yields the permutation that undoes `permutation`.
    inverse_permutation = [x[1] for x in
                           sorted(zip(permutation, range(ndims)))]
    inputs = array_ops.transpose(inputs, perm=permutation)

    # Combine the axis and num_virtual_batch dimension in order to take
    # advantage of fused batch normalization
    combined_dim = expanded_shape[1] * expanded_shape[axis]
    perm_shape = [-1] + inputs.shape.as_list()[1:]
    combined_shape = (perm_shape[:axis - 1] +
                      [combined_dim] +
                      perm_shape[axis + 1:])
    inputs = array_ops.reshape(inputs, combined_shape)
    # After the above reshape, the batch norm axis is the original self.axis

    # Undoes the reshaping and transposing tricks done above
    def undo_virtual_batching(outputs):
      outputs = array_ops.reshape(outputs, perm_shape)
      outputs = array_ops.transpose(outputs, perm=inverse_permutation)
      outputs = array_ops.reshape(outputs, original_shape)
      return outputs

  if self.fused:
    outputs = self._fused_batch_norm(inputs, training=training)
    if self.num_virtual_batches > 1:
      return undo_virtual_batching(outputs)
    return outputs

  # First, compute the axes along which to reduce the mean / variance,
  # as well as the broadcast shape to be used for all parameters.
  input_shape = inputs.get_shape()
  ndim = len(input_shape)
  reduction_axes = list(range(len(input_shape)))
  del reduction_axes[self.axis]
  broadcast_shape = [1] * len(input_shape)
  broadcast_shape[self.axis] = input_shape[self.axis].value

  # Determines whether broadcasting is needed.
  needs_broadcasting = (sorted(reduction_axes) != list(range(ndim))[:-1])

  scale, offset = self.gamma, self.beta

  # Determine a boolean value for `training`: could be True, False, or None.
  training_value = utils.constant_value(training)
  if training_value is not False:
    # Some of the computations here are not necessary when training==False
    # but not a constant. However, this makes the code simpler.
    mean, variance = nn.moments(inputs, reduction_axes)
    mean = _smart_select(training,
                         lambda: mean,
                         lambda: self.moving_mean)
    variance = _smart_select(training,
                             lambda: variance,
                             lambda: self.moving_variance)

    if self.renorm:
      r, d, new_mean, new_variance = self._renorm_correction_and_moments(
          mean, variance, training)
      # When training, the normalized values (say, x) will be transformed as
      # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
      # = x * (r * gamma) + (d * gamma + beta) with renorm.
      scale = array_ops.stop_gradient(r, name='renorm_r')
      offset = array_ops.stop_gradient(d, name='renorm_d')
      if self.gamma is not None:
        scale *= self.gamma
        offset *= self.gamma
      if self.beta is not None:
        offset += self.beta
    else:
      new_mean, new_variance = mean, variance

    # Update moving averages when training, and prevent updates otherwise.
    # (A decay of 1. is selected when not training.)
    decay = _smart_select(training, lambda: self.momentum, lambda: 1.)
    mean_update = moving_averages.assign_moving_average(
        self.moving_mean, new_mean, decay, zero_debias=False)
    variance_update = moving_averages.assign_moving_average(
        self.moving_variance, new_variance, decay, zero_debias=False)
    # Updates are only registered with the layer in graph mode.
    if context.in_graph_mode():
      self.add_update(mean_update, inputs=inputs)
      self.add_update(variance_update, inputs=inputs)

  else:
    # `training` is statically False: use the stored moving statistics.
    mean, variance = self.moving_mean, self.moving_variance

  def _broadcast(v):
    # Reshapes `v` to `broadcast_shape` when needed; passes `None` through.
    if needs_broadcasting and v is not None:
      # In this case we must explicitly broadcast all parameters.
      return array_ops.reshape(v, broadcast_shape)
    return v

  outputs = nn.batch_normalization(inputs,
                                   _broadcast(mean),
                                   _broadcast(variance),
                                   _broadcast(offset),
                                   _broadcast(scale),
                                   self.epsilon)

  if self.num_virtual_batches > 1:
    return undo_virtual_batching(outputs)

  return outputs
# INIT # ************************************* tf.global_variables_initializer().run() # ************************************* # NCE STAGING # ************************************* noise_sampler = candidate_sampling_ops.uniform_candidate_sampler( true_classes=input_labels.tensor, num_true=1, num_sampled=nce_samples, unique=True, range_max=vocab_size, seed=None) sampled, true_expected_count, sampled_expected_count = ( array_ops.stop_gradient(s) for s in noise_sampler) print("adaptive sample: ", tf.shape(noise_logits.tensor).eval()) print("[noise sample shape] {}".format(tf.shape(sampled).eval())) labels_flat = array_ops.reshape(input_labels.tensor, [-1]) true_ris = tx.gather_sparse(sp_tensor=ri_tensor, ids=labels_flat) noise_ris = tx.gather_sparse(sp_tensor=ri_tensor, ids=sampled) print("----") print("[true_ri shape] {}".format(tf.shape(true_ris).eval())) print("[noise_ri shape] {}".format(tf.shape(noise_ris).eval())) print("----")
def evaluate_on_sample(self, seed=None):
  """Evaluates on a sample drawn with `self.sample`.

  Args:
    seed: Seed passed to `self.sample`. When None, `self._default_seed` is
      used instead.

  Returns:
    The result of `self._evaluate` applied to the gradient-stopped sample.
  """
  effective_seed = self._default_seed if seed is None else seed
  # The sampled targets are treated as constants: gradients only flow back
  # through the inputs, never through the sample itself.
  targets = array_ops.stop_gradient(self.sample(effective_seed))
  return self._evaluate(targets)
def testStopGradient(self):
  """Constant value of a stop_gradient over a NumPy array is that array."""
  random_matrix = np.random.rand(4, 7)
  stopped = array_ops.stop_gradient(random_matrix)
  self.assertAllEqual(random_matrix, tensor_util.constant_value(stopped))
def embedFunc(y):
  """Looks up the embedding of `prev_symbol` from the enclosing scope.

  The argument `y` is not used. When `update_embedding` is false, the looked
  up embedding is wrapped in `stop_gradient` before being returned.
  """
  looked_up = embedding_ops.embedding_lookup(embedding, prev_symbol)
  if update_embedding:
    return looked_up
  return array_ops.stop_gradient(looked_up)
def combine_adversarial_loss(main_loss,
                             adversarial_loss,
                             weight_factor=None,
                             gradient_ratio=None,
                             gradient_ratio_epsilon=1e-6,
                             variables=None,
                             scalar_summaries=True,
                             gradient_summaries=True,
                             scope=None):
  """Merges a main loss and an adversarial loss into a single loss.

  Two combination modes are supported:

    1) A fixed coefficient on the adversarial loss (`weight_factor`).
    2) A fixed ratio between the gradient magnitudes of the two losses
       (`gradient_ratio`), often used so that both losses affect the weights
       roughly equally, as in https://arxiv.org/pdf/1705.05823.

  In both modes the coefficient is wrapped in `stop_gradient`. Scalar and
  gradient-magnitude summaries of the losses can optionally be emitted.

  Args:
    main_loss: A floating scalar Tensor indicating the main loss.
    adversarial_loss: A floating scalar Tensor indicating the adversarial
      loss.
    weight_factor: If not `None`, the coefficient by which to multiply the
      adversarial loss. Exactly one of this and `gradient_ratio` must be
      non-None.
    gradient_ratio: If not `None`, the ratio of the magnitude of the
      gradients. Specifically,
        gradient_ratio = grad_mag(main_loss) / grad_mag(adversarial_loss)
      Exactly one of this and `weight_factor` must be non-None.
    gradient_ratio_epsilon: An epsilon to add to the adversarial loss
      coefficient denominator, to avoid division-by-zero.
    variables: List of variables to calculate gradients with respect to. If
      not present, defaults to all trainable variables.
    scalar_summaries: Create scalar summaries of losses.
    gradient_summaries: Create gradient summaries of losses.
    scope: Optional name scope.

  Returns:
    A floating scalar Tensor indicating the desired combined loss.

  Raises:
    ValueError: Malformed input.
  """
  _validate_args([main_loss, adversarial_loss], weight_factor, gradient_ratio)
  if variables is None:
    variables = contrib_variables_lib.get_trainable_variables()

  scope_inputs = [main_loss, adversarial_loss]
  with ops.name_scope(scope, 'adversarial_loss', values=scope_inputs):
    # Gradient magnitudes are needed for the summaries and/or the ratio mode.
    uses_gradient_ratio = gradient_ratio is not None
    if gradient_summaries or uses_gradient_ratio:
      main_grads = gradients_impl.gradients(main_loss, variables)
      main_loss_grad_mag = _numerically_stable_global_norm(main_grads)
      adv_grads = gradients_impl.gradients(adversarial_loss, variables)
      adv_loss_grad_mag = _numerically_stable_global_norm(adv_grads)

    # Optional summaries.
    if scalar_summaries:
      summary.scalar('main_loss', main_loss)
      summary.scalar('adversarial_loss', adversarial_loss)
    if gradient_summaries:
      summary.scalar('main_loss_gradients', main_loss_grad_mag)
      summary.scalar('adversarial_loss_gradients', adv_loss_grad_mag)

    # Pick the combination rule. When the weight in use is statically `0`,
    # the adversarial term is left out of the result entirely.
    static_weight = _used_weight((weight_factor, gradient_ratio))
    if static_weight == 0:
      final_loss = main_loss
    elif weight_factor is not None:
      fixed_coeff = array_ops.stop_gradient(weight_factor)
      final_loss = main_loss + fixed_coeff * adversarial_loss
    elif uses_gradient_ratio:
      safe_adv_mag = adv_loss_grad_mag + gradient_ratio_epsilon
      grad_mag_ratio = main_loss_grad_mag / safe_adv_mag
      adv_coeff = grad_mag_ratio / gradient_ratio
      summary.scalar('adversarial_coefficient', adv_coeff)
      ratio_coeff = array_ops.stop_gradient(adv_coeff)
      final_loss = main_loss + ratio_coeff * adversarial_loss

  return final_loss