コード例 #1
0
  def _logits_cumulative(self, inputs, stop_gradient):
    """Evaluate logits of the cumulative densities.

    Args:
      inputs: The values at which to evaluate the cumulative densities, expected
        to be a `Tensor` of shape `(channels, 1, batch)`.
      stop_gradient: Boolean. Whether to add `array_ops.stop_gradient` calls so
        that the gradient of the output with respect to the density model
        parameters is disconnected (the gradient with respect to `inputs` is
        left untouched).

    Returns:
      A `Tensor` of the same shape as `inputs`, containing the logits of the
      cumulative densities evaluated at the given inputs.
    """
    logits = inputs

    for i in range(len(self.filters) + 1):
      matrix = self._matrices[i]
      if stop_gradient:
        matrix = array_ops.stop_gradient(matrix)
      logits = math_ops.matmul(matrix, logits)

      bias = self._biases[i]
      if stop_gradient:
        bias = array_ops.stop_gradient(bias)
      logits += bias

      if i < len(self._factors):
        factor = self._factors[i]
        if stop_gradient:
          factor = array_ops.stop_gradient(factor)
        logits += factor * math_ops.tanh(logits)

    return logits
コード例 #2
0
ファイル: util_test.py プロジェクト: Wajih-O/tensorflow
 def _run_test(self, x_, use_deferred_shape=False, **kwargs):
   x_ = np.asarray(x_)
   with self.cached_session() as sess:
     static_shape = None if use_deferred_shape else x_.shape
     x_pl = array_ops.placeholder_with_default(x_, shape=static_shape)
     # Add `zeros_like(x)` such that x's value and gradient are identical. We
     # do this so we can ensure each gradient value is mapped to the right
     # gradient location.  (Not doing this means the gradient wrt `x` is simple
     # `ones_like(x)`.)
     # Note:
     #   zeros_like_x_pl == zeros_like(x_pl)
     #   gradient(zeros_like_x_pl, x_pl) == x_pl - 1
     zeros_like_x_pl = (x_pl * array_ops.stop_gradient(x_pl - 1.)
                        - array_ops.stop_gradient(x_pl * (x_pl - 1.)))
     x = x_pl + zeros_like_x_pl
     actual = du.fill_triangular(x, **kwargs)
     grad_actual = gradients_impl.gradients(actual, x_pl)[0]
     [actual_, grad_actual_] = sess.run([actual, grad_actual],
                                        feed_dict={x_pl: x_})
   expected = self._fill_triangular(x_, **kwargs)
   if use_deferred_shape:
     self.assertEqual(None, actual.shape)
   else:
     self.assertAllEqual(expected.shape, actual.shape)
   self.assertAllClose(expected, actual_, rtol=1e-8, atol=1e-9)
   self.assertAllClose(x_, grad_actual_, rtol=1e-8, atol=1e-9)
コード例 #3
0
  def _create_value(self):
    """Create the value Tensor based on the value type, store as self._value."""

    if isinstance(self._value_type, MeanValue):
      value_tensor = self._dist.mean()
    elif isinstance(self._value_type, SampleValue):
      value_tensor = self._dist.sample(self._value_type.shape)
    else:
      raise TypeError("Unrecognized Distribution Value Type: %s",
                      self._value_type)

    if self._value_type.stop_gradient:
      # stop_gradient is being enforced by the value type
      return array_ops.stop_gradient(value_tensor)

    if isinstance(self._value_type, MeanValue):
      return value_tensor  # Using pathwise-derivative for this one.
    if self._dist.is_continuous and (
        self._dist.reparameterization_type
        is distribution.FULLY_REPARAMETERIZED):
      return value_tensor  # Using pathwise-derivative for this one.
    else:
      # Will have to perform some variant of score function
      # estimation.  Call stop_gradient on the sampler just in case we
      # may accidentally leak some gradient from it.
      return array_ops.stop_gradient(value_tensor)
コード例 #4
0
def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None):
  """Estimates the largest singular value in the weight tensor.

  Args:
    w_tensor: The weight matrix whose spectral norm should be computed.
    power_iteration_rounds: The number of iterations of the power method to
      perform. A higher number yields a better approximation.
    name: An optional scope name.

  Returns:
    The largest singular value (the spectral norm) of w.
  """
  with variable_scope.variable_scope(name, 'spectral_norm'):
    # The paper says to flatten convnet kernel weights from
    # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's Conv2D
    # kernel weight shape is (KH, KW, C_in, C_out), so it should be reshaped to
    # (KH * KW * C_in, C_out), and similarly for other layers that put output
    # channels as last dimension.
    # n.b. this means that w here is equivalent to w.T in the paper.
    w = array_ops.reshape(w_tensor, (-1, w_tensor.get_shape()[-1]))

    # Persisted approximation of first left singular vector of matrix `w`.
    u_var = variable_scope.get_variable(
        _PERSISTED_U_VARIABLE_SUFFIX,
        shape=(w.shape[0], 1),
        dtype=w.dtype,
        initializer=init_ops.random_normal_initializer(),
        trainable=False)
    u = u_var

    # Use power iteration method to approximate spectral norm.
    for _ in range(power_iteration_rounds):
      # `v` approximates the first right singular vector of matrix `w`.
      v = nn.l2_normalize(math_ops.matmul(array_ops.transpose(w), u))
      u = nn.l2_normalize(math_ops.matmul(w, v))

    # Update persisted approximation.
    with ops.control_dependencies([u_var.assign(u, name='update_u')]):
      u = array_ops.identity(u)

    u = array_ops.stop_gradient(u)
    v = array_ops.stop_gradient(v)

    # Largest singular value of `w`.
    spectral_norm = math_ops.matmul(
        math_ops.matmul(array_ops.transpose(u), w), v)
    spectral_norm.shape.assert_is_fully_defined()
    spectral_norm.shape.assert_is_compatible_with([1, 1])

    return spectral_norm[0][0]
コード例 #5
0
ファイル: gradients_test.py プロジェクト: didukhle/tensorflow
    def _MakeGraph(rng, stop_gradients=()):
      def _FunctionOf(xs, k=3):
        return ops.convert_to_tensor(
            sum(math_ops.matmul(rng.rand(k, k), x) for x in xs)
            + rng.rand(k, k))

      a = _FunctionOf([])
      if "a" in stop_gradients: a = array_ops.stop_gradient(a)
      b = _FunctionOf([a])
      if "b" in stop_gradients: b = array_ops.stop_gradient(b)
      c = _FunctionOf([a, b])
      if "c" in stop_gradients: c = array_ops.stop_gradient(c)
      d = _FunctionOf([b, c])
      if "d" in stop_gradients: d = array_ops.stop_gradient(d)
      return dict(a=a, b=b, c=c, d=d)
コード例 #6
0
def _statistics(x, axes):
  """Calculate the mean and mean square of `x`.

  Modified from the implementation of `tf.nn.moments`.

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.

  Returns:
    Two `Tensor` objects: `mean` and `square mean`.
  """
  # The dynamic range of fp16 is too limited to support the collection of
  # sufficient statistics. As a workaround we simply perform the operations
  # on 32-bit floats before converting the mean and variance back to fp16
  y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x

  # Compute true mean while keeping the dims for proper broadcasting.
  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keepdims=True))

  shifted_mean = math_ops.reduce_mean(y - shift, axes, keepdims=True)
  mean = shifted_mean + shift
  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keepdims=True)

  mean = array_ops.squeeze(mean, axes)
  mean_squared = array_ops.squeeze(mean_squared, axes)
  if x.dtype == dtypes.float16:
    return (math_ops.cast(mean, dtypes.float16),
            math_ops.cast(mean_squared, dtypes.float16))
  else:
    return (mean, mean_squared)
コード例 #7
0
def surrogate_loss(sample_losses,
                   stochastic_tensors=None,
                   name="SurrogateLoss"):
  """Surrogate loss for stochastic graphs.

  This function will call `loss_fn` on each `StochasticTensor`
  upstream of `sample_losses`, passing the losses that it influenced.

  Note that currently `surrogate_loss` does not work with `StochasticTensor`s
  instantiated in `while_loop`s or other control structures.

  Args:
    sample_losses: a list or tuple of final losses. Each loss should be per
      example in the batch (and possibly per sample); that is, it should have
      dimensionality of 1 or greater. All losses should have the same shape.
    stochastic_tensors: a list of `StochasticTensor`s to add loss terms for.
      If None, defaults to all `StochasticTensor`s in the graph upstream of
      the `Tensor`s in `sample_losses`.
    name: the name with which to prepend created ops.

  Returns:
    `Tensor` loss, which is the sum of `sample_losses` and the
    `loss_fn`s returned by the `StochasticTensor`s.

  Raises:
    TypeError: if `sample_losses` is not a list or tuple, or if its elements
      are not `Tensor`s.
    ValueError: if any loss in `sample_losses` does not have dimensionality 1
      or greater.
  """
  with ops.op_scope(sample_losses, name):
    fixed_losses = []
    if not isinstance(sample_losses, (list, tuple)):
      raise TypeError("sample_losses must be a list or tuple")
    for loss in sample_losses:
      if not isinstance(loss, ops.Tensor):
        raise TypeError("loss is not a Tensor: %s" % loss)
      ndims = loss.get_shape().ndims
      if not (ndims is not None and ndims >= 1):
        raise ValueError("loss must have dimensionality 1 or greater: %s" %
                         loss)
      fixed_losses.append(array_ops.stop_gradient(loss))

    stoch_dependencies_map = _stochastic_dependencies_map(
        fixed_losses, stochastic_tensors=stochastic_tensors)
    if not stoch_dependencies_map:
      logging.warn(
          "No collection of Stochastic Tensors found for current graph.")
      return math_ops.add_n(sample_losses)

    # Iterate through all of the stochastic dependencies, adding
    # surrogate terms where necessary.
    sample_losses = [ops.convert_to_tensor(loss) for loss in sample_losses]
    loss_terms = sample_losses
    for (stoch_node, dependent_losses) in stoch_dependencies_map.items():
      loss_term = stoch_node.loss(list(dependent_losses))
      if loss_term is not None:
        loss_terms.append(loss_term)

    return math_ops.add_n(loss_terms)
コード例 #8
0
def score_function(stochastic_tensor, value, loss, baseline=None,
                   name="ScoreFunction"):
  """Score function estimator.

  Computes the integrand of the score function with a baseline:
  `p.log_prob(value) * (loss - baseline)`.

  It will add a `stop_gradient` to the advantage `(loss - baseline)`.

  Args:
    stochastic_tensor: `StochasticTensor` p(x).
    value: `Tensor` x. Samples from p(x).
    loss: `Tensor`.
    baseline: `Tensor` broadcastable to `loss`.
    name: name to prepend ops with.

  Returns:
    `Tensor` `p.log_prob(x) * (loss - b)`. Taking the gradient yields the score
    function estimator.
  """
  with ops.name_scope(name, values=[value, loss, baseline]):
    value = ops.convert_to_tensor(value)
    loss = ops.convert_to_tensor(loss)
    if baseline is not None:
      baseline = ops.convert_to_tensor(baseline)
      advantage = loss - baseline
    else:
      advantage = loss

    advantage = array_ops.stop_gradient(advantage)
    return stochastic_tensor.distribution.log_prob(value) * advantage
コード例 #9
0
    def _tree_train_op_fn(loss):
      """Returns the op to optimize the loss."""
      if dnn_to_tree_distillation_param:
        loss_weight, loss_fn = dnn_to_tree_distillation_param
        weight_tensor = head_lib._weight_tensor(  # pylint: disable=protected-access
            features, head.weight_column_name)
        dnn_logits_fixed = array_ops.stop_gradient(dnn_logits)

        if loss_fn is None:
          # we create the loss_fn similar to the head loss_fn for
          # multi_class_head used previously as the default one.
          n_classes = 2 if head.logits_dimension == 1 else head.logits_dimension
          loss_fn = distillation_loss.create_dnn_to_tree_cross_entropy_loss_fn(
              n_classes)

        dnn_to_tree_distillation_loss = loss_weight * loss_fn(
            dnn_logits_fixed, tree_logits, weight_tensor)
        summary.scalar("dnn_to_tree_distillation_loss",
                       dnn_to_tree_distillation_loss)
        loss += dnn_to_tree_distillation_loss

      update_op = gbdt_model.train(loss, predictions_dict, labels)
      with ops.control_dependencies(
          [update_op]), (ops.colocate_with(global_step)):
        update_op = state_ops.assign_add(global_step, 1).op
        return update_op
コード例 #10
0
def additional_score_function_losses(sample_losses, name=None):
  with ops.op_scope(sample_losses, name, "SampleLosses"):
    fixed_losses = []
    if not isinstance(sample_losses, (list, tuple)):
      raise TypeError("sample_losses must be a list or tuple")
    for loss in sample_losses:
      if not isinstance(loss, ops.Tensor):
        raise TypeError("loss is not a Tensor: %s" % loss)
      ndims = loss.get_shape().ndims
      if not (ndims is not None and ndims <= 1):
        raise ValueError(
            "loss must be a scalar or batch-length vector loss: %s" % loss)
      fixed_losses.append(array_ops.stop_gradient(loss))

    stoch_dependencies_map = _stochastic_dependencies_map(fixed_losses)
    if not stoch_dependencies_map:
      logging.warn(
          "No collection of Stochastic Tensors found for current graph.")
      return []

    score_function_losses = []

    # Iterate through all of the stochastic dependencies, adding
    # surrogate terms where necessary.
    for (stoch_node, dependent_losses) in stoch_dependencies_map.items():
      score_function = stoch_node.score_function(list(dependent_losses))
      if score_function is not None:
        with ops.name_scope("ScoreFunction_%s" % stoch_node.name):
          score_function_losses.append(array_ops.identity(score_function))

    return score_function_losses
コード例 #11
0
ファイル: resample.py プロジェクト: AlbertXiebnu/tensorflow
def resample_at_rate(inputs, rates, scope=None, seed=None, back_prop=False):
  """Given `inputs` tensors, stochastically resamples each at a given rate.

  For example, if the inputs are `[[a1, a2], [b1, b2]]` and the rates
  tensor contains `[3, 1]`, then the return value may look like `[[a1,
  a2, a1, a1], [b1, b2, b1, b1]]`. However, many other outputs are
  possible, since this is stochastic -- averaged over many repeated
  calls, each set of inputs should appear in the output `rate` times
  the number of invocations.

  Args:
    inputs: A list of tensors, each of which has a shape of `[batch_size, ...]`
    rates: A tensor of shape `[batch_size]` contiaining the resampling rates
       for each input.
    scope: Scope for the op.
    seed: Random seed to use.
    back_prop: Whether to allow back-propagation through this op.

  Returns:
    Selections from the input tensors.
  """
  with ops.name_scope(scope, default_name='resample_at_rate',
                      values=list(inputs) + [rates]):
    rates = ops.convert_to_tensor(rates, name='rates')
    # random_poisson does not support rates of size 0 (b/36076216)
    sample_counts = math_ops.cast(control_flow_ops.cond(
        array_ops.shape(rates)[0] > 0,
        lambda: random_ops.random_poisson(rates, (), rates.dtype, seed=seed),
        lambda: array_ops.zeros(shape=[0], dtype=rates.dtype)), dtypes.int32)
    sample_indices = _repeat_range(sample_counts)
    if not back_prop:
      sample_indices = array_ops.stop_gradient(sample_indices)
    return [array_ops.gather(x, sample_indices) for x in inputs]
コード例 #12
0
def _logspace_mean(log_values):
  """Evaluate `Log[E[values]]` in a stable manner.

  Args:
    log_values:  `Tensor` holding `Log[values]`.

  Returns:
    `Tensor` of same `dtype` as `log_values`, reduced across dim 0.
      `Log[Mean[values]]`.
  """
  # center = Max[Log[values]],  with stop-gradient
  # The center hopefully keep the exponentiated term small.  It is cancelled
  # from the final result, so putting stop gradient on it will not change the
  # final result.  We put stop gradient on to eliminate unnecessary computation.
  center = array_ops.stop_gradient(_sample_max(log_values))

  # centered_values = exp{Log[values] - E[Log[values]]}
  centered_values = math_ops.exp(log_values - center)

  # log_mean_of_values = Log[ E[centered_values] ] + center
  #                    = Log[ E[exp{log_values - E[log_values]}] ] + center
  #                    = Log[E[values]] - E[log_values] + center
  #                    = Log[E[values]]
  log_mean_of_values = math_ops.log(_sample_mean(centered_values)) + center

  return log_mean_of_values
コード例 #13
0
ファイル: nn_grad.py プロジェクト: AlbertXiebnu/tensorflow
def _AvgPoolGradGrad(op, grad):
  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops._avg_pool(
      grad,
      op.get_attr("ksize"),
      op.get_attr("strides"),
      op.get_attr("padding"),
      data_format=op.get_attr("data_format")))
コード例 #14
0
  def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    if output_projection is not None:
      prev = nn_ops.xw_plus_b(
          prev, output_projection[0], output_projection[1])
    # prev= prev.get_shape().with_rank(2)[1]

    probs  = tf.log(tf.nn.softmax(prev))

    if i > 1:

        probs = tf.reshape(probs + log_beam_probs[-1],
                               [-1, beam_size * num_symbols])

    best_probs, indices = tf.nn.top_k(probs, beam_size)
    indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
    best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))

    symbols = indices % num_symbols # Which word in vocabulary.
    beam_parent = indices // num_symbols # Which hypothesis it came from.


    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.

    emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    emb_prev  = tf.reshape(emb_prev,[beam_size,embedding_size])
    # emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    if not update_embedding:
      emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
コード例 #15
0
ファイル: seq2seq.py プロジェクト: maxkarlovitz/tensorflow
 def extract_argmax_and_embed(prev, _):
   """Loop_function that extracts the symbol from prev and embeds it."""
   if output_projection is not None:
     prev = nn_ops.xw_plus_b(
         prev, output_projection[0], output_projection[1])
   prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
   return embedding_ops.embedding_lookup(embedding, prev_symbol)
コード例 #16
0
def softmax_cross_entropy(
    onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.

  `weights` acts as a coefficient for the loss. If a scalar is provided,
  then the loss is simply scaled by the given value. If `weights` is a
  tensor of shape `[batch_size]`, then the loss weights apply to each
  corresponding sample.

  If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes:
      new_onehot_labels = onehot_labels * (1 - label_smoothing)
                          + label_smoothing / num_classes

  Args:
    onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
    logits: `[batch_size, num_classes]` logits outputs of the network .
    weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
      broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
    label_smoothing: If greater than 0 then smooth the labels.
    scope: the scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
    `NONE`, this has shape `[batch_size]`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `logits` doesn't match that of `onehot_labels`
      or if the shape of `weights` is invalid or if `weights` is None.  Also if
      `onehot_labels` or `logits` is None.
  """
  if onehot_labels is None:
    raise ValueError("onehot_labels must not be None.")
  if logits is None:
    raise ValueError("logits must not be None.")
  with ops.name_scope(scope, "softmax_cross_entropy_loss",
                      (logits, onehot_labels, weights)) as scope:
    logits = ops.convert_to_tensor(logits)
    onehot_labels = math_ops.cast(onehot_labels, logits.dtype)
    logits.get_shape().assert_is_compatible_with(onehot_labels.get_shape())

    if label_smoothing > 0:
      num_classes = math_ops.cast(
          array_ops.shape(onehot_labels)[1], logits.dtype)
      smooth_positives = 1.0 - label_smoothing
      smooth_negatives = label_smoothing / num_classes
      onehot_labels = onehot_labels * smooth_positives + smooth_negatives

    onehot_labels = array_ops.stop_gradient(
        onehot_labels, name="labels_stop_gradient")
    losses = nn.softmax_cross_entropy_with_logits_v2(
        labels=onehot_labels, logits=logits, name="xentropy")


    return compute_weighted_loss(
        losses, weights, scope, loss_collection, reduction=reduction)
コード例 #17
0
  def _create_value(self):
    """Create the value Tensor based on the value type, store as self._value."""

    if isinstance(self._value_type, MeanValue):
      value_tensor = self._dist.mean()
    elif isinstance(self._value_type, SampleValue):
      value_tensor = self._dist.sample(self._value_type.n)
    elif isinstance(self._value_type, SampleAndReshapeValue):
      if self._value_type.n == 1:
        value_tensor = array_ops.squeeze(self._dist.sample(1), [0])
      else:
        samples = self._dist.sample(self._value_type.n)
        samples_shape = array_ops.shape(samples)
        samples_static_shape = samples.get_shape()
        new_batch_size = samples_shape[0] * samples_shape[1]
        value_tensor = array_ops.reshape(
            samples, array_ops.concat(0, ([new_batch_size], samples_shape[2:])))
        if samples_static_shape.ndims is not None:
          # Update the static shape for shape inference purposes
          shape_list = samples_static_shape.as_list()
          new_shape = tensor_shape.vector(
              shape_list[0] * shape_list[1]
              if shape_list[0] is not None and shape_list[1] is not None
              else None)
          new_shape = new_shape.concatenate(samples_static_shape[2:])
          value_tensor.set_shape(new_shape)
    else:
      raise TypeError(
          "Unrecognized Distribution Value Type: %s", self._value_type)

    stop_gradient = self._value_type.stop_gradient

    if stop_gradient:
      # stop_gradient is being enforced by the value type
      return array_ops.stop_gradient(value_tensor)

    if isinstance(self._value_type, MeanValue):
      return value_tensor  # Using pathwise-derivative for this one.
    if (isinstance(self._dist, distributions.ContinuousDistribution)
        and self._dist.is_reparameterized):
      return value_tensor  # Using pathwise-derivative for this one.
    else:
      # Will have to perform some variant of score function
      # estimation.  Call stop_gradient on the sampler just in case we
      # may accidentally leak some gradient from it.
      return array_ops.stop_gradient(value_tensor)
コード例 #18
0
def _rev_layer_backward(ys, grad_ys, f, g, f_vars, f_side_input, g_vars,
                        g_side_input):
  """Backprop for 1 layer."""
  y1, y2 = ys
  grad_y1, grad_y2 = grad_ys

  # Reconstruct intermediates and inputs (x1, x2)
  # stop_gradients required on fn inputs to prevent infinite recursion into this
  # grad function on the calls to gradients.
  y1_stop = array_ops.stop_gradient(y1)
  g_side_input = [array_ops.stop_gradient(t) for t in g_side_input]
  gy1 = g(y1_stop, g_side_input) if g_side_input else g(y1_stop)

  x2 = y2 - gy1
  x2_stop = array_ops.stop_gradient(x2)
  f_side_input = [array_ops.stop_gradient(t) for t in f_side_input]
  fx2 = f(x2_stop, f_side_input) if f_side_input else f(x2_stop)

  x1 = y1 - fx2

  # Compute gradients wrt to inputs
  # dL/dy2 * dG(y1)/y1
  grad_gy1_y2 = gradients_impl.gradients(gy1, y1_stop, grad_y2)[0]
  grad_x1 = grad_y1 + grad_gy1_y2
  grad_x2 = (
      gradients_impl.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 +
      gradients_impl.gradients(fx2, x2_stop, grad_gy1_y2)[0])

  # Compute gradients wrt to vars and side inputs in f and g
  grads1 = gradients_impl.gradients(gy1, g_vars + g_side_input, grad_y2)
  grad_g_vars, grad_g_side = grads1[:len(g_vars)], grads1[len(g_vars):]
  grads2 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_y1)
  grad_f_y1, grad_f_side1 = grads2[:len(f_vars)], grads2[len(f_vars):]
  grads3 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_gy1_y2)
  grad_f_y2, grad_f_side2 = grads3[:len(f_vars)], grads3[len(f_vars):]
  grad_f_vars = _acc_grads(grad_f_y1, grad_f_y2)

  grad_f_side = _acc_grads(grad_f_side1, grad_f_side2)

  # Put returns in a tuple to ensure a constant memory budget (i.e. don't want
  # the subsequent layer to start computing and consuming memory based on a
  # subset of these values).
  outputs = ((x1, x2), (grad_x1, grad_x2), (grad_f_vars, grad_f_side),
             (grad_g_vars, grad_g_side))
  tupled = control_flow_ops.tuple(nest.flatten(outputs))
  return nest.pack_sequence_as(outputs, tupled)
コード例 #19
0
 def evaluate(self):
   """Evaluate the loss function on the targets."""
   if self.targets is not None:
     # We treat the targets as "constant".  It's only the inputs that get
     # "back-propped" through.
     return self._evaluate(array_ops.stop_gradient(self.targets))
   else:
     raise Exception("Cannot evaluate losses with unspecified targets.")
コード例 #20
0
ファイル: seq2seq.py プロジェクト: UFAL-DSG/tgen
def rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                scope=None):
  """RNN decoder for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    cell: RNNCell defining the cell function and size.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    scope: VariableScope for the created subgraph; defaults to "rnn_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x cell.output_size] containing generated outputs.
    states: The state of each cell in each time-step. This is a list with
      length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
      (Note that in some cases, like basic RNN cell or GRU cell, outputs and
       states can be the same. They are different for LSTM cells though.)
  """
  with vs.variable_scope(scope or "rnn_decoder"):
    states = [initial_state]
    outputs = []
    prev = None
    for i in xrange(len(decoder_inputs)):
      inp = decoder_inputs[i]
      if loop_function is not None and prev is not None:
        with vs.variable_scope("loop_function", reuse=True):
          # We do not propagate gradients over the loop function.
          inp = array_ops.stop_gradient(loop_function(prev, i))
      if i > 0:
        vs.get_variable_scope().reuse_variables()
      output, new_state = cell(inp, states[-1])
      outputs.append(output)
      states.append(new_state)
      if loop_function is not None:
        prev = array_ops.stop_gradient(output)
  return outputs, states
コード例 #21
0
ファイル: seq2seq.py プロジェクト: rakshit-agrawal/tensorflow
 def loop_function(prev, _):
     if output_projection is not None:
         prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1])
     prev_symbol = math_ops.argmax(prev, 1)
     # Note that gradients will not propagate through the second parameter of
     # embedding_lookup.
     emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
     if not update_embedding:
         emb_prev = array_ops.stop_gradient(emb_prev)
     return emb_prev
コード例 #22
0
ファイル: stochastic_graph.py プロジェクト: AntHar/tensorflow
def surrogate_losses(sample_losses, name="SurrogateLosses"):
  """Compute surrogate losses for StochasticTensors in the graph.

  This function will call `surrogate_loss` on each `StochasticTensor` in the
  graph and pass the losses in `sample_losses` that that `StochasticTensor`
  influenced.

  Note that currently `surrogate_losses` does not work with `StochasticTensor`s
  instantiated in `while_loop`s or other control structures.

  Args:
    sample_losses: a list or tuple of final losses. Each loss should be per
      example in the batch (and possibly per sample); that is, it should have
      dimensionality of 1 or greater. All losses should have the same shape.
    name: the name with which to prepend created ops.

  Returns:
    A list of surrogate losses.

  Raises:
    TypeError: if `sample_losses` is not a list or tuple, or if its elements
      are not `Tensor`s.
    ValueError: if any loss in `sample_losses` does not have dimensionality 1
      or greater.
  """
  with ops.op_scope(sample_losses, name):
    fixed_losses = []
    if not isinstance(sample_losses, (list, tuple)):
      raise TypeError("sample_losses must be a list or tuple")
    for loss in sample_losses:
      if not isinstance(loss, ops.Tensor):
        raise TypeError("loss is not a Tensor: %s" % loss)
      ndims = loss.get_shape().ndims
      if not (ndims is not None and ndims >= 1):
        raise ValueError("loss must have dimensionality 1 or greater: %s" %
                         loss)
      fixed_losses.append(array_ops.stop_gradient(loss))

    stoch_dependencies_map = _stochastic_dependencies_map(fixed_losses)
    if not stoch_dependencies_map:
      logging.warn(
          "No collection of Stochastic Tensors found for current graph.")
      return []

    surrogate_loss_losses = []

    # Iterate through all of the stochastic dependencies, adding
    # surrogate terms where necessary.
    for (stoch_node, dependent_losses) in stoch_dependencies_map.items():
      surrogate_loss = stoch_node.surrogate_loss(list(dependent_losses))
      if surrogate_loss is not None:
        with ops.name_scope("SurrogateLoss_%s" % stoch_node.name):
          surrogate_loss_losses.append(array_ops.identity(surrogate_loss))

    return surrogate_loss_losses
コード例 #23
0
ファイル: util_test.py プロジェクト: Wajih-O/tensorflow
  def _run_test(self, x_, use_deferred_shape=False, **kwargs):
    x_ = np.asarray(x_)
    with self.cached_session() as sess:
      static_shape = None if use_deferred_shape else x_.shape
      x_pl = array_ops.placeholder_with_default(x_, shape=static_shape)
      zeros_like_x_pl = (x_pl * array_ops.stop_gradient(x_pl - 1.)
                         - array_ops.stop_gradient(x_pl * (x_pl - 1.)))
      x = x_pl + zeros_like_x_pl
      actual = du.fill_triangular(x, **kwargs)
      inverse_actual = du.fill_triangular_inverse(actual, **kwargs)

      inverse_actual_ = sess.run(
          inverse_actual,
          feed_dict={x_pl: x_})

    if use_deferred_shape:
      self.assertEqual(None, inverse_actual.shape)
    else:
      self.assertAllEqual(x_.shape, inverse_actual.shape)
    self.assertAllEqual(x_, inverse_actual_)
コード例 #24
0
 def testScanGradientWithPartStopGradient(self):
   a = variables.Variable(0.0, name="a")
   b = variables.Variable(0.0, name="b")
   elems = array_ops.zeros(5)
   l0, l1 = functional_ops.scan(
       lambda elem_, input_: (a, b), elems, initializer=(0., 0.))
   loss = l0 + array_ops.stop_gradient(l1)
   grad = gradients_impl.gradients(ys=[loss], xs=[a, b])
   with self.test_session(use_gpu=True) as sess:
     self.evaluate(variables.global_variables_initializer())
     self.evaluate(grad)
コード例 #25
0
ファイル: nn_impl.py プロジェクト: pcm17/tensorflow
def moments(x, axes, shift=None, name=None, keep_dims=False):
  """Calculate the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
  and variance of a vector.

  Note: for numerical stability, when shift=None, the true mean
  would be computed and used as shift.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` in which case the true mean of the data is
      used as shift. A shift close to the true mean provides the most
      numerically stable results.
    name: Name used to scope the operations that compute the moments.
    keep_dims: produce moments with the same dimensionality as the input.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.name_scope(name, "moments", [x, axes, shift]):
    # The dynamic range of fp16 is too limited to support the collection of
    # sufficient statistics. As a workaround we simply perform the operations
    # on 32-bit floats before converting the mean and variance back to fp16
    y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
    if shift is None:
      # Compute true mean while keeping the dims for proper broadcasting.
      shift = array_ops.stop_gradient(
          math_ops.reduce_mean(y, axes, keep_dims=True))
    else:
      shift = math_ops.cast(shift, y.dtype)
    counts, m_ss, v_ss, shift = sufficient_statistics(
        y, axes, shift=shift, keep_dims=keep_dims, name=name)
    # Reshape shift as needed.
    shift = array_ops.reshape(shift, array_ops.shape(m_ss))
    shift.set_shape(m_ss.get_shape())
    with ops.control_dependencies([counts, m_ss, v_ss]):
      mean, variance = normalize_moments(counts, m_ss, v_ss, shift, name=name)
      if x.dtype == dtypes.float16:
        return (math_ops.cast(mean, dtypes.float16),
                math_ops.cast(variance, dtypes.float16))
      else:
        return (mean, variance)
コード例 #26
0
ファイル: seq2seq.py プロジェクト: Kwihyuk/Conversation_model
   def loop_function(prev,_):

      if output_projection is not None:
         prev = nn_ops.xw_plus_b ( prev, output_projection[0], output_projection[1] )
      
      tf_prev_symbol = batch_sample_with_temperature(prev)
      emb_prev = embedding_ops.embedding_lookup(embedding, tf_prev_symbol)

      if not update_embedding :
         emb_prev = array_ops.stop_gradient(emb_prev)
      return emb_prev
コード例 #27
0
def _hessian_vector_product(ys, xs, v):
  """Multiply the Hessian of `ys` wrt `xs` by `v`.

  This is an efficient construction that uses a backprop-like approach
  to compute the product between the Hessian and another vector. The
  Hessian is usually too large to be explicitly computed or even
  represented, but this method allows us to at least multiply by it
  for the same big-O cost as backprop.

  Implicit Hessian-vector products are the main practical, scalable way
  of using second derivatives with neural networks. They allow us to
  do things like construct Krylov subspaces and approximate conjugate
  gradient descent.

  Example: if `y` = 1/2 `x`^T A `x`, then `hessian_vector_product(y,
  x, v)` will return an expression that evaluates to the same values
  as (A + A.T) `v`.

  Args:
    ys: A scalar value, or a tensor or list of tensors to be summed to
        yield a scalar.
    xs: A list of tensors that we should construct the Hessian over.
    v: A list of tensors, with the same shapes as xs, that we want to
       multiply by the Hessian.

  Returns:
    A list of tensors (or if the list would be length 1, a single tensor)
    containing the product between the Hessian and `v`.

  Raises:
    ValueError: `xs` and `v` have different length.

  """

  # Validate the input
  length = len(xs)
  if len(v) != length:
    raise ValueError("xs and v must have the same length.")

  # First backprop
  grads = gradients(ys, xs)

  assert len(grads) == length
  elemwise_products = [
      math_ops.multiply(grad_elem, array_ops.stop_gradient(v_elem))
      for grad_elem, v_elem in zip(grads, v)
      if grad_elem is not None
  ]

  # Second backprop
  return gradients(elemwise_products, xs)
コード例 #28
0
  def evaluate_on_sample(self, seed=None):
    """Evaluates the log probability on a random sample.

    Args:
      seed: int or None. Random seed for this draw from the distribution.

    Returns:
      Log probability of sampled targets, summed across examples.
    """
    if seed is None:
      seed = self._default_seed
    # We treat the targets as "constant".  It's only the inputs that get
    # "back-propped" through.
    return self._evaluate(array_ops.stop_gradient(self.sample(seed)))
コード例 #29
0
ファイル: nn_grad.py プロジェクト: chdinh/tensorflow
def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
  """Returns the gradients for the 3 inputs of BatchNorm.

  Args:
    grad_y: A `Tensor` of 4 dimensions for gradient for y.
    x: A `Tensor` of 4 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW".

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset.
  """
  if data_format == b"NHWC":
    keep_dims = False
    reduce_axis = [0, 1, 2]
  else:
    keep_dims = True
    reduce_axis = [0, 2, 3]
    shape = [1, array_ops.size(scale), 1, 1]
    scale = array_ops.reshape(scale, shape)
  mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
  mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
  var_x = math_ops.reduce_mean(
      math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
      reduce_axis,
      keep_dims=keep_dims)
  grad_y_offset = grad_y - mean_grad_y
  x_offset = x - mean_x
  mean = math_ops.reduce_mean(
      grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
  grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
      grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
  grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
      grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
  if data_format == b"NCHW":
    grad_scale = array_ops.squeeze(grad_scale)
  grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
  return grad_x, grad_scale, grad_offset
コード例 #30
0
def _force_data_dependency(first_compute, then_compute):
  """Force all of `then_compute` to depend on all of `first_compute`.

  Uses a dummy data dependency, which is useful when running on TPUs because
  XLA ignores control dependencies. Only supports float arguments.

  Args:
    first_compute: `list<Tensor>`. These will be made to run before the
      `Tensor`s `then_compute`.
    then_compute: `list<Tensor>`. These will run after all the `Tensor`s in
      `first_compute`.

  Returns:
    `list<Tensor>`, same length as `then_compute`.

  Raises:
    ValueError: if ranks are unknown or types are not floating.
  """

  def _first_element(x):
    if x.get_shape().ndims is None:
      raise ValueError("Rank of Tensor %s must be known" % x)
    ndims = x.get_shape().ndims
    begin = framework_ops.convert_to_tensor([0] * ndims, dtype=dtypes.int32)
    size = framework_ops.convert_to_tensor([1] * ndims, dtype=dtypes.int32)
    return array_ops.reshape(array_ops.slice(x, begin, size), [])

  first_compute_sum = math_ops.add_n(
      [_first_element(x) for x in first_compute if x is not None])
  dtype = first_compute_sum.dtype
  if not dtype.is_floating:
    raise ValueError("_force_data_dependency only supports floating dtypes.")
  epsilon = np.finfo(dtype.as_numpy_dtype).tiny
  zero = array_ops.stop_gradient(epsilon * first_compute_sum)

  return [
      array_ops.identity(x) + zero if x is not None else None
      for x in then_compute
  ]
コード例 #31
0
def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
    """Clips input while leaving gradient unaltered."""
    with ops.name_scope(name, "clip_by_value_preserve_grad",
                        [x, clip_value_min, clip_value_max]):
        clip_x = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
        return x + array_ops.stop_gradient(clip_x - x)
コード例 #32
0
ファイル: backprop_test.py プロジェクト: ashuven63/tf_audio
 def testStopGradient(self):
   grad = backprop.gradients_function(
       lambda x: array_ops.stop_gradient(math_ops.argmax(x)))
   self.assertAllEqual(grad([0.0])[0], None)
コード例 #33
0
ファイル: normalization.py プロジェクト: denizobi/tensorflow
  def call(self, inputs, training=None):
    if training is None:
      training = K.learning_phase()

    if self.virtual_batch_size is not None:
      # Virtual batches (aka ghost batches) can be simulated by reshaping the
      # Tensor and reusing the existing batch norm implementation
      original_shape = [-1] + inputs.shape.as_list()[1:]
      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

      # Will cause errors if virtual_batch_size does not divide the batch size
      inputs = array_ops.reshape(inputs, expanded_shape)

      def undo_virtual_batching(outputs):
        outputs = array_ops.reshape(outputs, original_shape)
        return outputs

    if self.fused:
      outputs = self._fused_batch_norm(inputs, training=training)
      if self.virtual_batch_size is not None:
        # Currently never reaches here since fused_batch_norm does not support
        # virtual batching
        outputs = undo_virtual_batching(outputs)
      return outputs

    # Compute the axes along which to reduce the mean / variance
    input_shape = inputs.shape
    ndims = len(input_shape)
    reduction_axes = [i for i in range(ndims) if i not in self.axis]
    if self.virtual_batch_size is not None:
      del reduction_axes[1]     # Do not reduce along virtual batch dim

    # Broadcasting only necessary for single-axis batch norm where the axis is
    # not the last dimension
    broadcast_shape = [1] * ndims
    broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value
    def _broadcast(v):
      if (v is not None and len(v.shape) != ndims and
          reduction_axes != list(range(ndims - 1))):
        return array_ops.reshape(v, broadcast_shape)
      return v

    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

    def _compose_transforms(scale, offset, then_scale, then_offset):
      if then_scale is not None:
        scale *= then_scale
        offset *= then_scale
      if then_offset is not None:
        offset += then_offset
      return (scale, offset)

    # Determine a boolean value for `training`: could be True, False, or None.
    training_value = tf_utils.constant_value(training)
    if training_value is not False:
      if self.adjustment:
        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
        # Adjust only during training.
        adj_scale = tf_utils.smart_cond(training,
                                        lambda: adj_scale,
                                        lambda: array_ops.ones_like(adj_scale))
        adj_bias = tf_utils.smart_cond(training,
                                       lambda: adj_bias,
                                       lambda: array_ops.zeros_like(adj_bias))
        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

      # Some of the computations here are not necessary when training==False
      # but not a constant. However, this makes the code simpler.
      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
      mean, variance = self._moments(
          math_ops.cast(inputs, self._param_dtype),
          reduction_axes,
          keep_dims=keep_dims)

      moving_mean = self.moving_mean
      moving_variance = self.moving_variance

      mean = tf_utils.smart_cond(training,
                                 lambda: mean,
                                 lambda: ops.convert_to_tensor(moving_mean))
      variance = tf_utils.smart_cond(
          training,
          lambda: variance,
          lambda: ops.convert_to_tensor(moving_variance))

      if self.virtual_batch_size is not None:
        # This isn't strictly correct since in ghost batch norm, you are
        # supposed to sequentially update the moving_mean and moving_variance
        # with each sub-batch. However, since the moving statistics are only
        # used during evaluation, it is more efficient to just update in one
        # step and should not make a significant difference in the result.
        new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
        new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True)
      else:
        new_mean, new_variance = mean, variance

      if self.renorm:
        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
            new_mean, new_variance, training)
        # When training, the normalized values (say, x) will be transformed as
        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
        # = x * (r * gamma) + (d * gamma + beta) with renorm.
        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
        scale, offset = _compose_transforms(r, d, scale, offset)

      if distribution_strategy_context.in_cross_replica_context():
        strategy = distribution_strategy_context.get_strategy()

        def _do_update(var, value):
          """Compute the updates for mean and variance."""
          return strategy.extended.update(
              var, self._assign_moving_average, (value, self.momentum),
              group=False)
        # We need to unwrap the moving_mean or moving_variance in the case of
        # training being false to match the output of true_fn and false_fn
        # in the smart cond.
        def mean_update():
          true_branch = lambda: _do_update(self.moving_mean, new_mean)
          false_branch = lambda: strategy.unwrap(self.moving_mean)
          return tf_utils.smart_cond(training, true_branch, false_branch)

        def variance_update():
          return tf_utils.smart_cond(
              training, lambda: _do_update(self.moving_variance, new_variance),
              lambda: strategy.unwrap(self.moving_variance))
      else:
        def _do_update(var, value):
          """Compute the updates for mean and variance."""
          return self._assign_moving_average(var, value, self.momentum)

        def mean_update():
          true_branch = lambda: _do_update(self.moving_mean, new_mean)
          false_branch = lambda: self.moving_mean
          return tf_utils.smart_cond(training, true_branch, false_branch)

        def variance_update():
          true_branch = lambda: _do_update(self.moving_variance, new_variance)
          false_branch = lambda: self.moving_variance
          return tf_utils.smart_cond(training, true_branch, false_branch)

      self.add_update(mean_update, inputs=True)
      self.add_update(variance_update, inputs=True)

    else:
      mean, variance = self.moving_mean, self.moving_variance

    mean = math_ops.cast(mean, inputs.dtype)
    variance = math_ops.cast(variance, inputs.dtype)
    if offset is not None:
      offset = math_ops.cast(offset, inputs.dtype)
    if scale is not None:
      scale = math_ops.cast(scale, inputs.dtype)
    # TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing
    # math in float16 hurts validation accuracy of popular models like resnet.
    outputs = nn.batch_normalization(inputs,
                                     _broadcast(mean),
                                     _broadcast(variance),
                                     offset,
                                     scale,
                                     self.epsilon)
    # If some components of the shape got lost due to adjustments, fix that.
    outputs.set_shape(input_shape)

    if self.virtual_batch_size is not None:
      outputs = undo_virtual_batching(outputs)
    return outputs
コード例 #34
0
ファイル: beam_search.py プロジェクト: jack-and-rozz/seq2seq
    def loop_function(i, prev, state, log_beam_probs, beam_path, beam_symbols,
                      path_lengthes, is_finished_beam):
        output_size = prev.get_shape().as_list()[-1]
        state_size = state.get_shape().as_list()[-1]

        if i == 1:
            # todo: prevだけではなくstateも分岐
            probs = nn_ops.xw_plus_b(prev, output_projection[0],
                                     output_projection[1])
            probs = tf.log(tf.nn.softmax(probs))

            best_probs, indices = tf.nn.top_k(probs, beam_size)

            # initialize length and EOS flags for each beam.
            path_lengthes = tf.fill([batch_size, beam_size], 1.0)
            is_finished_beam = tf.fill([batch_size, beam_size], False)
            # expand previous states to beams. (e.g. batch_size=beam_size=2: [a, b] -> [a, a, b, b])
            prev = tf.gather(
                prev,
                tf.tile(tf.expand_dims(tf.range(batch_size), dim=1),
                        [1, beam_size]))
            # prev: [batch, beam, hidden] -> [batch * beam, hidden]
            prev = tf.reshape(prev, [-1, output_size])

            state = tf.gather(
                state,
                tf.tile(tf.expand_dims(tf.range(batch_size), dim=1),
                        [1, beam_size]))
            state = tf.reshape(state, [-1, state_size])

        else:
            probs = nn_ops.xw_plus_b(prev, output_projection[0],
                                     output_projection[1])
            probs = tf.log(tf.nn.softmax(probs))
            probs = tf.reshape(probs, [-1, beam_size * num_symbols])

            # divide probs by the length of each beam (length penalty) and select top-k.
            pl = tf.reshape(tile_from_beam_to_vocab(path_lengthes),
                            [-1, beam_size * num_symbols])
            best_probs, indices = tf.nn.top_k(probs / pl, beam_size)
        symbols = indices % num_symbols
        beam_parent = indices // num_symbols

        beam_symbols.append(symbols)
        beam_path.append(beam_parent)
        log_beam_probs.append(best_probs)

        is_finished_beam = tf.logical_or(
            tf.gather_nd(is_finished_beam, divide_index_by_batch(beam_parent)),
            tf.equal(symbols, tf.constant(EOS_ID)))

        path_lengthes = tf.gather_nd(path_lengthes,
                                     divide_index_by_batch(beam_parent))
        path_lengthes += tf.to_float(tf.logical_not(is_finished_beam))

        beam_state = tf.gather_nd(
            tf.reshape(state, [batch_size, beam_size, state_size]),
            divide_index_by_batch(beam_parent))
        emb_prev = embedding_ops.embedding_lookup(embedding, symbols)

        # [batch, beam, embedding] -> [batch * beam, embedding]
        beam_state = tf.reshape(beam_state, [-1, state_size])
        emb_prev = tf.reshape(emb_prev, [-1, embedding_size])
        if not update_embedding:
            emb_prev = array_ops.stop_gradient(emb_prev)
        return emb_prev, beam_state, path_lengthes, is_finished_beam
コード例 #35
0
 def score_function_with_advantage(dist_tensor, value, loss):
     with ops.name_scope(name, values=[value, loss]):
         advantage = advantage_fn(dist_tensor, loss)
         advantage = array_ops.stop_gradient(advantage)
         return dist_tensor.distribution.log_prob(value) * advantage
コード例 #36
0
    def __init__(self, **kwargs):
        np.random.seed(0)
        tf.set_random_seed(0)

        self.batch_size = kwargs.pop('batch_size')
        self.data_sets = kwargs.pop('data_sets')
        self.train_dir = kwargs.pop('train_dir', 'output')
        log_dir = kwargs.pop('log_dir', 'log')
        self.model_name = kwargs.pop('model_name')
        self.num_classes = kwargs.pop('num_classes')
        self.initial_learning_rate = kwargs.pop('initial_learning_rate')

        # if 'keep_probs' in kwargs: self.keep_probs = kwargs.pop('keep_probs')
        # else: self.keep_probs = None

        if 'mini_batch' in kwargs: self.mini_batch = kwargs.pop('mini_batch')
        else: self.mini_batch = True

        if 'damping' in kwargs: self.damping = kwargs.pop('damping')
        else: self.damping = 0.0

        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)

        # Initialize session
        config = tf.ConfigProto()
        self.sess = tf.Session(config=config)
        K.set_session(self.sess)

        # Setup input
        self.input_placeholder, self.labels_placeholder = self.placeholder_inputs(
        )
        self.num_train_examples = self.data_sets.train.labels.shape[0]
        self.num_test_examples = self.data_sets.test.labels.shape[0]

        # Setup inference and training
        # if self.keep_probs is not None:
        #     self.keep_probs_placeholder = tf.placeholder(tf.float32, shape=(2))
        #     self.logits = self.inference(self.input_placeholder, self.keep_probs_placeholder)
        # elif hasattr(self, 'inference_needs_labels'):
        #     self.logits = self.inference(self.input_placeholder, self.labels_placeholder)
        # else:
        self.logits = self.inference(self.input_placeholder)

        self.total_loss, self.loss_no_reg, self.indiv_loss_no_reg = self.loss(
            self.logits, self.labels_placeholder)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.learning_rate = tf.Variable(self.initial_learning_rate,
                                         name='learning_rate',
                                         trainable=False)
        #self.learning_rate_placeholder = tf.placeholder(tf.float32)
        #self.update_learning_rate_op = tf.assign(self.learning_rate, self.learning_rate_placeholder)

        self.train_op = self.get_train_op(self.total_loss, self.global_step,
                                          self.learning_rate)
        self.train_op = self.get_train_sgd_op(self.total_loss,
                                              self.global_step,
                                              self.learning_rate)
        self.accuracy_op = self.get_accuracy_op(self.logits,
                                                self.labels_placeholder)
        self.preds = self.predictions(self.logits)

        # Setup misc
        self.saver = tf.train.Saver()

        # Setup gradients and Hessians
        self.params = self.get_all_params()
        self.grad_total_loss_op = tf.gradients(self.total_loss, self.params)
        self.grad_loss_no_reg_op = tf.gradients(self.loss_no_reg, self.params)
        self.v_placeholder = [
            tf.placeholder(tf.float32, shape=a.get_shape())
            for a in self.params
        ]
        self.u_placeholder = [
            tf.placeholder(tf.float32, shape=a.get_shape())
            for a in self.params
        ]

        self.hessian_vector = hessian_vector_product(self.total_loss,
                                                     self.params,
                                                     self.v_placeholder)

        self.grad_loss_wrt_input_op = tf.gradients(self.total_loss,
                                                   self.input_placeholder)

        # Because tf.gradients auto accumulates, we probably don't need the add_n (or even reduce_sum)
        self.influence_op = tf.add_n([
            tf.reduce_sum(tf.multiply(a, array_ops.stop_gradient(b)))
            for a, b in zip(self.grad_total_loss_op, self.v_placeholder)
        ])

        self.grad_influence_wrt_input_op = tf.gradients(
            self.influence_op, self.input_placeholder)

        self.checkpoint_file = os.path.join(self.train_dir,
                                            "%s-checkpoint" % self.model_name)

        self.all_train_feed_dict = self.fill_feed_dict_with_all_ex(
            self.data_sets.train)
        self.all_test_feed_dict = self.fill_feed_dict_with_all_ex(
            self.data_sets.test)

        init = tf.global_variables_initializer()
        self.sess.run(init)

        self.vec_to_list = self.get_vec_to_list_fn()
コード例 #37
0
ファイル: losses_impl.py プロジェクト: idodan1/thesis
def softmax_cross_entropy(
    onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits_v2.

  `weights` acts as a coefficient for the loss. If a scalar is provided,
  then the loss is simply scaled by the given value. If `weights` is a
  tensor of shape `[batch_size]`, then the loss weights apply to each
  corresponding sample.

  If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes:
      new_onehot_labels = onehot_labels * (1 - label_smoothing)
                          + label_smoothing / num_classes

  Note that `onehot_labels` and `logits` must have the same shape,
  e.g. `[batch_size, num_classes]`. The shape of `weights` must be
  broadcastable to loss, whose shape is decided by the shape of `logits`.
  In case the shape of `logits` is `[batch_size, num_classes]`, loss is
  a `Tensor` of shape `[batch_size]`.

  Args:
    onehot_labels: One-hot-encoded labels.
    logits: Logits outputs of the network.
    weights: Optional `Tensor` that is broadcastable to loss.
    label_smoothing: If greater than 0 then smooth the labels.
    scope: the scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
    `NONE`, this has shape `[batch_size]`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `logits` doesn't match that of `onehot_labels`
      or if the shape of `weights` is invalid or if `weights` is None.  Also if
      `onehot_labels` or `logits` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if onehot_labels is None:
    raise ValueError("onehot_labels must not be None.")
  if logits is None:
    raise ValueError("logits must not be None.")
  with ops.name_scope(scope, "softmax_cross_entropy_loss",
                      (logits, onehot_labels, weights)) as scope:
    logits = ops.convert_to_tensor(logits)
    onehot_labels = math_ops.cast(onehot_labels, logits.dtype)
    logits.get_shape().assert_is_compatible_with(onehot_labels.get_shape())

    if label_smoothing > 0:
      num_classes = math_ops.cast(
          array_ops.shape(onehot_labels)[-1], logits.dtype)
      smooth_positives = 1.0 - label_smoothing
      smooth_negatives = label_smoothing / num_classes
      onehot_labels = onehot_labels * smooth_positives + smooth_negatives

    onehot_labels = array_ops.stop_gradient(
        onehot_labels, name="labels_stop_gradient")
    losses = nn.softmax_cross_entropy_with_logits_v2(
        labels=onehot_labels, logits=logits, name="xentropy")

    return compute_weighted_loss(
        losses, weights, scope, loss_collection, reduction=reduction)
 def argmax(x):
     i = math_ops.argmax(x)
     return array_ops.stop_gradient(i)
コード例 #39
0
def surrogate_loss(sample_losses,
                   stochastic_tensors=None,
                   name="SurrogateLoss"):
    """Surrogate loss for stochastic graphs.

  This function will call `loss_fn` on each `StochasticTensor`
  upstream of `sample_losses`, passing the losses that it influenced.

  Note that currently `surrogate_loss` does not work with `StochasticTensor`s
  instantiated in `while_loop`s or other control structures.

  Args:
    sample_losses: a list or tuple of final losses. Each loss should be per
      example in the batch (and possibly per sample); that is, it should have
      dimensionality of 1 or greater. All losses should have the same shape.
    stochastic_tensors: a list of `StochasticTensor`s to add loss terms for.
      If None, defaults to all `StochasticTensor`s in the graph upstream of
      the `Tensor`s in `sample_losses`.
    name: the name with which to prepend created ops.

  Returns:
    `Tensor` loss, which is the sum of `sample_losses` and the
    `loss_fn`s returned by the `StochasticTensor`s.

  Raises:
    TypeError: if `sample_losses` is not a list or tuple, or if its elements
      are not `Tensor`s.
    ValueError: if any loss in `sample_losses` does not have dimensionality 1
      or greater.
  """
    with ops.name_scope(name, values=sample_losses):
        if not isinstance(sample_losses, (list, tuple)):
            raise TypeError("sample_losses must be a list or tuple")
        for loss in sample_losses:
            if not isinstance(loss, ops.Tensor):
                raise TypeError("loss is not a Tensor: %s" % loss)
            ndims = loss.get_shape().ndims
            if not (ndims is not None and ndims >= 1):
                raise ValueError(
                    "loss must have dimensionality 1 or greater: %s" % loss)

        stoch_dependencies_map = _stochastic_dependencies_map(
            sample_losses, stochastic_tensors=stochastic_tensors)
        if not stoch_dependencies_map:
            logging.warn(
                "No collection of Stochastic Tensors found for current graph.")
            return math_ops.add_n(sample_losses)

        # Iterate through all of the stochastic dependencies, adding
        # surrogate terms where necessary.
        sample_losses = [ops.convert_to_tensor(loss) for loss in sample_losses]
        loss_terms = sample_losses
        for (stoch_node, dependent_losses) in stoch_dependencies_map.items():
            dependent_losses = list(dependent_losses)

            logging.info("Losses influenced by StochasticTensor %s: [%s]",
                         stoch_node.name,
                         ", ".join([loss.name for loss in dependent_losses]))

            # Sum up the downstream losses for this ST
            influenced_loss = _add_n_or_sum(dependent_losses)

            # Compute surrogate loss term
            loss_term = stoch_node.loss(
                array_ops.stop_gradient(influenced_loss))
            if loss_term is not None:
                loss_terms.append(loss_term)

        return _add_n_or_sum(loss_terms)
コード例 #40
0
def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
                   resampling_temperature, partition_strategy):
    """A helper function for rank_sampled_softmax_loss.

  This computes, for each i in `sampled_values`,

      log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))

  where w_i, b_i are the weight and bias of the i-th class, respectively,
  and j ranges over the rows of `inputs`. For efficiency, we rearrange the
  computation to

      log(sum_j exp(w_i * (x_j / resampling_temperature))) +
          b_i / resampling_temperature.

  This translates to the following batched computation using tensorflow ops:

      reduce_logsumexp(matmul(embeddings,
                       transpose(inputs / resampling_temperature))) +
          biases / resampling_temperature

  The computation of the first term is colocated with the embeddings using
  `transform_fn` in `embedding_ops._embedding_lookup_and_transform`. The second
  term, not the bottleneck, is computed at the worker.

  Args:
    weights: From `rank_sampled_softmax_loss`.
    biases: From `rank_sampled_softmax_loss`.
    inputs: From `rank_sampled_softmax_loss`.
    sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
    num_resampled: An `int`. This many values are selected from
        `sampled_values` using the adaptive resampling algorithm. The caller
        must ensure that `num_resampled` is less than the size of
        `sampled_values`.
    resampling_temperature: A scalar `Tensor` with the temperature parameter
        for the adaptive resampling algorithm.
    partition_strategy: From `rank_sampled_softmax_loss`.

  Returns:
    A tuple of (`resampled_candidates`, `true_expected_count`,
        `resampled_expected_count`), similar to `sampled_values` but sampled
        down to `num_resampled` values.
  """
    # This code supports passing a Tensor for num_resampled, but since it is only
    # called with an int, that's what we specify in the arg list. If this
    # function is ever externalized, we should change the doc to support Tensor.

    sampled, true_expected_count, sampled_expected_count = sampled_values

    sampled = math_ops.cast(array_ops.stop_gradient(sampled), dtypes.int64)
    true_expected_count = array_ops.stop_gradient(true_expected_count)
    sampled_expected_count = array_ops.stop_gradient(sampled_expected_count)

    reweighted_inputs = inputs / resampling_temperature

    def logsumexp_logit(embeddings):
        return math_ops.reduce_logsumexp(math_ops.matmul(embeddings,
                                                         reweighted_inputs,
                                                         transpose_b=True),
                                         axis=1,
                                         keep_dims=False)

    # Calling this protected form of embedding_lookup allows co-locating
    # the logsumexp computation with the partitioned weights, which yields
    # a large speedup in practice.
    sampled_logits = embedding_ops._embedding_lookup_and_transform(  # pylint: disable=protected-access
        weights,
        sampled,
        partition_strategy,
        transform_fn=logsumexp_logit)
    sampled_b = array_ops.reshape(
        embedding_ops.embedding_lookup(biases, sampled, partition_strategy),
        [-1])
    sampled_logits += sampled_b / resampling_temperature

    _, resampled_indices = nn.top_k(sampled_logits,
                                    k=num_resampled,
                                    sorted=False)
    resampled = array_ops.gather(sampled, indices=resampled_indices)
    resampled_expected_count = array_ops.gather(sampled_expected_count,
                                                indices=resampled_indices)

    return resampled, true_expected_count, resampled_expected_count
コード例 #41
0
    def call(self, inputs, training=None, use_moving_statistics=True):
        """
        :param inputs: input features
        :param training: boolean or boolean Tensor (with shape []) which determines the current training phase
        :param use_moving_statistics: boolean or boolean Tensor (with shape []) which selects statistics to use
               when training==True (or the Tensor value) statistics (mean and variance) are from the inputs !
               when training==False, if use_moving_statistics==True -> feed forward with moving statistics (updated
                                        with operations defined in GraphKeys.UPDATE_OPS)
                                     else (use_moving_statistics==False -> feed forward with raw statistics (updated
                                        with operations from collections 'UPDATE_BN_OPS'
                                        'RESET_BN_OPS' contains operations to reset these vaiables between inferences.
        """
        in_eager_mode = context.executing_eagerly()
        if self.virtual_batch_size is not None:
            # Virtual batches (aka ghost batches) can be simulated by reshaping the
            # Tensor and reusing the existing batch norm implementation
            original_shape = [-1] + inputs.shape.as_list()[1:]
            expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

            # Will cause errors if virtual_batch_size does not divide the batch size
            inputs = array_ops.reshape(inputs, expanded_shape)

            def undo_virtual_batching(outputs):
                outputs = array_ops.reshape(outputs, original_shape)
                return outputs

        if self.fused:
            outputs = self._fused_batch_norm(
                inputs,
                training=training,
                use_moving_statistics=use_moving_statistics)
            if self.virtual_batch_size is not None:
                # Currently never reaches here since fused_batch_norm does not support
                # virtual batching
                outputs = undo_virtual_batching(outputs)
            return outputs

        # Compute the axes along which to reduce the mean / variance
        input_shape = inputs.get_shape()
        ndims = len(input_shape)
        reduction_axes = [i for i in range(ndims) if i not in self.axis]
        if self.virtual_batch_size is not None:
            del reduction_axes[1]  # Do not reduce along virtual batch dim

        # Broadcasting only necessary for single-axis batch norm where the axis is
        # not the last dimension
        broadcast_shape = [1] * ndims
        broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value

        def _broadcast(v):
            if (v is not None and len(v.get_shape()) != ndims
                    and reduction_axes != list(range(ndims - 1))):
                return array_ops.reshape(v, broadcast_shape)
            return v

        scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

        def _compose_transforms(scale, offset, then_scale, then_offset):
            if then_scale is not None:
                scale *= then_scale
                offset *= then_scale
            if then_offset is not None:
                offset += then_offset
            return (scale, offset)

        # Determine a boolean value for `training`: could be True, False, or None.
        training_value = tf_utils.constant_value(training)

        if training_value is not False:
            if self.adjustment:
                adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
                # Adjust only during training.
                adj_scale = tf_utils.smart_cond(
                    training, lambda: adj_scale,
                    lambda: array_ops.ones_like(adj_scale))
                adj_bias = tf_utils.smart_cond(
                    training, lambda: adj_bias,
                    lambda: array_ops.zeros_like(adj_bias))
                scale, offset = _compose_transforms(adj_scale, adj_bias, scale,
                                                    offset)

            # Some of the computations here are not necessary when training==False
            # but not a constant. However, this makes the code simpler.
            keep_dims = self.virtual_batch_size is not None or len(
                self.axis) > 1

            # mean and variance of the current batch
            mean, variance = nn.moments(inputs,
                                        reduction_axes,
                                        keep_dims=keep_dims)

            mean = tf_utils.smart_cond(
                training, lambda: mean,
                lambda: tf_utils.smart_cond(use_moving_statistics, lambda: self
                                            .moving_mean, lambda: self.mean))
            variance = tf_utils.smart_cond(
                training, lambda: variance, lambda: tf_utils.smart_cond(
                    use_moving_statistics, lambda: self.moving_variance,
                    lambda: self.variance))

            if self.renorm:
                r, d, new_mean, new_variance = self._renorm_correction_and_moments(
                    mean, variance, training)
                # When training, the normalized values (say, x) will be transformed as
                # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
                # = x * (r * gamma) + (d * gamma + beta) with renorm.
                r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
                d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
                scale, offset = _compose_transforms(r, d, scale, offset)
            else:
                new_mean, new_variance = mean, variance

            if self.virtual_batch_size is not None:
                # This isn't strictly correct since in ghost batch norm, you are
                # supposed to sequentially update the moving_mean and moving_variance
                # with each sub-batch. However, since the moving statistics are only
                # used during evaluation, it is more efficient to just update in one
                # step and should not make a significant difference in the result.
                new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
                new_variance = math_ops.reduce_mean(variance,
                                                    axis=1,
                                                    keepdims=True)

            def _do_update(var, value):
                if in_eager_mode and not self.trainable:
                    return
                return self._assign_moving_average(var, value, self.momentum)

            moving_mean_update = tf_utils.smart_cond(
                training, lambda: _do_update(self.moving_mean, new_mean),
                lambda: self.moving_mean)
            moving_variance_update = tf_utils.smart_cond(
                training,
                lambda: _do_update(self.moving_variance, new_variance),
                lambda: self.moving_variance)

            if not context.executing_eagerly():
                self.add_update(moving_mean_update, inputs=True)
                self.add_update(moving_variance_update, inputs=True)

            mean_update = self._update_statistics(self.mean, mean,
                                                  self.n_updates)
            variance_update = self._update_statistics(self.variance, variance,
                                                      self.n_updates)

            with ops.control_dependencies([mean_update, variance_update]):
                # update n_updates only after updating self.mean and self.variance
                update_n_updates = state_ops.assign_add(self.n_updates, 1.)
                ops.add_to_collection('UPDATE_BN_OPS', update_n_updates)

            reset_mean = state_ops.assign(self.mean,
                                          array_ops.zeros_like(self.mean))
            reset_variance = state_ops.assign(
                self.variance, array_ops.zeros_like(self.variance))
            reset_n_updates = state_ops.assign(self.n_updates, 0.)
            with ops.control_dependencies(
                [reset_mean, reset_variance, reset_n_updates]):
                reset_bn = gen_control_flow_ops.no_op("ResetBatchNormStats")
            ops.add_to_collection('RESET_OPS', reset_bn)

        else:
            # training == False
            mean = tf_utils.smart_cond(use_moving_statistics,
                                       lambda: self.moving_mean,
                                       lambda: self.mean)
            variance = tf_utils.smart_cond(use_moving_statistics,
                                           lambda: self.moving_variance,
                                           lambda: self.variance)

        mean = math_ops.cast(mean, inputs.dtype)
        variance = math_ops.cast(variance, inputs.dtype)
        if offset is not None:
            offset = math_ops.cast(offset, inputs.dtype)
        outputs = nn.batch_normalization(inputs, _broadcast(mean),
                                         _broadcast(variance), offset, scale,
                                         self.epsilon)
        # If some components of the shape got lost due to adjustments, fix that.
        outputs.set_shape(input_shape)

        if self.virtual_batch_size is not None:
            outputs = undo_virtual_batching(outputs)

        return outputs
コード例 #42
0
def dense_to_csr_sparse_matrix(dense):
    dense_t = ops.convert_to_tensor(dense)
    locs = array_ops.stop_gradient(array_ops.where(math_ops.abs(dense_t) > 0))
    return sparse_csr_matrix_ops.dense_to_csr_sparse_matrix(dense_t, locs)
コード例 #43
0
    def call(self, inputs, training=False):
        if self.fused:
            return self._fused_batch_norm(inputs, training=training)

        # First, compute the axes along which to reduce the mean / variance,
        # as well as the broadcast shape to be used for all parameters.
        input_shape = inputs.get_shape()
        ndim = len(input_shape)
        reduction_axes = list(range(len(input_shape)))
        del reduction_axes[self.axis]
        broadcast_shape = [1] * len(input_shape)
        broadcast_shape[self.axis] = input_shape[self.axis].value

        # Determines whether broadcasting is needed.
        needs_broadcasting = (sorted(reduction_axes) != list(range(ndim))[:-1])

        scale, offset = self.gamma, self.beta

        # Determine a boolean value for `training`: could be True, False, or None.
        training_value = utils.constant_value(training)
        if training_value is not False:
            # Some of the computations here are not necessary when training==False
            # but not a constant. However, this makes the code simpler.
            mean, variance = nn.moments(inputs, reduction_axes)
            mean = _smart_select(training, lambda: mean,
                                 lambda: self.moving_mean)
            variance = _smart_select(training, lambda: variance,
                                     lambda: self.moving_variance)

            if self.renorm:
                r, d, new_mean, new_variance = self._renorm_correction_and_moments(
                    mean, variance, training)
                # When training, the normalized values (say, x) will be transformed as
                # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
                # = x * (r * gamma) + (d * gamma + beta) with renorm.
                scale = array_ops.stop_gradient(r, name='renorm_r')
                offset = array_ops.stop_gradient(d, name='renorm_d')
                if self.gamma is not None:
                    scale *= self.gamma
                    offset *= self.gamma
                if self.beta is not None:
                    offset += self.beta
            else:
                new_mean, new_variance = mean, variance

            # Update moving averages when training, and prevent updates otherwise.
            decay = _smart_select(training, lambda: self.momentum, lambda: 1.)
            mean_update = moving_averages.assign_moving_average(
                self.moving_mean, new_mean, decay, zero_debias=False)
            variance_update = moving_averages.assign_moving_average(
                self.moving_variance, new_variance, decay, zero_debias=False)

            self.add_update(mean_update, inputs=inputs)
            self.add_update(variance_update, inputs=inputs)

        else:
            mean, variance = self.moving_mean, self.moving_variance

        def _broadcast(v):
            if needs_broadcasting and v is not None:
                # In this case we must explicitly broadcast all parameters.
                return array_ops.reshape(v, broadcast_shape)
            return v

        return nn.batch_normalization(inputs, _broadcast(mean),
                                      _broadcast(variance), _broadcast(offset),
                                      _broadcast(scale), self.epsilon)
コード例 #44
0
 def testStopGradient(self):
     with ops.Graph().as_default():
         inp = constant(1.0, shape=[100, 32], name="in")
         out = array_ops.stop_gradient(inp)
         igrad = gradients.gradients(out, inp)[0]
     assert igrad is None
コード例 #45
0
def _pairwise_comparison(sorted_labels,
                         sorted_logits,
                         sorted_weights,
                         lambda_weight=None):
    r"""Returns pairwise comparison `Tensor`s.

  Given a list of n items, the labels of graded relevance l_i and the logits
  s_i, we sort the items in a list based on s_i and obtain ranks r_i. We form
  n^2 pairs of items. For each pair, we have the following:

                        /
                        | 1   if l_i > l_j
  * `pairwise_labels` = |
                        | 0   if l_i <= l_j
                        \
  * `pairwise_logits` = s_i - s_j
                         /
                         | 0              if l_i <= l_j,
  * `pairwise_weights` = | |l_i - l_j|    if lambda_weight is None,
                         | lambda_weight  otherwise.
                         \

  The `sorted_weights` is item-wise and is applied non-symmetrically to update
  pairwise_weights as
    pairwise_weights(i, j) = w_i * pairwise_weights(i, j).
  This effectively applies to all pairs with l_i > l_j. Note that it is actually
  symmetric when `sorted_weights` are constant per list, i.e., listwise weights.

  Args:
    sorted_labels: A `Tensor` with shape [batch_size, list_size] of labels
      sorted.
    sorted_logits: A `Tensor` with shape [batch_size, list_size] of logits
      sorted.
    sorted_weights: A `Tensor` with shape [batch_size, list_size] of item-wise
      weights sorted.
    lambda_weight: A `_LambdaWeight` object.

  Returns:
    A tuple of (pairwise_labels, pairwise_logits, pairwise_weights) with each
    having the shape [batch_size, list_size, list_size].
  """
    # Compute the difference for all pairs in a list. The output is a Tensor with
    # shape [batch_size, list_size, list_size] where the entry [-1, i, j] stores
    # the information for pair (i, j).
    pairwise_label_diff = array_ops.expand_dims(
        sorted_labels, 2) - array_ops.expand_dims(sorted_labels, 1)
    pairwise_logits = array_ops.expand_dims(
        sorted_logits, 2) - array_ops.expand_dims(sorted_logits, 1)
    pairwise_labels = math_ops.to_float(
        math_ops.greater(pairwise_label_diff, 0))
    is_label_valid = utils.is_label_valid(sorted_labels)
    valid_pair = math_ops.logical_and(array_ops.expand_dims(is_label_valid, 2),
                                      array_ops.expand_dims(is_label_valid, 1))
    # Only keep the case when l_i > l_j.
    pairwise_weights = pairwise_labels * math_ops.to_float(valid_pair)
    # Apply the item-wise weights along l_i.
    pairwise_weights *= array_ops.expand_dims(sorted_weights, 2)
    if lambda_weight is not None:
        pairwise_weights *= lambda_weight.pair_weights(sorted_labels)
    else:
        pairwise_weights *= math_ops.abs(pairwise_label_diff)
    pairwise_weights = array_ops.stop_gradient(pairwise_weights,
                                               name='weights_stop_gradient')
    return pairwise_labels, pairwise_logits, pairwise_weights
コード例 #46
0
    def __init__(self, rnn_cell, seq_inputs, initial_state):
        assert initial_state is not None

        # TODO(drpng): Dtype needs to be configurable.
        input_dtypes = [seq_inputs.dtype
                        ] + _GetDTypesFromStructure(initial_state)
        # See _index.
        like_inputs_t = nest.map_structure(
            lambda x: array_ops.stop_gradient(array_ops.gather(x, 0)),
            seq_inputs)
        input_structure = (like_inputs_t, initial_state)

        @function.Defun(*input_dtypes)
        def FlatCellStep(*flat_inputs):
            """The flattened version of `rnn_cell`."""
            inputs_t, state0 = nest.pack_sequence_as(input_structure,
                                                     flat_inputs)
            _SetShapeFromTemplate(state0, initial_state)
            _SetShapeFromTemplate(inputs_t, like_inputs_t)
            outputs_t, state1 = rnn_cell(inputs_t, state0)
            state_list = nest.flatten(state1)
            self._output_shape = outputs_t.shape

            if outputs_t in state_list:
                output_index_in_state = state_list.index(outputs_t)
            else:
                output_index_in_state = None

            if output_index_in_state is None:
                self._prepend_output = True
                self._output_state_idx = 0
                return [outputs_t] + state_list
            else:
                self._output_state_idx = output_index_in_state
                self._prepend_output = False
                # To save memory, we don't store return the output separately
                # from the state list, since we know it's the same.
                return state_list

        def _ToPureFunction(func):
            # NOTE: This forces the creating of the function.
            if func.captured_inputs:
                pure_func = copy.copy(func)
                # pylint: disable=protected-access
                pure_func._extra_inputs = []
                return pure_func
            return func

        pure_flat_cell_step = _ToPureFunction(FlatCellStep)

        def CellStep(theta, extended_state0, inputs_t):
            """Performs one time steps on structured inputs.

      The purpose of this function is to turn the parameters into flattened
      versions, and to resolve the parameter order difference between
      `Recurrent` and `RNNCell`.

      In the event the cell returns a transformed output that is not aliased
      within its state, the `extended_state0` also contains the output as its
      first element.

      Args:
        theta: Weights required for the computation. A structure of tensors.
        extended_state0: the state0, and possibly the output at the previous
          time step. A structure of tensors.
        inputs_t: the inputs at time t.

      Returns:
        A pair of the next state (inclusive of the output), and an empty list
        (unused `extras`).
        The next state is congruent to state0.
      """
            extended_state0_flat = nest.flatten(extended_state0)
            state0_flat = self.MaybeRemoveOutputFromState(extended_state0_flat)
            full_inputs = [inputs_t] + state0_flat + theta
            # Note that the thetas are additional inputs appeneded as extra
            # parameters.
            cell_out = pure_flat_cell_step(*full_inputs)
            return cell_out, []

        self._cell_step = CellStep
        self._theta = FlatCellStep.captured_inputs
        self._zero_state = rnn_cell.zero_state
        self._state_template = initial_state
        self._output_size = rnn_cell.output_size
コード例 #47
0
 def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
     with ops.name_scope(name, "clip_by_value_preserve_grad",[x, clip_value_min, clip_value_max]):
         clip_x = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
         return x + array_ops.stop_gradient(clip_x - x)
コード例 #48
0
  def call(self, inputs, training=False):
    in_eager_mode = context.executing_eagerly()
    if self.virtual_batch_size is not None:
      # Virtual batches (aka ghost batches) can be simulated by reshaping the
      # Tensor and reusing the existing batch norm implementation
      original_shape = [-1] + inputs.shape.as_list()[1:]
      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

      # Will cause errors if virtual_batch_size does not divide the batch size
      inputs = array_ops.reshape(inputs, expanded_shape)

      def undo_virtual_batching(outputs):
        outputs = array_ops.reshape(outputs, original_shape)
        return outputs

    if self.fused:
      outputs = self._fused_batch_norm(inputs, training=training)
      if self.virtual_batch_size is not None:
        # Currently never reaches here since fused_batch_norm does not support
        # virtual batching
        return undo_virtual_batching(outputs)
      return outputs

    # Compute the axes along which to reduce the mean / variance
    input_shape = inputs.get_shape()
    ndims = len(input_shape)
    reduction_axes = [i for i in range(ndims) if i not in self.axis]
    if self.virtual_batch_size is not None:
      del reduction_axes[1]     # Do not reduce along virtual batch dim

    # Broadcasting only necessary for single-axis batch norm where the axis is
    # not the last dimension
    broadcast_shape = [1] * ndims
    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
    def _broadcast(v):
      if (v is not None and
          len(v.get_shape()) != ndims and
          reduction_axes != list(range(ndims - 1))):
        return array_ops.reshape(v, broadcast_shape)
      return v

    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

    def _compose_transforms(scale, offset, then_scale, then_offset):
      if then_scale is not None:
        scale *= then_scale
        offset *= then_scale
      if then_offset is not None:
        offset += then_offset
      return (scale, offset)

    # Determine a boolean value for `training`: could be True, False, or None.
    training_value = utils.constant_value(training)
    if training_value is not False:
      if self.adjustment:
        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
        # Adjust only during training.
        adj_scale = utils.smart_cond(training,
                                     lambda: adj_scale,
                                     lambda: array_ops.ones_like(adj_scale))
        adj_bias = utils.smart_cond(training,
                                    lambda: adj_bias,
                                    lambda: array_ops.zeros_like(adj_bias))
        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

      # Some of the computations here are not necessary when training==False
      # but not a constant. However, this makes the code simpler.
      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)

      moving_mean = self.moving_mean
      moving_variance = self.moving_variance

      mean = utils.smart_cond(training,
                              lambda: mean,
                              lambda: moving_mean)
      variance = utils.smart_cond(training,
                                  lambda: variance,
                                  lambda: moving_variance)

      if self.renorm:
        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
            mean, variance, training)
        # When training, the normalized values (say, x) will be transformed as
        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
        # = x * (r * gamma) + (d * gamma + beta) with renorm.
        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
        scale, offset = _compose_transforms(r, d, scale, offset)
      else:
        new_mean, new_variance = mean, variance

      if self.virtual_batch_size is not None:
        # This isn't strictly correct since in ghost batch norm, you are
        # supposed to sequentially update the moving_mean and moving_variance
        # with each sub-batch. However, since the moving statistics are only
        # used during evaluation, it is more efficient to just update in one
        # step and should not make a significant difference in the result.
        new_mean = math_ops.reduce_mean(new_mean,
                                        axis=1, keep_dims=True)
        new_variance = math_ops.reduce_mean(new_variance,
                                            axis=1, keep_dims=True)

      def _do_update(var, value):
        if in_eager_mode and not self.trainable:
          return

        return moving_averages.assign_moving_average(
            var, value, self.momentum, zero_debias=False)

      mean_update = utils.smart_cond(
          training,
          lambda: _do_update(self.moving_mean, new_mean),
          lambda: self.moving_mean)
      variance_update = utils.smart_cond(
          training,
          lambda: _do_update(self.moving_variance, new_variance),
          lambda: self.moving_variance)
      if not context.executing_eagerly():
        self.add_update(mean_update, inputs=inputs)
        self.add_update(variance_update, inputs=inputs)

    else:
      mean, variance = self.moving_mean, self.moving_variance

    outputs = nn.batch_normalization(inputs,
                                     _broadcast(mean),
                                     _broadcast(variance),
                                     offset,
                                     scale,
                                     self.epsilon)
    # If some components of the shape got lost due to adjustments, fix that.
    outputs.set_shape(input_shape)

    if self.virtual_batch_size is not None:
      return undo_virtual_batching(outputs)

    return outputs
コード例 #49
0
    def call(self, inputs, training=False):
        if self.num_virtual_batches > 1:
            # Virtual batches (aka ghost batches) can be simulated by using some
            # reshape/transpose tricks on top of base batch normalization.
            original_shape = [-1] + inputs.shape.as_list()[1:]
            expanded_shape = [-1, self.num_virtual_batches
                              ] + original_shape[1:]

            # Will cause errors if num_virtual_batches does not divide the batch size
            inputs = array_ops.reshape(inputs, expanded_shape)

            ndims = len(expanded_shape)
            if self.axis < 0:
                axis = ndims + self.axis
            else:
                axis = self.axis + 1  # Account for the added dimension

            # Permute the num_virtual_batch dimension (dim 1) to be adjacent to axis
            # TODO(b/66257056): when multi-axis batch normalization is implemented,
            # this permutation trick and the combined_dim reshape are no longer
            # necessary and can be reworked to simply use broadcasting.
            permutation = ([0] + list(range(2, axis)) + [1, axis] +
                           list(range(axis + 1, ndims)))
            inverse_permutation = [
                x[1] for x in sorted(zip(permutation, range(ndims)))
            ]
            inputs = array_ops.transpose(inputs, perm=permutation)

            # Combine the axis and num_virtual_batch dimension in order to take
            # advantage of fused batch normalization
            combined_dim = expanded_shape[1] * expanded_shape[axis]
            perm_shape = [-1] + inputs.shape.as_list()[1:]
            combined_shape = (perm_shape[:axis - 1] + [combined_dim] +
                              perm_shape[axis + 1:])
            inputs = array_ops.reshape(inputs, combined_shape)

            # After the above reshape, the batch norm axis is the original self.axis

            # Undoes the reshaping and transposing tricks done above
            def undo_virtual_batching(outputs):
                outputs = array_ops.reshape(outputs, perm_shape)
                outputs = array_ops.transpose(outputs,
                                              perm=inverse_permutation)
                outputs = array_ops.reshape(outputs, original_shape)
                return outputs

        if self.fused:
            outputs = self._fused_batch_norm(inputs, training=training)
            if self.num_virtual_batches > 1:
                return undo_virtual_batching(outputs)
            return outputs

        # First, compute the axes along which to reduce the mean / variance,
        # as well as the broadcast shape to be used for all parameters.
        input_shape = inputs.get_shape()
        ndim = len(input_shape)
        reduction_axes = list(range(len(input_shape)))
        del reduction_axes[self.axis]
        broadcast_shape = [1] * len(input_shape)
        broadcast_shape[self.axis] = input_shape[self.axis].value

        # Determines whether broadcasting is needed.
        needs_broadcasting = (sorted(reduction_axes) != list(range(ndim))[:-1])

        scale, offset = self.gamma, self.beta

        # Determine a boolean value for `training`: could be True, False, or None.
        training_value = utils.constant_value(training)
        if training_value is not False:
            # Some of the computations here are not necessary when training==False
            # but not a constant. However, this makes the code simpler.
            mean, variance = nn.moments(inputs, reduction_axes)
            mean = _smart_select(training, lambda: mean,
                                 lambda: self.moving_mean)
            variance = _smart_select(training, lambda: variance,
                                     lambda: self.moving_variance)

            if self.renorm:
                r, d, new_mean, new_variance = self._renorm_correction_and_moments(
                    mean, variance, training)
                # When training, the normalized values (say, x) will be transformed as
                # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
                # = x * (r * gamma) + (d * gamma + beta) with renorm.
                scale = array_ops.stop_gradient(r, name='renorm_r')
                offset = array_ops.stop_gradient(d, name='renorm_d')
                if self.gamma is not None:
                    scale *= self.gamma
                    offset *= self.gamma
                if self.beta is not None:
                    offset += self.beta
            else:
                new_mean, new_variance = mean, variance

            # Update moving averages when training, and prevent updates otherwise.
            decay = _smart_select(training, lambda: self.momentum, lambda: 1.)
            mean_update = moving_averages.assign_moving_average(
                self.moving_mean, new_mean, decay, zero_debias=False)
            variance_update = moving_averages.assign_moving_average(
                self.moving_variance, new_variance, decay, zero_debias=False)
            if context.in_graph_mode():
                self.add_update(mean_update, inputs=inputs)
                self.add_update(variance_update, inputs=inputs)

        else:
            mean, variance = self.moving_mean, self.moving_variance

        def _broadcast(v):
            if needs_broadcasting and v is not None:
                # In this case we must explicitly broadcast all parameters.
                return array_ops.reshape(v, broadcast_shape)
            return v

        outputs = nn.batch_normalization(inputs, _broadcast(mean),
                                         _broadcast(variance),
                                         _broadcast(offset), _broadcast(scale),
                                         self.epsilon)

        if self.num_virtual_batches > 1:
            return undo_virtual_batching(outputs)

        return outputs
コード例 #50
0
#   INIT
# *************************************
tf.global_variables_initializer().run()
# *************************************
#   NCE STAGING
# *************************************
noise_sampler = candidate_sampling_ops.uniform_candidate_sampler(
    true_classes=input_labels.tensor,
    num_true=1,
    num_sampled=nce_samples,
    unique=True,
    range_max=vocab_size,
    seed=None)

sampled, true_expected_count, sampled_expected_count = (
    array_ops.stop_gradient(s) for s in noise_sampler)

print("adaptive sample: ", tf.shape(noise_logits.tensor).eval())

print("[noise sample shape] {}".format(tf.shape(sampled).eval()))

labels_flat = array_ops.reshape(input_labels.tensor, [-1])

true_ris = tx.gather_sparse(sp_tensor=ri_tensor, ids=labels_flat)
noise_ris = tx.gather_sparse(sp_tensor=ri_tensor, ids=sampled)

print("----")
print("[true_ri shape] {}".format(tf.shape(true_ris).eval()))
print("[noise_ri shape] {}".format(tf.shape(noise_ris).eval()))
print("----")
コード例 #51
0
ファイル: loss_functions.py プロジェクト: wmpscc/tensorflow-1
 def evaluate_on_sample(self, seed=None):
     if seed is None:
         seed = self._default_seed
     # We treat the targets as "constant".  It's only the inputs that get
     # "back-propped" through.
     return self._evaluate(array_ops.stop_gradient(self.sample(seed)))
コード例 #52
0
 def testStopGradient(self):
   input_ = np.random.rand(4, 7)
   tf_val = array_ops.stop_gradient(input_)
   c_val = tensor_util.constant_value(tf_val)
   self.assertAllEqual(input_, c_val)
コード例 #53
0
 def embedFunc(y):
     emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
     if not update_embedding:
         emb_prev = array_ops.stop_gradient(emb_prev)
     return emb_prev
コード例 #54
0
ファイル: losses_impl.py プロジェクト: hukim1112/lab1
def combine_adversarial_loss(main_loss,
                             adversarial_loss,
                             weight_factor=None,
                             gradient_ratio=None,
                             gradient_ratio_epsilon=1e-6,
                             variables=None,
                             scalar_summaries=True,
                             gradient_summaries=True,
                             scope=None):
    """Utility to combine main and adversarial losses.

  This utility combines the main and adversarial losses in one of two ways.
  1) Fixed coefficient on adversarial loss. Use `weight_factor` in this case.
  2) Fixed ratio of gradients. Use `gradient_ratio` in this case. This is often
    used to make sure both losses affect weights roughly equally, as in
    https://arxiv.org/pdf/1705.05823.

  One can optionally also visualize the scalar and gradient behavior of the
  losses.

  Args:
    main_loss: A floating scalar Tensor indicating the main loss.
    adversarial_loss: A floating scalar Tensor indication the adversarial loss.
    weight_factor: If not `None`, the coefficient by which to multiply the
      adversarial loss. Exactly one of this and `gradient_ratio` must be
      non-None.
    gradient_ratio: If not `None`, the ratio of the magnitude of the gradients.
      Specifically,
        gradient_ratio = grad_mag(main_loss) / grad_mag(adversarial_loss)
      Exactly one of this and `weight_factor` must be non-None.
    gradient_ratio_epsilon: An epsilon to add to the adversarial loss
      coefficient denominator, to avoid division-by-zero.
    variables: List of variables to calculate gradients with respect to. If not
      present, defaults to all trainable variables.
    scalar_summaries: Create scalar summaries of losses.
    gradient_summaries: Create gradient summaries of losses.
    scope: Optional name scope.

  Returns:
    A floating scalar Tensor indicating the desired combined loss.

  Raises:
    ValueError: Malformed input.
  """
    _validate_args([main_loss, adversarial_loss], weight_factor,
                   gradient_ratio)
    if variables is None:
        variables = contrib_variables_lib.get_trainable_variables()

    with ops.name_scope(scope,
                        'adversarial_loss',
                        values=[main_loss, adversarial_loss]):
        # Compute gradients if we will need them.
        if gradient_summaries or gradient_ratio is not None:
            main_loss_grad_mag = _numerically_stable_global_norm(
                gradients_impl.gradients(main_loss, variables))
            adv_loss_grad_mag = _numerically_stable_global_norm(
                gradients_impl.gradients(adversarial_loss, variables))

        # Add summaries, if applicable.
        if scalar_summaries:
            summary.scalar('main_loss', main_loss)
            summary.scalar('adversarial_loss', adversarial_loss)
        if gradient_summaries:
            summary.scalar('main_loss_gradients', main_loss_grad_mag)
            summary.scalar('adversarial_loss_gradients', adv_loss_grad_mag)

        # Combine losses in the appropriate way.
        # If `weight_factor` is always `0`, avoid computing the adversarial loss
        # tensor entirely.
        if _used_weight((weight_factor, gradient_ratio)) == 0:
            final_loss = main_loss
        elif weight_factor is not None:
            final_loss = (
                main_loss +
                array_ops.stop_gradient(weight_factor) * adversarial_loss)
        elif gradient_ratio is not None:
            grad_mag_ratio = main_loss_grad_mag / (adv_loss_grad_mag +
                                                   gradient_ratio_epsilon)
            adv_coeff = grad_mag_ratio / gradient_ratio
            summary.scalar('adversarial_coefficient', adv_coeff)
            final_loss = (
                main_loss +
                array_ops.stop_gradient(adv_coeff) * adversarial_loss)

    return final_loss