def testNegativeBinomialSample(self):
    """Checks sample support, mean and variance against `scipy.stats.nbinom`.

    Draws `n` samples per batch member, verifies the static sample shape, that
    no sample is negative, and that the empirical mean and variance are within
    2% (relative) of the values reported by scipy.
    """
    with self.cached_session() as sess:
      probs = [.3, .9]
      total_count = [4., 11.]
      n = int(100e3)
      negbinom = negative_binomial.NegativeBinomial(
          total_count=total_count, probs=probs)

      samples = negbinom.sample(n, seed=12345)
      self.assertEqual([n, 2], samples.get_shape())

      sample_mean = math_ops.reduce_mean(samples, axis=0)
      sample_var = math_ops.reduce_mean(
          (samples - sample_mean[array_ops.newaxis, ...])**2., axis=0)
      sample_min = math_ops.reduce_min(samples)
      [sample_mean_, sample_var_, sample_min_] = sess.run([
          sample_mean, sample_var, sample_min])
      # Use the builtin `bool`: the `np.bool` alias was deprecated in
      # NumPy 1.20 and removed in NumPy 1.24.
      self.assertAllEqual(np.ones(sample_min_.shape, dtype=bool),
                          sample_min_ >= 0.0)
      # scipy's `nbinom` is called with `1 - prob`, the complement of the
      # `probs` value handed to the TensorFlow distribution.
      for i, (count, prob) in enumerate(zip(total_count, probs)):
        self.assertAllClose(sample_mean_[i],
                            stats.nbinom.mean(count, 1 - prob),
                            atol=0.,
                            rtol=.02)
        self.assertAllClose(sample_var_[i],
                            stats.nbinom.var(count, 1 - prob),
                            atol=0.,
                            rtol=.02)
def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
  """Approximates the sliced Wasserstein distance between two sample sets.

  For every random draw, both inputs are projected with the same random
  matrix whose columns are scaled to unit L2 norm, the projections are passed
  through `_sort_rows`, and the mean absolute difference is recorded. The
  result is the mean over all draws.

  Args:
      a: (matrix) Samples of distribution "a", laid out as (row, col).
      b: (matrix) Samples of distribution "b", laid out as (row, col).
      random_sampling_count: (int) How many random projections are averaged.
      random_projection_dim: (int) Number of columns of each projection matrix.
  Returns:
      Scalar tensor with the approximate distance between "a" and "b".
  """
  shape_a = array_ops.shape(a)
  per_draw_distances = []
  for _ in range(random_sampling_count):
    # Draw a random projection and normalise each of its columns.
    projection = random_ops.random_normal(
        [array_ops.shape(a)[1], random_projection_dim])
    column_sq_norms = math_ops.reduce_sum(
        math_ops.square(projection), 0, keepdims=True)
    projection = projection * math_ops.rsqrt(column_sq_norms)
    # Project both sample sets, then sort them. The row count of `a` is used
    # for both inputs.
    projected_a = math_ops.matmul(a, projection)
    projected_b = math_ops.matmul(b, projection)
    projected_a = _sort_rows(projected_a, shape_a[0])
    projected_b = _sort_rows(projected_b, shape_a[0])
    # Mean absolute difference between the sorted projections.
    per_draw_distances.append(
        math_ops.reduce_mean(math_ops.abs(projected_a - projected_b)))
  return math_ops.reduce_mean(per_draw_distances)
def _potential_scale_reduction_single_state(state, independent_chain_ndims):
  """Computes potential_scale_reduction for a single state `Tensor`.

  Args:
    state: Value convertible to a `Tensor`. The single leading dimension is
      treated as the sample axis and the next `independent_chain_ndims`
      dimensions as the chain axes.
    independent_chain_ndims: Number of dimensions, following the sample
      dimension, that index independent chains.

  Returns:
    `Tensor` holding `((m + 1) / m) * sigma^2_+ / W - (n - 1) / (m * n)`, where
    `n` and `m` are the sizes of the sample and chain axes.
  """
  # Exactly one leading dimension is assumed to index the (possibly
  # correlated) samples drawn from each Markov chain.
  state = ops.convert_to_tensor(state, name="state")
  sample_ndims = 1

  sample_axis = math_ops.range(0, sample_ndims)
  chain_axis = math_ops.range(
      sample_ndims, sample_ndims + independent_chain_ndims)
  sample_and_chain_axis = math_ops.range(
      0, sample_ndims + independent_chain_ndims)

  num_samples = _axis_size(state, sample_axis)
  num_chains = _axis_size(state, chain_axis)

  # In the language of [2],
  # B / n is the between chain variance, the variance of the chain means.
  # W is the within sequence variance, the mean of the chain variances.
  chain_means = math_ops.reduce_mean(state, sample_axis, keepdims=True)
  between_chain_var = _reduce_variance(
      chain_means, sample_and_chain_axis, biased=False)
  per_chain_var = _reduce_variance(
      state, sample_axis, keepdims=True, biased=True)
  within_chain_var = math_ops.reduce_mean(per_chain_var, sample_and_chain_axis)

  # sigma^2_+ estimates the true variance, and would be unbiased if every
  # chain were drawn from the target.  c.f. "law of total variance."
  total_var = within_chain_var + between_chain_var

  scaled_ratio = (
      ((num_chains + 1.) / num_chains) * total_var / within_chain_var)
  correction = (num_samples - 1.) / (num_chains * num_samples)
  return scaled_ratio - correction
def _statistics(x, axes):
  """Computes the mean and the mean of squares of `x` over `axes`.

  Modified from the implementation of `tf.nn.moments`.

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which the statistics are computed.

  Returns:
    Two `Tensor` objects: `mean` and `square mean`, with `axes` squeezed out.
    For float16 input both are cast back to float16.
  """
  # fp16 has too little dynamic range to accumulate these statistics, so
  # float16 input is processed in float32 and the results are cast back at
  # the end.
  is_fp16 = x.dtype == dtypes.float16
  y = math_ops.cast(x, dtypes.float32) if is_fp16 else x

  # The true mean, kept with its reduced dims so that it broadcasts against y.
  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keepdims=True))

  mean = math_ops.reduce_mean(y - shift, axes, keepdims=True) + shift
  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keepdims=True)

  mean = array_ops.squeeze(mean, axes)
  mean_squared = array_ops.squeeze(mean_squared, axes)
  if not is_fp16:
    return (mean, mean_squared)
  return (math_ops.cast(mean, dtypes.float16),
          math_ops.cast(mean_squared, dtypes.float16))
 def center_bias(self, center_bias_var, gradients, hessians):
   """Centers the bias from the batch means of `gradients` and `hessians`.

   Both inputs are averaged over axis 0 (keeping a leading dimension of size
   one) and handed to `self._center_bias_fn` together with `center_bias_var`.
   """
   # In the in-memory case the full batch of gradients and hessians is already
   # available, so a plain mean is all that is needed before centering.
   mean_gradients = array_ops.expand_dims(
       math_ops.reduce_mean(gradients, 0), 0)
   mean_hessians = array_ops.expand_dims(
       math_ops.reduce_mean(hessians, 0), 0)
   return self._center_bias_fn(center_bias_var, mean_gradients, mean_hessians)
  def test_docstring_example(self):
    """Integrates a monomial over the unit cube with Halton samples."""
    # Use the first 1000 members of the 3-dimensional Halton sequence.
    num_results = 1000
    dim = 3
    with self.test_session():
      points = halton.sample(dim, num_results=num_results, randomized=False)

      # Estimate the integral of x_1 * x_2^2 * x_3^3 over the three
      # dimensional hypercube.
      powers = math_ops.range(1.0, limit=dim + 1)
      estimate = math_ops.reduce_mean(
          math_ops.reduce_prod(points ** powers, axis=-1))
      exact = 1.0 / math_ops.reduce_prod(powers + 1.0)

      # The original example reports a relative error of about 1.7% here.
      self.assertAllClose(estimate.eval(), exact.eval(), rtol=0.02)

      # Skip the first 1000 points and redo the estimate with the following
      # thousand, selected through the `sequence_indices` argument.
      leaped_indices = math_ops.range(
          start=1000, limit=1000 + num_results, dtype=dtypes.int32)
      leaped_points = halton.sample(
          dim, sequence_indices=leaped_indices, randomized=False)

      leaped_estimate = math_ops.reduce_mean(
          math_ops.reduce_prod(leaped_points ** powers, axis=-1))
      self.assertAllClose(leaped_estimate.eval(), exact.eval(), rtol=0.05)
def npairs_loss(labels, embeddings_anchor, embeddings_positive,
                reg_lambda=0.002, print_losses=False):
  """Computes the npairs loss.

  Npairs loss expects paired data: each pair consists of samples sharing a
  label, and different pairs in the minibatch carry different labels. The
  loss is the sum of two terms. The first is an L2 regularizer on the
  embedding vectors. The second is the mean softmax cross entropy that uses
  each row of the anchor/positive similarity matrix as logits and the
  row-normalized label-equality matrix as targets.

  See: http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` of shape [batch_size/2].
    embeddings_anchor: 2-D Tensor of shape [batch_size/2, embedding_dim] with
      the embedding vectors of the anchor images. Embeddings should not be
      l2 normalized.
    embeddings_positive: 2-D Tensor of shape [batch_size/2, embedding_dim] with
      the embedding vectors of the positive images. Embeddings should not be
      l2 normalized.
    reg_lambda: Float. L2 regularization term on the embedding vectors.
    print_losses: Boolean. Whether to print the xent and l2loss values.

  Returns:
    npairs_loss: tf.float32 scalar.
  """
  # pylint: enable=line-too-long
  # Regularizer: mean squared L2 norm of each embedding set.
  anchor_sq_norm = math_ops.reduce_mean(
      math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1))
  positive_sq_norm = math_ops.reduce_mean(
      math_ops.reduce_sum(math_ops.square(embeddings_positive), 1))
  l2loss = math_ops.multiply(
      0.25 * reg_lambda, anchor_sq_norm + positive_sq_norm, name='l2loss')

  # Similarity of every anchor with every positive.
  similarity_matrix = math_ops.matmul(
      embeddings_anchor, embeddings_positive, transpose_a=False,
      transpose_b=True)

  # Turn the 1-D label tensor into a column vector.
  lshape = array_ops.shape(labels)
  assert lshape.shape == 1
  labels = array_ops.reshape(labels, [lshape[0], 1])

  # Entry (i, j) is non-zero iff labels i and j match; every row sums to one.
  label_matches = math_ops.to_float(
      math_ops.equal(labels, array_ops.transpose(labels)))
  labels_remapped = label_matches / math_ops.reduce_sum(
      label_matches, 1, keepdims=True)

  # Softmax cross entropy term.
  per_row_xent = nn.softmax_cross_entropy_with_logits(
      logits=similarity_matrix, labels=labels_remapped)
  xent_loss = math_ops.reduce_mean(per_row_xent, name='xentropy')

  if print_losses:
    xent_loss = logging_ops.Print(
        xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

  return l2loss + xent_loss
  def training_loss(self, logits, target, features, name="training_loss"):
    """Returns training loss tensor for this head.

    The training loss differs from the loss reported on tensorboard because
    the example weights have to be respected when computing the gradient.

      L = sum_{i} w_{i} * l_{i} / B

    where B is the number of examples in the batch, and l_{i}, w_{i} are the
    individual losses and example weights.

    Args:
      logits: logits, a float tensor.
      target: either a tensor of labels or, in the multihead case, a dict
        mapping head name to target tensor.
      features: features dict.
      name: Op name.

    Returns:
      Loss tensor.
    """
    if isinstance(target, dict):
      target = target[self.name]
    loss = self._loss_fn(logits, target)

    weight_tensor = self.get_weight_tensor(features)
    if weight_tensor is not None:
      loss = self._weighted_loss(loss, weight_tensor)
    return math_ops.reduce_mean(loss, name=name)
 def loss_wrapper(labels, logits, weight_tensor):
   """Returns `(mean loss, mean loss / mean weight)` computed via `loss_fn`.

   When `weight_tensor` is None, float32 unit weights of shape
   `[shape(labels)[0], 1]` are used instead.
   """
   if weight_tensor is None:
     num_rows = array_ops.shape(labels)[0]
     weight_tensor = array_ops.ones(
         shape=[num_rows, 1], dtype=dtypes.float32)
   # Note the argument order expected by `loss_fn`: weights come before logits.
   per_example_loss, _ = loss_fn(labels, weight_tensor, logits)
   mean_loss = math_ops.reduce_mean(per_example_loss)
   return mean_loss, mean_loss / math_ops.reduce_mean(weight_tensor)
  def testSampleConsistentStats(self):
    """Compares Monte-Carlo statistics of `Independent` with analytic ones."""
    loc = np.float32([[-1., 1], [1, -1]])
    scale = np.float32([1., 0.5])
    num_samples = int(1e4)
    with self.test_session() as sess:
      ind = independent_lib.Independent(
          distribution=mvn_diag_lib.MultivariateNormalDiag(
              loc=loc,
              scale_identity_multiplier=scale),
          reduce_batch_ndims=1)

      draws = ind.sample(num_samples, seed=42)
      mc_mean = math_ops.reduce_mean(draws, axis=0)
      mc_var = math_ops.reduce_mean(
          math_ops.squared_difference(draws, mc_mean), axis=0)
      mc_std = math_ops.sqrt(mc_var)
      mc_entropy = -math_ops.reduce_mean(ind.log_prob(draws), axis=0)

      # Everything is fetched in a single run so that all sample statistics
      # come from the same draw.
      (mc_mean_, mc_var_, mc_std_, mc_entropy_,
       true_mean_, true_var_, true_std_, true_entropy_,
       true_mode_) = sess.run([
           mc_mean, mc_var, mc_std, mc_entropy,
           ind.mean(), ind.variance(), ind.stddev(), ind.entropy(),
           ind.mode(),
       ])

      self.assertAllClose(mc_mean_, true_mean_, rtol=0.02, atol=0.)
      self.assertAllClose(mc_var_, true_var_, rtol=0.04, atol=0.)
      self.assertAllClose(mc_std_, true_std_, rtol=0.02, atol=0.)
      self.assertAllClose(mc_entropy_, true_entropy_, rtol=0.01, atol=0.)
      self.assertAllClose(loc, true_mode_, rtol=1e-6, atol=0.)
 def testCovarianceFromSampling(self):
   """Compares sampled Dirichlet moments with the analytic ones."""
   alpha = np.array([[1., 2, 3],
                     [2.5, 4, 0.01]], dtype=np.float32)
   with self.test_session() as sess:
     dist = dirichlet_lib.Dirichlet(alpha)  # batch_shape=[2], event_shape=[3]
     draws = dist.sample(int(250e3), seed=1)
     mc_mean = math_ops.reduce_mean(draws, 0)
     centered = draws - mc_mean[None, ...]
     # Outer product of every centered draw with itself, averaged over draws.
     outer_products = math_ops.matmul(
         centered[..., None], centered[..., None, :])
     mc_cov = math_ops.reduce_mean(outer_products, 0)
     mc_var = array_ops.matrix_diag_part(mc_cov)
     mc_stddev = math_ops.sqrt(mc_var)
     (mc_mean_, mc_cov_, mc_var_, mc_stddev_,
      analytic_mean, analytic_cov, analytic_var, analytic_stddev) = sess.run([
          mc_mean,
          mc_cov,
          mc_var,
          mc_stddev,
          dist.mean(),
          dist.covariance(),
          dist.variance(),
          dist.stddev(),
      ])
     self.assertAllClose(mc_mean_, analytic_mean, atol=0., rtol=0.04)
     self.assertAllClose(mc_cov_, analytic_cov, atol=0., rtol=0.06)
     self.assertAllClose(mc_var_, analytic_var, atol=0., rtol=0.03)
     self.assertAllClose(mc_stddev_, analytic_stddev, atol=0., rtol=0.02)
def _reduce_variance(x, axis=None, biased=True, keepdims=False):
  """Reduces `x` to its variance along `axis`.

  Args:
    x: Value convertible to a `Tensor`.
    axis: Axes to reduce over, forwarded to `math_ops.reduce_mean`.
    biased: If True, return the mean squared deviation from the mean. If
      False, rescale that value by `n / (n - 1)`, with `n` obtained from
      `_axis_size(x, axis)`.
    keepdims: Whether the reduced dimensions are kept in the result.

  Returns:
    `Tensor` holding the requested variance.
  """
  with ops.name_scope("reduce_variance"):
    x = ops.convert_to_tensor(x, name="x")
    # The mean keeps its dims so it broadcasts against `x`.
    squared_deviation = math_ops.squared_difference(
        x, math_ops.reduce_mean(x, axis=axis, keepdims=True))
    variance = math_ops.reduce_mean(
        squared_deviation, axis=axis, keepdims=keepdims)
    if not biased:
      n = _axis_size(x, axis)
      variance = (n / (n - 1.)) * variance
    return variance
def mean_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model from activations.

  Given two Gaussian distributions with means m and m_w and covariance
  matrices C and C_w, this function calculates

                                |m - m_w|^2

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute frechet classifier distance when comparing two
  generative models.

  In this variant, only the difference between the means of the fitted
  Gaussians is computed. The computation leads to O(n) vs. O(n^2) memory
  usage, yet still retains much of the same information as FID.

  Args:
    real_activations: 2D array of activations of real images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.
    generated_activations: 2D array of activations of generated images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.

  Returns:
    The mean-only Frechet Inception distance. A floating-point scalar of the
    same type as `real_activations`.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  # All arithmetic is done in float64; the result is cast back at the end.
  activations_dtype = real_activations.dtype
  needs_cast = activations_dtype != dtypes.float64
  if needs_cast:
    real_activations = math_ops.cast(real_activations, dtypes.float64)
    generated_activations = math_ops.cast(generated_activations, dtypes.float64)

  # Per-dimension means over the image axis.
  real_mean = math_ops.reduce_mean(real_activations, 0)
  generated_mean = math_ops.reduce_mean(generated_activations, 0)

  # Sum of squared differences, i.e. the squared L2 distance of the means.
  mofid = math_ops.reduce_sum(
      math_ops.squared_difference(real_mean, generated_mean))
  if needs_cast:
    mofid = math_ops.cast(mofid, activations_dtype)

  return mofid
Beispiel #14
0
def _loss(loss_unweighted, weight, name):
  """Returns a `(loss, weighted_average_loss)` tuple.

  Without a weight both entries are the mean of `loss_unweighted`. With a
  weight, `loss` is the mean of the weighted loss and `weighted_average_loss`
  is the sum of the weighted loss divided by the sum of the weights.
  """
  if weight is None:
    mean_loss = math_ops.reduce_mean(loss_unweighted, name=name)
    return mean_loss, mean_loss
  loss_weighted = _weighted_loss(loss_unweighted, weight)
  total_weighted_loss = math_ops.reduce_sum(loss_weighted)
  total_weight = math_ops.to_float(math_ops.reduce_sum(weight))
  weighted_average_loss = math_ops.div(
      total_weighted_loss, total_weight, name="weighted_average_loss")
  return math_ops.reduce_mean(loss_weighted, name=name), weighted_average_loss
def classifier_score(images, classifier_fn, num_batches=1):
  """Classifier score for evaluating a conditional generative model.

  This is based on the Inception Score, but for an arbitrary classifier.

  This technique is described in detail in https://arxiv.org/abs/1606.03498. In
  summary, this function calculates

  exp( E[ KL(p(y|x) || p(y)) ] )

  which captures how different the network's classification prediction is from
  the prior distribution over classes.

  Args:
    images: Images to calculate the classifier score for.
    classifier_fn: A function that takes images and produces logits based on a
      classifier.
    num_batches: Number of batches to split `images` in to in order to
      efficiently run them through the classifier network.

  Returns:
    The classifier score. A floating-point scalar of the same type as the
    logits produced by `classifier_fn`.
  """
  generated_images_list = array_ops.split(
      images, num_or_size_splits=num_batches)

  # Compute the classifier splits using the memory-efficient `map_fn`.
  logits = functional_ops.map_fn(
      fn=classifier_fn,
      elems=array_ops.stack(generated_images_list),
      parallel_iterations=1,
      back_prop=False,
      swap_memory=True,
      name='RunClassifier')
  logits = array_ops.concat(array_ops.unstack(logits), 0)
  logits.shape.assert_has_rank(2)

  # Use maximum precision for best results.
  logits_dtype = logits.dtype
  if logits_dtype != dtypes.float64:
    logits = math_ops.cast(logits, dtypes.float64)

  p = nn_ops.softmax(logits)
  q = math_ops.reduce_mean(p, axis=0)
  kl = _kl_divergence(p, logits, q)
  kl.shape.assert_has_rank(1)
  log_score = math_ops.reduce_mean(kl)
  final_score = math_ops.exp(log_score)

  # Cast back to the classifier's own dtype. The score is already float64 at
  # this point, so casting to float64 here would be a no-op.
  if logits_dtype != dtypes.float64:
    final_score = math_ops.cast(final_score, logits_dtype)
  return final_score
Beispiel #16
0
def _loss(loss_unweighted, weight, name):
  """Returns a tuple of (loss, weighted_average_loss).

  All ops are created inside a name scope derived from `name`. Without a
  weight both entries are the mean of `loss_unweighted`. With a weight,
  `loss` is the mean of the weighted loss and `weighted_average_loss` is the
  sum of the weighted loss divided by the sum of the weights.
  """
  with ops.name_scope(name, values=(loss_unweighted, weight)) as name_scope:
    if weight is None:
      mean_loss = math_ops.reduce_mean(loss_unweighted, name=name_scope)
      return mean_loss, mean_loss
    loss_weighted = _weighted_loss(loss_unweighted, weight)
    total_weighted_loss = math_ops.reduce_sum(loss_weighted)
    total_weight = math_ops.to_float(math_ops.reduce_sum(weight))
    weighted_average_loss = math_ops.div(
        total_weighted_loss, total_weight, name="weighted_average_loss")
    mean_loss = math_ops.reduce_mean(loss_weighted, name=name_scope)
    return mean_loss, weighted_average_loss
  def testSample(self):
    """Checks Wishart sampling across parameterizations, plus two moments."""
    with self.test_session():
      scale = make_pd(1., 2)
      df = 4

      def assert_positive_diagonal(dist):
        # The sampled matrices must have a strictly positive diagonal.
        eigen_values = array_ops.matrix_diag_part(dist.sample(1000, seed=42))
        np.testing.assert_array_less(0., eigen_values.eval())

      wishart_chol = distributions.WishartCholesky(
          df, chol(scale), cholesky_input_output_matrices=False)

      draw = wishart_chol.sample(1, seed=42).eval()
      draw_chol = [chol(draw[0])]

      # The full-scale parameterization must give the same draw for the same
      # seed.
      wishart_full = distributions.WishartFull(
          df, scale, cholesky_input_output_matrices=False)
      self.assertAllClose(draw, wishart_full.sample(1, seed=42).eval())

      wishart_chol_out_chol = distributions.WishartCholesky(
          df, chol(scale), cholesky_input_output_matrices=True)
      self.assertAllClose(
          draw_chol, wishart_chol_out_chol.sample(1, seed=42).eval())
      assert_positive_diagonal(wishart_chol_out_chol)

      wishart_full_out_chol = distributions.WishartFull(
          df, scale, cholesky_input_output_matrices=True)
      self.assertAllClose(
          draw_chol, wishart_full_out_chol.sample(1, seed=42).eval())
      assert_positive_diagonal(wishart_full_out_chol)

      # Check first and second moments.
      df = 4.
      wishart_chol = distributions.WishartCholesky(
          df=df,
          scale=chol(make_pd(1., 3)),
          cholesky_input_output_matrices=False)
      draws = wishart_chol.sample(10000, seed=42)
      self.assertAllEqual((10000, 3, 3), draws.get_shape())

      moment1_estimate = math_ops.reduce_mean(
          draws, reduction_indices=[0]).eval()
      self.assertAllClose(
          wishart_chol.mean().eval(), moment1_estimate, rtol=0.05)

      # The variance estimate is built from squares rather than outer
      # products because Wishart.Variance is the diagonal of the Wishart
      # covariance matrix.
      moment2_estimate = math_ops.reduce_mean(
          math_ops.square(draws), reduction_indices=[0])
      variance_estimate = (
          moment2_estimate - math_ops.square(moment1_estimate)).eval()
      self.assertAllClose(
          wishart_chol.variance().eval(), variance_estimate, rtol=0.05)
Beispiel #18
0
  def run_test_sample_consistent_mean_variance(
      self, sess_run_fn, dist,
      num_samples=int(1e5), seed=24,
      rtol=1e-2, atol=0.):
    """Tests that sample/mean/variance are consistent with each other.

    "Consistency" means that `sample`, `mean`, `variance`, etc all describe
    the same distribution.

    Args:
      sess_run_fn: Python `callable` taking `list`-like of `Tensor`s and
        returning a list of results after running one "step" of TensorFlow
        computation, typically set to `sess.run`.
      dist: Distribution instance or object which implements `sample`,
        `mean`, `variance` and `stddev`.
      num_samples: Python `int` scalar indicating the number of Monte-Carlo
        samples to draw from `dist`.
      seed: Python `int` indicating the seed to use when sampling from `dist`.
        In general it is not recommended to use `None` during a test as this
        increases the likelihood of spurious test failure.
      rtol: Python `float`-type indicating the admissible relative error between
        analytical and sample statistics.
      atol: Python `float`-type indicating the admissible absolute error between
        analytical and sample statistics.
    """
    draws = math_ops.cast(dist.sample(num_samples, seed=seed), dtypes.float32)
    mc_mean = math_ops.reduce_mean(draws, axis=0)
    mc_variance = math_ops.reduce_mean(
        math_ops.square(draws - mc_mean), axis=0)
    mc_stddev = math_ops.sqrt(mc_variance)

    # One call fetches the sample statistics and the analytic ones together.
    (mc_mean_, mc_variance_, mc_stddev_,
     mean_, variance_, stddev_) = sess_run_fn([
         mc_mean,
         mc_variance,
         mc_stddev,
         dist.mean(),
         dist.variance(),
         dist.stddev(),
     ])

    self.assertAllClose(mean_, mc_mean_, rtol=rtol, atol=atol)
    self.assertAllClose(variance_, mc_variance_, rtol=rtol, atol=atol)
    self.assertAllClose(stddev_, mc_stddev_, rtol=rtol, atol=atol)
 def testSampleUnbiasedScalarBatch(self):
   """Compares sampled mean/covariance of a scalar-batch distribution."""
   with self.test_session() as sess:
     concentration = 1. + 2. * self._rng.rand(4).astype(np.float32)
     dist = ds.DirichletMultinomial(
         total_count=5., concentration=concentration)
     n = int(5e3)
     draws = dist.sample(n, seed=0)
     mc_mean = math_ops.reduce_mean(draws, 0)
     # Draws are laid out as [n, 4], so no transpose is needed here.
     centered = draws - mc_mean
     mc_covariance = math_ops.matmul(centered, centered, adjoint_a=True) / n
     (mc_mean_, mc_covariance_,
      actual_mean_, actual_covariance_) = sess.run([
          mc_mean,
          mc_covariance,
          dist.mean(),
          dist.covariance(),
      ])
     self.assertAllEqual([4], mc_mean.get_shape())
     self.assertAllClose(actual_mean_, mc_mean_, atol=0., rtol=0.05)
     self.assertAllEqual([4, 4], mc_covariance.get_shape())
     self.assertAllClose(
         actual_covariance_, mc_covariance_, atol=0., rtol=0.15)
Beispiel #20
0
  def testReuse(self):
    """Checks that rebuilding a rev_block with reuse adds no variables."""

    def f(x):
      return core_layers.dense(x, self.CHANNELS // 2)

    def g(x):
      return core_layers.dense(x, self.CHANNELS // 2)

    inputs = random_ops.random_uniform(
        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
    x1, x2 = array_ops.split(inputs, 2, axis=-1)

    def build_block(reuse):
      # Builds the block inside the shared "test" variable scope.
      with variable_scope.variable_scope("test", reuse=reuse):
        return rev_block_lib.rev_block(
            x1, x2, f, g, num_layers=self.NUM_LAYERS)

    def count_global_variables():
      return len(variables.global_variables())

    y1, y2 = build_block(reuse=None)
    num_vars_before = count_global_variables()

    # A second build with reuse must not create anything new.
    y1, y2 = build_block(reuse=True)
    self.assertEqual(num_vars_before, count_global_variables())

    loss = math_ops.reduce_mean(y1 + y2)
    _ = gradients_impl.gradients(
        loss, [inputs] + variables.trainable_variables())

    # Nor must a rebuild after the gradient graph has been constructed.
    y1, y2 = build_block(reuse=True)
    self.assertEqual(num_vars_before, count_global_variables())
Beispiel #21
0
 def loss_fn(labels, logits, weights=None):
   """Returns the mean of the first output of `per_example_maxent_loss`."""
   maxent_outputs = losses.per_example_maxent_loss(
       labels=labels, logits=logits, weights=weights, num_classes=n_classes)
   per_example_loss = maxent_outputs[0]
   return math_ops.reduce_mean(per_example_loss)
Beispiel #22
0
 def monte_carlo_hypersphere_volume(dist, num_samples, radius, center):
   """Importance-sampling estimate built from `is_in_ball` and `dist`.

   Averages `is_in_ball(x) / prob(x)` over `num_samples` draws of `dist`.
   """
   # https://en.wikipedia.org/wiki/Importance_sampling
   # The identity op invalidates bijector caching.
   draws = array_ops.identity(dist.sample(num_samples, seed=seed))
   inverse_density = math_ops.exp(-dist.log_prob(draws))
   weighted_indicator = inverse_density * is_in_ball(draws, radius, center)
   return math_ops.reduce_mean(weighted_indicator, axis=0)
  def testQuadraticLoss(self):
    """Statistical test for the gradient.

    Equation (5) of https://arxiv.org/abs/1805.08498 states
      d/dalpha E_{sample ~ Gamma(alpha, 1)} f(sample)
        = E_{sample ~ Gamma(alpha, 1)} df(sample)/dalpha.

    With the quadratic loss f(sample) = (sample - t)^2 the left-hand side has
    a closed form:
      d/dalpha E_{sample ~ Gamma(alpha, 1)} f(sample)
        = d/dalpha [ (alpha + alpha^2) - 2 * t * alpha + t^2 ]
        = 1 + 2 * alpha - 2 * t.

    The Monte-Carlo estimate of the expectation is compared with this true
    gradient.
    """
    num_samples = 1000
    t = 0.3
    alpha_value = 0.5
    expected = 1 + 2 * alpha_value - 2 * t

    alpha = constant_op.constant(alpha_value)
    sample = random_ops.random_gamma([num_samples], alpha, 1.0)
    loss = math_ops.reduce_mean(math_ops.square(sample - t))
    (dloss_dalpha,) = gradients_impl.gradients(loss, alpha)
    self.assertAllClose(
        expected, self.evaluate(dloss_dalpha), atol=1e-1, rtol=1e-1)
Beispiel #24
0
 def create_loss(self, features, mode, logits, labels):
   """See `Head`."""
   del mode  # Unused for this head.
   logits = ops.convert_to_tensor(logits)
   processed_labels = head_lib._check_dense_labels_match_logits_and_reshape(  # pylint:disable=protected-access
       labels=self._process_labels(labels), logits=logits,
       expected_labels_dimension=self.logits_dimension)
   if self._loss_fn:
     # A user-supplied loss function takes precedence.
     unweighted_loss = head_lib._call_loss_fn(  # pylint:disable=protected-access
         loss_fn=self._loss_fn, labels=processed_labels, logits=logits,
         features=features, expected_loss_dim=1)
   else:
     per_class_loss = losses.sigmoid_cross_entropy(
         multi_class_labels=processed_labels, logits=logits,
         reduction=losses.Reduction.NONE)
     # Averages loss over classes.
     unweighted_loss = math_ops.reduce_mean(
         per_class_loss, axis=-1, keepdims=True)
   weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access
       features=features, weight_column=self._weight_column, logits=logits)
   return head_lib.LossSpec(
       training_loss=losses.compute_weighted_loss(
           unweighted_loss, weights=weights, reduction=self._loss_reduction),
       unreduced_loss=unweighted_loss,
       weights=weights,
       processed_labels=processed_labels)
Beispiel #25
0
  def testGradient(self):
    """Checks shapes and gradients of a graph imported with an input map."""
    # Build a small dense + relu graph on placeholders and serialize it.
    with ops.Graph().as_default() as source_graph:
      inputs = array_ops.placeholder(
          dtypes.float32, shape=[None, 100], name="input")
      weights = array_ops.placeholder(
          dtypes.float32, shape=[100, 10], name="weights")
      biases = array_ops.placeholder(dtypes.float32, shape=[10], name="biases")
      pre_activations = math_ops.matmul(inputs, weights) + biases
      activations = nn_ops.relu(pre_activations, name="activations")
      loss = math_ops.reduce_mean(activations, name="loss")
    gdef = source_graph.as_graph_def()

    # Import it into a fresh graph, remapping the placeholders onto a
    # fixed-batch placeholder and two variables.
    with ops.Graph().as_default():
      input_placeholder = array_ops.placeholder(dtypes.float32, shape=[32, 100])
      weights_var = variables.Variable(
          random_ops.truncated_normal([100, 10]), name="weights")
      biases_var = variables.Variable(array_ops.zeros([10]), name="biases")
      input_map = {
          "input:0": input_placeholder,
          "weights:0": weights_var,
          "biases:0": biases_var
      }
      activations, loss = importer.import_graph_def(
          gdef,
          input_map=input_map,
          return_elements=["activations:0", "loss:0"])
      self.assertEqual([32, 10], activations.get_shape())
      self.assertEqual([], loss.get_shape())
      weights_grad, biases_grad = gradients_impl.gradients(
          loss, [weights_var, biases_var])
      self.assertEqual([100, 10], weights_grad.get_shape())
      self.assertEqual([10], biases_grad.get_shape())
Beispiel #26
0
def zero_fraction(value, name=None):
  """Computes the proportion of entries in `value` that are equal to zero.

  The result is `nan` when `value` has no elements.

  Handy for tracking sparsity in summaries, for instance:

  ```python
      z = tf.nn.relu(...)
      summ = tf.contrib.deprecated.scalar_summary('sparsity',
      tf.nn.zero_fraction(z))
  ```

  Args:
    value: A tensor of numeric type.
    name: A name for the operation (optional).

  Returns:
    The fraction of zeros in `value`, with type `float32`.
  """
  with ops.name_scope(name, "zero_fraction", [value]):
    tensor = ops.convert_to_tensor(value, name="value")
    # The zero constant takes the input's dtype so the comparison is exact.
    zero = constant_op.constant(0, dtype=tensor.dtype, name="zero")
    is_zero = math_ops.equal(tensor, zero)
    # The mean of the 0/1 indicator is the fraction of zero entries.
    indicator = math_ops.cast(is_zero, dtypes.float32)
    return math_ops.reduce_mean(indicator)
  def _train(self, checkpoint_path, layout_optimizer=False, restore=False):
    """Runs two training steps of a two-layer conv net in a fresh graph.

    Args:
      checkpoint_path: Path the checkpoint is restored from (`restore=True`)
        or saved to (`restore=False`).
      layout_optimizer: Forwarded to `get_config` to build the session config.
      restore: If true, variables are loaded from `checkpoint_path` before
        training; otherwise they are freshly initialized.

    Returns:
      When `restore` is true, the list of values of all global variables after
      training. Otherwise `None` (the variables are saved to
      `checkpoint_path` instead).
    """
    ops.reset_default_graph()
    default_graph = ops.get_default_graph()
    session_config = get_config(layout_optimizer)
    with session.Session(config=session_config, graph=default_graph) as sess:
      batch, height, width, input_channels = 2, 6, 7, 3
      shape = [batch, height, width, input_channels]
      image = array_ops.placeholder(dtype='float32', shape=shape)
      hidden = conv_layers.conv2d(image, 32, [3, 3])
      output = conv_layers.conv2d(hidden, 32, [3, 3])
      optimizer = gradient_descent.GradientDescentOptimizer(0.01)
      loss = math_ops.reduce_mean(output)
      train_op = optimizer.minimize(loss)
      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

      if restore:
        saver.restore(sess, checkpoint_path)
      else:
        sess.run(variables.global_variables_initializer())

      # Fixed seed so every call feeds the same sequence of inputs.
      np.random.seed(0)
      for _ in range(2):
        batch_images = np.random.rand(*shape).astype(np.float32)
        sess.run([loss, train_op], feed_dict={image: batch_images})

      if not restore:
        saver.save(sess, checkpoint_path)
        return None

      global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
      return [var.eval(session=sess) for var in global_vars]
  def testGradient(self):
    """Checks that the layout optimizer rewrites all conv ops to NCHW.

    Builds a two-layer conv net with a gradient-descent train op, runs the
    grappler tensor-layout optimization on its meta graph, and asserts that
    each of the five forward/backward convolution nodes in the optimized graph
    carries `data_format == NCHW`. Skipped when no CUDA GPU is available.
    """
    if not test.is_gpu_available(cuda_only=True):
      self.skipTest('GPU required')

    random_seed.set_random_seed(0)
    x = random_ops.truncated_normal([1, 200, 200, 3], seed=0)
    y = conv_layers.conv2d(x, 32, [3, 3])
    z = conv_layers.conv2d(y, 32, [3, 3])
    optimizer = gradient_descent.GradientDescentOptimizer(1e-4)
    loss = math_ops.reduce_mean(z)
    train_op = optimizer.minimize(loss)
    graph = ops.get_default_graph()
    graph.add_to_collection('train_op', train_op)
    meta_graph = saver_lib.export_meta_graph(graph_def=graph.as_graph_def())

    rewrite_options = rewriter_config_pb2.RewriterConfig(
        optimize_tensor_layout=True)
    optimized_graph = tf_optimizer.OptimizeGraph(rewrite_options, meta_graph)

    found = 0
    for node in optimized_graph.node:
      if node.op in ['Conv2D', 'Conv2DBackpropFilter', 'Conv2DBackpropInput']:
        found += 1
        # `AttrValue.s` is a protobuf bytes field. Compare against a bytes
        # literal: a str literal never equals bytes on Python 3, while
        # b'NCHW' works on both Python 2 and Python 3.
        self.assertEqual(node.attr['data_format'].s, b'NCHW')
    self.assertEqual(found, 5)
 def testSampleUnbiasedNonScalarBatch(self):
   """Sample mean/covariance match the analytic ones for a [4, 3] batch.

   Draws samples from a DirichletMultinomial with batch shape [4, 3] and
   event size 2, and compares the empirical mean and covariance against
   `dist.mean()` and `dist.covariance()` within relative tolerances.
   """
   with self.test_session() as sess:
     concentration = 1. + 2. * self._rng.rand(4, 3, 2).astype(np.float32)
     dist = ds.DirichletMultinomial(
         total_count=5., concentration=concentration)
     num_samples = int(3e3)
     draws = dist.sample(num_samples, seed=0)
     sample_mean = math_ops.reduce_mean(draws, 0)
     # Move the sample axis to the end so that the matmul below contracts
     # over samples and leaves one [event, event] matrix per batch member.
     centered = array_ops.transpose(draws - sample_mean, [1, 2, 3, 0])
     outer_products = math_ops.matmul(centered, centered, adjoint_b=True)
     sample_covariance = outer_products / num_samples
     fetches = [
         sample_mean,
         sample_covariance,
         dist.mean(),
         dist.covariance(),
     ]
     (sample_mean_, sample_covariance_, actual_mean_,
      actual_covariance_) = sess.run(fetches)
     self.assertAllEqual([4, 3, 2], sample_mean.get_shape())
     self.assertAllClose(actual_mean_, sample_mean_, atol=0., rtol=0.15)
     self.assertAllEqual([4, 3, 2, 2], sample_covariance.get_shape())
     self.assertAllClose(
         actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
  def testCustomGrad(self):
    """Gradients come from the custom grad function, not from autodiff.

    The custom gradient function returns constant tensors whose value encodes
    their position (1 for the first input, 2 for the second, ...), so the test
    can tell that `gradients` routed through it.
    """

    def fn(a, b, c):
      projected = core_layers.dense(a, 10, use_bias=False)
      return projected + math_ops.matmul(b, c)

    def grad_fn(inputs, trainable_variables, unused_outputs,
                unused_grad_outputs):
      # Inputs are numbered 1..len(inputs); variables continue the count.
      grad_inputs = []
      for idx, tensor in enumerate(inputs):
        grad_inputs.append(array_ops.ones_like(tensor) * (idx + 1.))
      grad_vars = []
      for idx, var in enumerate(trainable_variables):
        grad_vars.append(
            array_ops.ones_like(var) * (idx + len(inputs) + 1.))
      return grad_inputs, grad_vars

    a = random_ops.random_uniform([11, 6])
    b = random_ops.random_uniform([11, 7])
    c = random_ops.random_uniform([7, 10])
    # Only the shape of `w` is used: it stands in for the dense kernel when
    # building the expected gradient.
    w = random_ops.random_uniform([6, 10])
    wrapped_fn = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)
    out = wrapped_fn(a, b, c)
    loss = math_ops.reduce_mean(out)
    dense_kernel = variables.trainable_variables()[0]
    grads = gradients_impl.gradients(loss, [a, b, c, dense_kernel])
    expected_grads = []
    for idx, tensor in enumerate([a, b, c, w]):
      expected_grads.append(array_ops.ones_like(tensor) * (idx + 1.))
    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      actual_vals, expected_vals = sess.run([grads, expected_grads])
      for actual, expected in zip(actual_vals, expected_vals):
        self.assertAllClose(actual, expected)
class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
                      parameterized.TestCase):
    """Tests for `ragged.map_fn` over ragged, nested and structured inputs."""

    @parameterized.parameters([
        # The following test sets map over a RaggedTensor and apply a
        # transformation that returns with shape:
        # [d1, (d2)] -> [d1]
        dict(
            fn=mo.reduce_mean,
            elems=[[1, 2, 3], [4, 5], [6, 7]],
            expected_output=[2, 4, 6],
        ),
        dict(
            fn=string_ops.reduce_join,
            elems=[['foo', 'bar', 'baz'], ['a'], ['b', 'c']],
            expected_output=[b'foobarbaz', b'a', b'bc'],
            dtype=dtypes.string,
        ),
        # [d1, (d2)] -> [d1, 2]
        dict(
            fn=lambda x: array_ops.stack([mo.reduce_mean(x),
                                          mo.reduce_sum(x)]),
            # fn=self.stack_mean_and_sum,
            elems=[[1, 2, 3], [4, 5], [6, 7]],
            expected_output=[[2, 6], [4.5, 9], [6.5, 13]],
            dtype=dtypes.float32,
            expected_ragged_rank=0,
        ),
        # [d1, (d2)] -> [d1, (d2)]
        dict(
            fn=lambda x: x + np.int64(1),
            elems=[[1, 2, 3], [4, 5], [6, 7]],
            expected_output=[[2, 3, 4], [5, 6], [7, 8]],
            dtype=dtypes.int64,
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=1),
        ),
        # [d1, (d2), d3] -> [d1, (d2), d3]
        dict(
            fn=lambda x: x + np.int64(1),
            elems=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
            elems_ragged_rank=1,
            expected_ragged_rank=1,
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=1),
            expected_output=[[[2, 3], [4, 5]], [], [[6, 7], [8, 9], [10, 1]]],
        ),
        # [d1, (d2)] -> [d1, (d2), (d3)]
        dict(
            fn=lambda x: ragged.RaggedTensor.from_row_starts(x, [0]),
            elems=[[1, 2, 3], [4, 5], [6, 7]],
            expected_output=[[[1, 2, 3]], [[4, 5]], [[6, 7]]],
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=2),
        ),
        # [d1, (d2), (d3)] -> [d1, (d2), (d3)]
        dict(
            fn=lambda x: ragged.map_flat_values(mo.add, x, 1),
            elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
            expected_output=[[[2, 3, 4]], [[5, 6], [7, 8]]],
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=2),
        ),
        # [d1, (d2), (d3)] -> [d1, (d2)]
        dict(
            fn=lambda x: ragged.reduce_sum(x, axis=1),
            elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
            expected_output=[[6], [9, 13]],
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=1),
        ),
        # [d1, (d2), (d3)] -> [d1, (d3)]
        dict(
            fn=lambda x: ragged.reduce_sum(x, axis=0),
            elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
            expected_output=[[1, 2, 3], [10, 12]],
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=1),
        ),
        # [d1, (d2), (d3)] -> [d1]
        dict(
            fn=ragged.reduce_sum,
            elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
            expected_output=[6, 22],
            result_dtype=dtypes.int64,
        ),
        # [d1] -> [d1, (d2)]
        dict(
            fn=mo.range,
            elems=[4, 0, 2],
            expected_output=[[0, 1, 2, 3], [], [0, 1]],
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=1),
        ),
        # [d1] -> [d1, (d2), (d3)]
        dict(
            fn=lambda x: ragged.range(mo.range(x)),
            elems=[5, 0, 3],
            expected_output=[[[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [],
                             [[], [0], [0, 1]]],
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=2),
        ),
        # [d1, (d2), (d3), (d4a), (d5)] ->  [d1, (d2), (d3), (d4b), (d5)]
        dict(
            fn=lambda x: x + np.int64(1),
            elems=[[[[[1, 2, 3]], [[4], [5]]]], [[[[6, 7]]], [[[8], []]]]],
            expected_output=[[[[[2, 3, 4]], [[5], [6]]]],
                             [[[[7, 8]]], [[[9], []]]]],
            result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                 ragged_rank=4),
        ),
    ])
    def testRaggedMap(
        self,
        fn,
        elems,
        expected_output,
        expected_ragged_rank=None,
        result_ragged_rank=None,
        elems_ragged_rank=None,
        dtype=dtypes.int64,
        result_dtype=None,
        infer_shape=False,
    ):
        """Maps `fn` over `elems` and compares against `expected_output`.

        Args:
          fn: Callable applied to each outer-dimension element.
          elems: Nested Python list turned into the ragged input.
          expected_output: Nested Python list with the expected result.
          expected_ragged_rank: Ragged rank used when building the expected
            ragged constant (`None` leaves it to `ragged.constant`).
          result_ragged_rank: Accepted but not used by the test body.
          elems_ragged_rank: Ragged rank used when building the input.
          dtype: Dtype of the input ragged constant.
          result_dtype: Passed as `dtype` to `ragged.map_fn`.
          infer_shape: Passed through to `ragged.map_fn`.
        """
        elems = ragged.constant(elems, dtype, elems_ragged_rank)
        output = ragged.map_fn(fn=fn,
                               elems=elems,
                               dtype=result_dtype,
                               infer_shape=infer_shape)

        expected_rt = ragged.constant(expected_output,
                                      ragged_rank=expected_ragged_rank)
        self.assertRaggedEqual(expected_rt, output)

    def testRaggedMapOnStructure(self):
        """Maps over a dict of ragged tensors, producing one scalar per row."""
        batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
        # [[10, 20, 30], [40], [50, 60, 70]]
        robin = ragged.map_flat_values(mo.multiply, batman, 10)

        features = {'batman': batman, 'robin': robin}

        def _reduce_sum_from_all(f):
            return mo.reduce_sum(f['batman']) + mo.reduce_sum(f['robin'])

        output = ragged.map_fn(
            fn=_reduce_sum_from_all,
            elems=features,
            dtype=dtypes.int32,
        )

        # Per row: sum(batman row) + sum(robin row), e.g. 6 + 60 = 66.
        self.assertRaggedEqual(output, [66, 44, 198])

    # Test mapping over a dict of RTs can produce a dict of RTs.
    def testRaggedMapOnStructure_RaggedOutputs(self):
        """Maps over a dict of ragged tensors, returning a dict of them."""
        batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
        # [[10, 20, 30], [40], [50, 60, 70]]
        robin = ragged.map_flat_values(mo.multiply, batman, 10)

        features = {'batman': batman, 'robin': robin}

        def _increment(f):
            return {
                'batman': f['batman'] + 1,
                'robin': f['robin'] + 1,
            }

        output = ragged.map_fn(
            fn=_increment,
            elems=features,
            infer_shape=False,
            dtype={
                'batman':
                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1),
                'robin':
                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1)
            },
        )

        self.assertRaggedEqual(output['batman'], [[2, 3, 4], [5], [6, 7, 8]])
        self.assertRaggedEqual(output['robin'],
                               [[11, 21, 31], [41], [51, 61, 71]])

    def testZip(self):
        """Pairs every value of a ragged tensor with its row index."""
        x = ragged.constant(
            [[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]], dtypes.int64)
        # Column vector [[0], [1], ..., [nrows - 1]] of row indices.
        y = array_ops.expand_dims(mo.range(x.nrows(), dtype=dtypes.int64),
                                  axis=1)

        def _zip(foo):
            y_val, x_val = foo
            # Repeat the row index once per value in the row, then pair them.
            bar = backend.tile(y_val, array_ops.shape(x_val))
            return array_ops.stack([bar, x_val], axis=1)

        output = ragged.map_fn(_zip, (y, x),
                               dtype=ragged.RaggedTensorType(
                                   dtype=dtypes.int64, ragged_rank=1),
                               infer_shape=False)

        self.assertRaggedEqual(
            output,
            [[[0, 10], [0, 20]], [[1, 30], [1, 40]], [[2, 50], [2, 60]],
             [[3, 70]], [[4, 80], [4, 90], [4, 100]]])

    def testBatchGather(self):
        """Gathers per-row indices from per-row string tokens via map_fn."""
        tokens = ragged.constant([['hello', '.', 'there'], ['merhaba'],
                                  ['bonjour', '.', 'ca va', '?']])
        indices = ragged.constant([[0, 2], [0], [0, 2]])

        def gather(x):
            tokens_val, indices_val = x
            return array_ops.gather(tokens_val, indices_val)

        data = tokens, indices
        out = ragged.map_fn(gather,
                            data,
                            dtype=ragged.RaggedTensorType(dtype=dtypes.string,
                                                          ragged_rank=1),
                            infer_shape=False)

        self.assertRaggedEqual(
            out, [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])

    def testMismatchRaggedRank(self):
        """A declared ragged rank that is too large raises ValueError."""
        elems = ragged.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
        fn = lambda x: ragged.reduce_sum(x, axis=0)
        with self.assertRaisesWithLiteralMatch(
                ValueError,
                r'The declared ragged rank (23) mismatches the result (1)'):
            _ = ragged.map_fn(fn,
                              elems,
                              dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                            ragged_rank=23))

    def testMismatchRaggedRank2(self):
        """Same as above, with a function that returns a RaggedTensor."""
        elems = ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
        fn = lambda x: ragged.RaggedTensor.from_row_starts(x, [0])
        with self.assertRaisesWithLiteralMatch(
                ValueError,
                r'The declared ragged rank (10) mismatches the result (1)'):
            _ = ragged.map_fn(fn,
                              elems,
                              dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                                            ragged_rank=10))

    def testMapOnSparseTensor(self):
        """Identity map over a RaggedTensor built from a SparseTensor."""
        s = sparse_tensor.SparseTensor(
            indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
            values=[0, 5, 0, 4],
            dense_shape=[2, 2],
        )
        t2 = ragged.RaggedTensor.from_sparse(s)
        id_t2 = ragged.map_fn(
            lambda x: x,
            t2,
        )
        self.assertRaggedEqual(id_t2, [[0, 5], [0, 4]])
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
    """Frechet classifier distance between two sets of activations.

    Takes precomputed classifier activations of real and of generated images
    and returns the Frechet distance between the Gaussians fitted to each
    set. Working from activations rather than images (as
    frechet_classifier_distance() does) is convenient when the activations of
    large evaluation batches are computed ahead of time.

    The technique is described in https://arxiv.org/abs/1706.08500. For two
    Gaussians with means m and m_w and covariance matrices C and C_w, the
    value computed is

                  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

    which measures how far apart the distributions of real and generated
    images (more precisely, of their visual features) are. Unlike the
    Inception score, this is a true distance and makes use of information
    about real-world images.

    When evaluated on sample means and sample covariances the Frechet
    distance is biased, and more so for small samples: even for identical
    distributions the expected value is large when few samples are used.
    Always use the same sample size when comparing generative models.

    Args:
      real_activations: 2D Tensor containing activations of real data. Shape
        is [batch_size, activation_size].
      generated_activations: 2D Tensor containing activations of generated
        data. Shape is [batch_size, activation_size].

    Returns:
      The Frechet Inception distance. A floating-point scalar of the same
      type as the output of the activations.
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    # All arithmetic runs in float64; remember the incoming dtype so that the
    # result can be cast back at the end.
    output_dtype = real_activations.dtype
    needs_cast = output_dtype != dtypes.float64
    if needs_cast:
        real_activations = math_ops.cast(real_activations, dtypes.float64)
        generated_activations = math_ops.cast(generated_activations,
                                              dtypes.float64)

    def _mean_and_covariance(activations):
        """Returns the column mean and the unbiased sample covariance."""
        mu = math_ops.reduce_mean(activations, 0)
        num_examples = math_ops.cast(
            array_ops.shape(activations)[0], dtypes.float64)
        centered = activations - mu
        # sigma = (1 / (n - 1)) * (X - mu)^T (X - mu)
        scatter = math_ops.matmul(centered, centered, transpose_a=True)
        return mu, scatter / (num_examples - 1)

    m, sigma = _mean_and_covariance(real_activations)
    m_w, sigma_w = _mean_and_covariance(generated_activations)

    # Tr(sqrt(sigma sigma_w)) part of the distance.
    sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)

    # Covariance term, using trace(A + B) = trace(A) + trace(B).
    covariance_term = (
        math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component)

    # Squared distance between the means; the summed squared difference is
    # equivalent to the squared L2 norm but more stable.
    mean_term = math_ops.reduce_sum(math_ops.squared_difference(m, m_w))

    fid = covariance_term + mean_term
    if needs_cast:
        fid = math_ops.cast(fid, output_dtype)
    return fid
def kernel_classifier_distance_and_std_from_activations(
        real_activations,
        generated_activations,
        max_block_size=1024,
        dtype=None):
    """Block estimate of the kernel classifier distance and its std error.

    Operates on classifier activations of real and generated images rather
    than on the images themselves (compare kernel_classifier_distance()).
    That is convenient when activations of large evaluation batches are
    precomputed, or when several metrics are computed from the same images.
    Alongside the distance, a rough estimate of the estimator's standard
    error is returned.

    The technique is described in https://arxiv.org/abs/1801.01401. For two
    distributions P and Q of activations the quantity estimated is

        E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
          - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]

    with the polynomial kernel

        k(x, y) = ( x^T y / dimension + 1 )^3.

    It measures how different the visual features of real and generated
    images are. Like the Frechet distance, and unlike the Inception score,
    it is a true distance and uses information about the target images.
    Unlike the Frechet score, the estimator is *unbiased* and asymptotically
    normal, which makes comparisons across models more intuitive.

    The cost is quadratic in max_block_size. A larger max_block_size lowers
    the variance of the estimate but costs more computation. The estimator
    differs slightly from the one in the original paper: it is the block
    estimator of https://arxiv.org/abs/1307.1954. The standard-error estimate
    is more reliable with more blocks, i.e. with a smaller max_block_size.

    NOTE: the blocking assumes that both real_activations and
    generated_activations are in random order. If either is sorted in a
    meaningful way, the estimator will behave poorly.

    Args:
      real_activations: 2D Tensor containing activations of real data. Shape
        is [batch_size, activation_size].
      generated_activations: 2D Tensor containing activations of generated
        data. Shape is [batch_size, activation_size].
      max_block_size: integer, default 1024. Upper bound on the size of the
        blocks the samples are split into. Larger values cost more
        computation but reduce the variance of the distance estimate;
        smaller values give a better standard-error estimate.
      dtype: If not None, coerce activations to this dtype before
        computations.

    Returns:
      The Kernel Inception Distance. A floating-point scalar of the same type
        as the output of the activations.
      An estimate of the standard error of the distance estimator (a scalar
        of the same type).
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)
    real_activations.shape[1].assert_is_compatible_with(
        generated_activations.shape[1])

    if dtype is not None:
        real_activations = math_ops.cast(real_activations, dtype)
        generated_activations = math_ops.cast(generated_activations, dtype)
    else:
        dtype = real_activations.dtype
        assert generated_activations.dtype == dtype

    # Split both sample sets into the same number of blocks of approximately
    # equal size, with none larger than max_block_size.
    num_real = array_ops.shape(real_activations)[0]
    num_generated = array_ops.shape(generated_activations)[0]

    larger_count = math_ops.maximum(num_real, num_generated)
    blocks_needed = math_ops.ceil(larger_count / max_block_size)
    n_blocks = math_ops.cast(blocks_needed, dtypes.int32)

    base_real = num_real // n_blocks
    base_generated = num_generated // n_blocks

    # Rows left over once every block has its base size; that many blocks
    # (the trailing ones) get one extra row.
    extra_real = num_real - base_real * n_blocks
    extra_generated = num_generated - base_generated * n_blocks

    sizes_real = array_ops.concat([
        array_ops.fill([n_blocks - extra_real], base_real),
        array_ops.fill([extra_real], base_real + 1),
    ], 0)
    sizes_generated = array_ops.concat([
        array_ops.fill([n_blocks - extra_generated], base_generated),
        array_ops.fill([extra_generated], base_generated + 1),
    ], 0)

    # Cumulative offsets: block i covers rows [bounds[i], bounds[i + 1]).
    leading_zero = array_ops.zeros([1], dtype=dtypes.int32)
    bounds_real = array_ops.concat(
        [leading_zero, math_ops.cumsum(sizes_real)], 0)
    bounds_generated = array_ops.concat(
        [leading_zero, math_ops.cumsum(sizes_generated)], 0)

    dim = math_ops.cast(real_activations.shape[1], dtype)

    def _kid_for_block(i):
        """Computes the ith block of the KID estimate."""
        real_start = bounds_real[i]
        real_stop = bounds_real[i + 1]
        real_block = real_activations[real_start:real_stop]
        m = math_ops.cast(real_stop - real_start, dtype)

        gen_start = bounds_generated[i]
        gen_stop = bounds_generated[i + 1]
        gen_block = generated_activations[gen_start:gen_stop]
        n = math_ops.cast(gen_stop - gen_start, dtype)

        k_rr = (math_ops.matmul(
            real_block, real_block, transpose_b=True) / dim + 1)**3
        k_rg = (math_ops.matmul(
            real_block, gen_block, transpose_b=True) / dim + 1)**3
        k_gg = (math_ops.matmul(
            gen_block, gen_block, transpose_b=True) / dim + 1)**3

        cross_term = -2 * math_ops.reduce_mean(k_rg)
        # Within-set terms leave out the diagonal (each sample paired with
        # itself), hence the trace subtraction and the m * (m - 1) count.
        real_term = (
            (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) /
            (m * (m - 1)))
        gen_term = (
            (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) /
            (n * (n - 1)))
        return cross_term + real_term + gen_term

    ests = map_fn.map_fn(_kid_for_block,
                         math_ops.range(n_blocks),
                         dtype=dtype,
                         back_prop=False)

    mn = math_ops.reduce_mean(ests)

    # nn_impl.moments doesn't use the Bessel correction, which we want here,
    # so the variance across blocks is computed by hand.
    n_blocks_ = math_ops.cast(n_blocks, dtype)

    def _undefined_variance():
        # A single block gives no information about the spread.
        return array_ops.constant(float('nan'), dtype=dtype)

    def _bessel_corrected_variance():
        squared_deviations = math_ops.square(ests - mn)
        return math_ops.reduce_sum(squared_deviations) / (n_blocks_ - 1)

    var = control_flow_ops.cond(
        math_ops.less_equal(n_blocks, 1),
        _undefined_variance,
        _bessel_corrected_variance)

    return mn, math_ops.sqrt(var / n_blocks_)
 def _tf_reduce(self, x, reduction_axes, keepdims):
     """Returns the mean of `x` over `reduction_axes` via `reduce_mean`."""
     mean = math_ops.reduce_mean(x, reduction_axes, keepdims)
     return mean
 def worker_train_fn():
   """Returns the mean of `v` times a fresh uniform random (10, 2) matrix.

   `v` is taken from the enclosing scope, which is not visible here.
   """
   samples = random_ops.random_uniform((10, 2))
   product = math_ops.matmul(v, samples)
   return math_ops.reduce_mean(product)
Beispiel #36
0
def _BatchNormGrad(grad_y,
                   x,
                   scale,
                   pop_mean,
                   pop_var,
                   epsilon,
                   data_format,
                   is_training=True):
  """Returns the gradients for the 3 inputs of BatchNorm.

  Args:
    grad_y: A `Tensor` of 4 dimensions for gradient for y.
    x: A `Tensor` of 4 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
      is_training=False.
    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
      when is_training=False.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW". Any
      value other than b"NHWC" is reduced over axes [0, 2, 3].
    is_training: A bool value to indicate the operation is for training
      (default) or inference.

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset. grad_x is cast back to the dtype of `x`.
  """
  x_dtype = x.dtype.base_dtype
  if x_dtype == dtypes.float16:
    # float16 math is too imprecise, so we do the batch norm gradient
    # computations in float32.
    x = math_ops.cast(x, dtypes.float32)
    grad_y = math_ops.cast(grad_y, dtypes.float32)
  if is_training:
    if data_format == b"NHWC":
      keepdims = False
      reduce_axis = [0, 1, 2]
    else:
      # Channels are on axis 1: keep the reduced axes as size-1 dimensions so
      # the statistics broadcast against x, and reshape scale to match.
      keepdims = True
      reduce_axis = [0, 2, 3]
      shape = [1, array_ops.size(scale), 1, 1]
      scale = array_ops.reshape(scale, shape)
    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
    mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
    var_x = math_ops.reduce_mean(
        math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
        reduce_axis,
        keepdims=keepdims)
    grad_y_offset = grad_y - mean_grad_y
    x_offset = x - mean_x
    mean = math_ops.reduce_mean(
        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
    grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
        grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
    grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
    if data_format == b"NCHW":
      # Drop only the kept reduction axes. An axis-less squeeze would also
      # remove the channel axis when there is a single channel, turning the
      # 1-D gradient into a scalar.
      grad_scale = array_ops.squeeze(grad_scale, axis=reduce_axis)
    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
  else:
    if data_format == b"NHWC":
      reduce_axis = [0, 1, 2]
    else:
      # Reshape the per-channel vectors so they broadcast along axis 1.
      reduce_axis = [0, 2, 3]
      shape = [1, array_ops.size(pop_mean), 1, 1]
      pop_mean = array_ops.reshape(pop_mean, shape)
      pop_var = array_ops.reshape(pop_var, shape)
      scale = array_ops.reshape(scale, shape)

    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
    grad_scale = math_ops.reduce_sum(
        grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
    grad_x = grad_y * scale * var_rsqrt
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
Beispiel #37
0
 def regularizer2(v):
   """Returns the mean of `v` shifted by a constant 0.2."""
   mean_value = math_ops.reduce_mean(v)
   return mean_value + 0.2
Beispiel #38
0
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
  """Generator for v2 (preactivation) ResNet models.

  This function generates a family of ResNet v2 models. See the resnet_v2_*()
  methods for specific model instantiations, obtained by selecting different
  block instantiations that produce ResNets of various depths.

  Training for image classification on Imagenet is usually done with [224, 224]
  inputs, resulting in [7, 7] feature maps at the output of the last ResNet
  block for the ResNets defined in [1] that have nominal stride equal to 32.
  However, for dense prediction tasks we advise that one uses inputs with
  spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
  this case the feature maps at the ResNet output will have spatial shape
  [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
  and corners exactly aligned with the input image corners, which greatly
  facilitates alignment of the features to the image. Using as input [225, 225]
  images results in [8, 8] feature maps at the output of the last ResNet block.

  For dense prediction tasks, the ResNet needs to run in fully-convolutional
  (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
  have nominal stride equal to 32 and a good choice in FCN mode is to use
  output_stride=16 in order to increase the density of the computed features at
  small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

  A dropout layer with rate argument 0.5 is applied after the (optional) global
  pooling and before the (optional) logits layer.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    blocks: A list of length equal to the number of ResNet blocks. Each element
      is a resnet_utils.Block object describing the units in the block.
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: whether batch_norm and dropout layers are in training mode.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    include_root_block: If True, include the initial convolution followed by
      max-pooling, if False excludes it. If excluded, `inputs` should be the
      results of an activation-less convolution.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  with variable_scope.variable_scope(
      scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    with arg_scope(
        [layers_lib.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
        outputs_collections=end_points_collection):
      with arg_scope([layers.batch_norm], is_training=is_training):
        net = inputs
        if include_root_block:
          if output_stride is not None:
            if output_stride % 4 != 0:
              raise ValueError('The output_stride needs to be a multiple of 4.')
            # The root block (conv1 + pool1) already has stride 4. Floor
            # division is exact here because of the check above, and it keeps
            # the stride an integer instead of a float under true division.
            output_stride //= 4
          # We do not include batch normalization or activation functions in
          # conv1 because the first ResNet unit will perform these. Cf.
          # Appendix of [2].
          with arg_scope(
              [layers_lib.conv2d], activation_fn=None, normalizer_fn=None):
            net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
          net = layers.max_pool2d(net, [3, 3], stride=2, scope='pool1')
        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
        # This is needed because the pre-activation variant does not have batch
        # normalization or activation functions in the residual unit output. See
        # Appendix of [2].
        net = layers.batch_norm(
            net, activation_fn=nn_ops.relu, scope='postnorm')
        if global_pool:
          # Global average pooling.
          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)

        net = layers_lib.dropout(
            net, 0.5, is_training=is_training, scope='dropout')

        if num_classes is not None:
          net = layers_lib.conv2d(
              net,
              num_classes, [1, 1],
              activation_fn=None,
              normalizer_fn=None,
              scope='logits')
        # Convert end_points_collection into a dictionary of end_points.
        end_points = utils.convert_collection_to_dict(end_points_collection)
        if num_classes is not None:
          end_points['predictions'] = layers.softmax(net, scope='predictions')
        return net, end_points
Beispiel #39
0
def bottleneck(inputs,
               depth,
               depth_bottleneck,
               stride,
               rate=1,
               outputs_collections=None,
               scope=None):
  """Bottleneck residual unit variant with BN before convolutions.

  This is the full preactivation residual unit variant proposed in [2]. See
  Fig. 1(b) of [2] for its definition. Note that we use here the bottleneck
  variant which has an extra bottleneck layer.

  Before the residual branch is added to the shortcut it is re-weighted inside
  a 'CBAM' variable scope in two steps:

    1. Channel gate: the residual is max- and mean-reduced over axes [1, 2],
       both results go through the same two fully connected layers
       ('share1' with depth / 8 units and ReLU, then 'share2' with `depth`
       units), and the sigmoid of their sum scales the residual per channel.
    2. Spatial gate: the channel-scaled residual is mean- and max-reduced
       over the last axis, the two maps are concatenated and passed through
       a 1x1 convolution with sigmoid activation, and the result scales the
       residual per spatial position.

  When putting together two consecutive ResNet blocks that use this unit, one
  should use stride = 2 in the last unit of the first block.

  Args:
    inputs: A tensor of size [batch, height, width, channels].
    depth: The depth of the ResNet unit output.
    depth_bottleneck: The depth of the bottleneck layers.
    stride: The ResNet unit's stride. Determines the amount of downsampling of
      the units output compared to its input.
    rate: An integer, rate for atrous convolution.
    outputs_collections: Collection to add the ResNet unit output.
    scope: Optional variable_scope.

  Returns:
    The ResNet unit's output.
  """
  with variable_scope.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
    depth_in = utils.last_dimension(inputs.get_shape(), min_rank=4)
    preact = layers.batch_norm(
        inputs, activation_fn=nn_ops.relu, scope='preact')
    if depth == depth_in:
      # Same channel count: the shortcut is the (subsampled) raw input.
      shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
    else:
      # Channel count changes: project the pre-activated input with a 1x1
      # convolution that has neither normalization nor activation.
      shortcut = layers_lib.conv2d(
          preact,
          depth, [1, 1],
          stride=stride,
          normalizer_fn=None,
          activation_fn=None,
          scope='shortcut')

    residual = layers_lib.conv2d(
        preact, depth_bottleneck, [1, 1], stride=1, scope='conv1')
    residual = resnet_utils.conv2d_same(
        residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2')
    residual = layers_lib.conv2d(
        residual,
        depth, [1, 1],
        stride=1,
        normalizer_fn=None,
        activation_fn=None,
        scope='conv3')

    with variable_scope.variable_scope('CBAM'):
      # Width of the hidden layer in the shared channel-gate MLP.
      hidden_units = int(depth / 8)

      # Channel gate, max-pooled branch. This call creates the 'share1' and
      # 'share2' variables (and attaches the L2 regularizers).
      max_c = math_ops.reduce_max(residual, axis=[1, 2], name='max_c')
      max_c = layers_lib.fully_connected(
          max_c, hidden_units, nn_ops.relu, normalizer_fn=None,
          scope='share1',
          weights_regularizer=layers_lib.l2_regularizer(0.0001))
      max_c = layers_lib.fully_connected(
          max_c, depth, None, normalizer_fn=None,
          scope='share2',
          weights_regularizer=layers_lib.l2_regularizer(0.0001))

      # Channel gate, mean-pooled branch. reuse=True makes it share the
      # variables created by the max-pooled branch above.
      avg_c = math_ops.reduce_mean(residual, axis=[1, 2], name='avg_c')
      avg_c = layers_lib.fully_connected(
          avg_c, hidden_units, nn_ops.relu, normalizer_fn=None,
          scope='share1', reuse=True)
      avg_c = layers_lib.fully_connected(
          avg_c, depth, None, normalizer_fn=None,
          scope='share2', reuse=True)

      # Insert two singleton axes after the batch axis so the gate
      # broadcasts over the two reduced axes of `residual`.
      Mc = math_ops.sigmoid(max_c + avg_c)
      Mc = array_ops.expand_dims(Mc, 1)
      Mc = array_ops.expand_dims(Mc, 1)
      residual = residual * Mc

      # Spatial gate. `keepdims` is the supported spelling; the older
      # `keep_dims` alias is deprecated and not accepted by the v2 reductions.
      max_s = math_ops.reduce_max(
          residual, axis=-1, name='max_s', keepdims=True)
      avg_s = math_ops.reduce_mean(
          residual, axis=-1, name='avg_s', keepdims=True)
      Ms = array_ops.concat([avg_s, max_s], axis=-1)
      Ms = layers_lib.conv2d(
          Ms, 1, [1, 1], activation_fn=math_ops.sigmoid, normalizer_fn=None)
      residual = residual * Ms

    output = shortcut + residual

    return utils.collect_named_outputs(outputs_collections, sc.name, output)
Beispiel #40
0
    def _testRevBlock(self,
                      x=None,
                      f=None,
                      g=None,
                      f_side_input=None,
                      g_side_input=None):
        """Compares two builds of `rev_block` that share one set of variables.

        The block is built once with default arguments under the "rev_test"
        scope, then rebuilt under the same scope with `reuse=True` and
        `is_training=False`. Outputs and gradients of both builds must agree,
        and the second build must not create any new global variable.

        Args:
          x: Optional input tensor; defaults to a random uniform float32
            tensor of shape [self.BATCH_SIZE, self.CHANNELS]. It is split in
            two along the last axis.
          f: Optional callable passed to `rev_block`; defaults to a dense
            layer with `self.CHANNELS // 2` units.
          g: Optional callable passed to `rev_block`; same default as `f`.
          f_side_input: Optional list of side inputs for `f` (default: []).
          g_side_input: Optional list of side inputs for `g` (default: []).
        """
        random_seed.set_random_seed(1234)

        # Fill in the list-valued defaults first; they are independent of f/g.
        if f_side_input is None:
            f_side_input = []
        if g_side_input is None:
            g_side_input = []

        if f is None:

            def f(h):  # pylint: disable=function-redefined
                return core_layers.dense(h, self.CHANNELS // 2, use_bias=True)

        if g is None:

            def g(h):  # pylint: disable=function-redefined
                return core_layers.dense(h, self.CHANNELS // 2, use_bias=True)

        if x is None:
            x = random_ops.random_uniform(
                [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
        x1, x2 = array_ops.split(x, 2, axis=-1)

        # Keyword arguments common to both builds of the block.
        shared_kwargs = dict(
            f_side_input=f_side_input,
            g_side_input=g_side_input,
            num_layers=self.NUM_LAYERS)

        # First build: creates the variables.
        with variable_scope.variable_scope("rev_test") as rev_scope:
            y1_rev, y2_rev = rev_block_lib.rev_block(
                x1, x2, f, g, **shared_kwargs)
            y_rev = array_ops.concat([y1_rev, y2_rev], axis=1)
            fg_vars = rev_scope.trainable_variables()

        # Second build: same scope, reused variables, is_training=False.
        var_count_before = len(variables.global_variables())
        with variable_scope.variable_scope(rev_scope, reuse=True):
            y1, y2 = rev_block_lib.rev_block(
                x1, x2, f, g, is_training=False, **shared_kwargs)
            y = array_ops.concat([y1, y2], axis=1)
        # Ensure no new vars were created - full reuse
        assert len(variables.global_variables()) == var_count_before

        loss_rev = math_ops.reduce_mean(y_rev + 10.)
        loss = math_ops.reduce_mean(y + 10.)

        # Differentiate w.r.t. the input, all side inputs and all variables.
        wrt = [x] + f_side_input + g_side_input + fg_vars
        grads_rev = gradients_impl.gradients(loss_rev, wrt)
        grads = gradients_impl.gradients(loss, wrt)

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            fetches = [y, y_rev, grads_rev, grads]
            y_val, yd_val, gd_val, g_val = sess.run(fetches)
            self.assertAllClose(y_val, yd_val)
            for grad_rev_val, grad_val in zip(gd_val, g_val):
                self.assertAllClose(grad_rev_val, grad_val, rtol=1e-5)
Beispiel #41
0
  def get_batch_loss(self, features, mode, state):
    """Computes predictions and a loss.

    In TRAIN mode the whole window is handed to `self._process_window` once.
    In EVAL mode the window may be longer than `self.window_size`; it is then
    cropped and walked in steps of `self.output_window_size` inside a
    `while_loop`, and the per-step losses are averaged.

    Args:
      features: A dictionary (such as is produced by a chunker) with the
        following key/value pairs (shapes are given as required for training):
          TrainEvalFeatures.TIMES: A [batch size, self.window_size] integer
            Tensor with times for each observation. To train on longer
            sequences, the data should first be chunked.
          TrainEvalFeatures.VALUES: A [batch size, self.window_size,
            self.num_features] Tensor with values for each observation.
        When evaluating, `TIMES` and `VALUES` must have a window size of at
        least self.window_size, but it may be longer, in which case the last
        window_size - self.input_window_size times (or fewer if this is not
        divisible by self.output_window_size) will be evaluated on with
        non-overlapping output windows (and will have associated
        predictions). This is primarily to support qualitative
        evaluation/plotting, and is not a recommended way to compute evaluation
        losses (since there is no overlap in the output windows, which for
        window-based models is an undesirable bias).
      mode: The tf.estimator.ModeKeys mode to use (TRAIN or EVAL).
      state: Not read in TRAIN mode. In EVAL mode it is unpacked as a 3-tuple
        (previous times, previous values, previous exogenous regressors) and
        used to build the `end_state` of the returned ModelOutputs.
    Returns:
      A model.ModelOutputs object.
    Raises:
      ValueError: If `mode` is not TRAIN or EVAL, or if static shape information
      is incorrect.
    """
    # Convert every feature value to a Tensor.
    features = {feature_name: ops.convert_to_tensor(feature_value)
                for feature_name, feature_value in features.items()}
    times = features[TrainEvalFeatures.TIMES]
    # Every feature other than times, values and the state tuple is passed to
    # _process_exogenous_features.
    exogenous_regressors = self._process_exogenous_features(
        times=times,
        features={key: value for key, value in features.items()
                  if key not in [TrainEvalFeatures.TIMES,
                                 TrainEvalFeatures.VALUES,
                                 PredictionFeatures.STATE_TUPLE]})
    if mode == estimator_lib.ModeKeys.TRAIN:
      # For training, we require the window size to be self.window_size as
      # iterating sequentially on larger windows could introduce a bias.
      return self._process_window(
          features, mode=mode, exogenous_regressors=exogenous_regressors)
    elif mode == estimator_lib.ModeKeys.EVAL:
      # For evaluation, we allow the user to pass in a larger window, in which
      # case we try to cover as much of the window as possible without
      # overlap. Quantitative evaluation is more efficient/correct with fixed
      # windows matching self.window_size (as with training), but this looping
      # allows easy plotting of "in-sample" predictions.
      times.get_shape().assert_has_rank(2)
      # The static check below only fires when the window size is known at
      # graph-construction time.
      static_window_size = times.get_shape().dims[1].value
      if (static_window_size is not None
          and static_window_size < self.window_size):
        raise ValueError(
            ("ARModel requires a window of at least input_window_size + "
             "output_window_size to evaluate on (input_window_size={}, "
             "output_window_size={}, and got shape {} for feature '{}' (batch "
             "size, window size)).").format(
                 self.input_window_size, self.output_window_size,
                 times.get_shape(), TrainEvalFeatures.TIMES))
      # Number of whole output windows that fit after the leading input
      # window; computed from the runtime shape, so this is a Tensor.
      num_iterations = ((array_ops.shape(times)[1] -  self.input_window_size)
                        // self.output_window_size)
      output_size = num_iterations * self.output_window_size
      # Rather than dealing with overlapping windows of output, discard a bit at
      # the beginning if output windows don't cover evenly.
      crop_length = output_size + self.input_window_size
      features = {feature_name: feature_value[:, -crop_length:]
                  for feature_name, feature_value in features.items()}
      # Note that, unlike the ARModel's predict() while_loop, each iteration
      # here can run in parallel, since we are not feeding predictions or state
      # from previous iterations.
      def _while_condition(iteration_number, loss_ta, mean_ta, covariance_ta):
        del loss_ta, mean_ta, covariance_ta  # unused
        return iteration_number < num_iterations

      def _while_body(iteration_number, loss_ta, mean_ta, covariance_ta):
        """Perform a processing step on a single window of data."""
        base_offset = iteration_number * self.output_window_size
        # NOTE(review): `features` was cropped to the last crop_length steps
        # above, but `exogenous_regressors` was computed before that crop and
        # is sliced here with the same offsets. Confirm the two stay aligned
        # when the crop actually discards leading time steps.
        model_outputs = self._process_window(
            features={
                feature_name:
                feature_value[:, base_offset:base_offset + self.window_size]
                for feature_name, feature_value in features.items()},
            mode=mode,
            exogenous_regressors=exogenous_regressors[
                :, base_offset:base_offset + self.window_size])
        # This code needs to be updated if new predictions are added in
        # self._process_window
        assert len(model_outputs.predictions) == 3
        assert "mean" in model_outputs.predictions
        assert "covariance" in model_outputs.predictions
        assert "observed" in model_outputs.predictions
        return (iteration_number + 1,
                loss_ta.write(
                    iteration_number, model_outputs.loss),
                mean_ta.write(
                    iteration_number, model_outputs.predictions["mean"]),
                covariance_ta.write(
                    iteration_number, model_outputs.predictions["covariance"]))
      # Loop variables: the iteration counter plus one TensorArray each for
      # loss, mean and covariance, with one slot per iteration.
      _, loss_ta, mean_ta, covariance_ta = control_flow_ops.while_loop(
          _while_condition, _while_body,
          [0,
           tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations),
           tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations),
           tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations)])
      # `features` here is the cropped dictionary; `times` still refers to the
      # uncropped tensor read at the top of the method.
      values = math_ops.cast(features[TrainEvalFeatures.VALUES],
                             dtype=self.dtype)
      batch_size = array_ops.shape(times)[0]
      prediction_shape = [batch_size, self.output_window_size * num_iterations,
                          self.num_features]
      (previous_state_times,
       previous_state_values,
       previous_state_exogenous_regressors) = state
      # Make sure returned state always has windows of self.input_window_size,
      # even if we were passed fewer than self.input_window_size points this
      # time.
      if self.input_window_size > 0:
        new_state_times = array_ops.concat(
            [previous_state_times,
             math_ops.cast(times, dtype=dtypes.int64)],
            axis=1)[:, -self.input_window_size:]
        new_state_times.set_shape((None, self.input_window_size))
        new_state_values = array_ops.concat(
            [previous_state_values,
             self._scale_data(values)], axis=1)[:, -self.input_window_size:, :]
        new_state_values.set_shape((None, self.input_window_size,
                                    self.num_features))
        new_exogenous_regressors = array_ops.concat(
            [previous_state_exogenous_regressors,
             exogenous_regressors], axis=1)[:, -self.input_window_size:, :]
        new_exogenous_regressors.set_shape(
            (None,
             self.input_window_size,
             self.exogenous_size))
      else:
        # There is no state to keep, and the strided slices above do not handle
        # input_window_size=0.
        new_state_times = previous_state_times
        new_state_values = previous_state_values
        new_exogenous_regressors = previous_state_exogenous_regressors
      # stack() puts the iteration axis first; the transposes swap it with the
      # next axis before the reshape merges iterations into one time axis.
      # Assumes each per-iteration prediction is
      # [batch, output_window_size, num_features] - TODO confirm.
      return model.ModelOutputs(
          loss=math_ops.reduce_mean(loss_ta.stack(), axis=0),
          end_state=(new_state_times,
                     new_state_values,
                     new_exogenous_regressors),
          predictions={
              "mean": array_ops.reshape(
                  array_ops.transpose(mean_ta.stack(), [1, 0, 2, 3]),
                  prediction_shape),
              "covariance": array_ops.reshape(
                  array_ops.transpose(covariance_ta.stack(), [1, 0, 2, 3]),
                  prediction_shape),
              "observed": values[:, -output_size:]},
          prediction_times=times[:, -output_size:])
    else:
      raise ValueError(
          "Unknown mode '{}' passed to get_batch_loss.".format(mode))
Beispiel #42
0
def _RoutingFunctionGradient(op, grad):
    """Computes the gradient of the RoutingFunction op.

    Every factor of the chain rule is expanded to three dimensions,
    (batch_size, num_nodes, num_features), so that the products below
    broadcast correctly.

    Args:
      op: The RoutingFunction op.
      grad: Gradient with respect to the output of the RoutingFunction op.

    Returns:
      A list `[dl_dx, dl_dt, dl_db]`: the gradients with respect to the op's
      three inputs (input data, tree weights, tree thresholds).
    """
    compute_du_df = _training_ops.routing_gradient

    data = op.inputs[0]
    weights = op.inputs[1]
    thresholds = op.inputs[2]
    routing_output = op.outputs[0]

    # dl/du: loss gradient w.r.t. the routing function output, supplied by
    # tensorflow. (batch_size, num_nodes) -> (batch_size, num_nodes, 1).
    upstream = array_ops.expand_dims(grad, 2)

    # du/df: routing output w.r.t. the decision function at each node,
    # computed by the routing_gradient op.
    # (batch_size, num_nodes) -> (batch_size, num_nodes, 1).
    du_df_flat = compute_du_df(
        data,
        weights,
        thresholds,
        routing_output,
        max_nodes=op.get_attr('max_nodes'))
    du_df = array_ops.expand_dims(du_df_flat, 2)

    # df/dx: f_i(x) = (-t_i * x + b_i), so df_i / dx = -t_i.
    # (num_nodes, num_features) -> (1, num_nodes, num_features).
    df_dx = -array_ops.expand_dims(weights, 0)

    # df/dt: f_i(x) = (-t_i * x + b_i), so df_i / d t_i = -x.
    # (batch_size, num_features) -> (batch_size, 1, num_features).
    df_dt = -array_ops.expand_dims(data, 1)

    # df/db: f_i(x) = (-t_i * x + b_i), so df_i / d b_i = 1.
    # (num_nodes) -> (1, num_nodes, 1).
    bias_ones = array_ops.ones_like(thresholds)
    df_db = array_ops.expand_dims(array_ops.expand_dims(bias_ones, 0), 2)

    def _backprop_through(df_dparam):
        # Chain rule: dl/dparam = dl/du * du/df * df/dparam.
        return upstream * du_df * df_dparam

    dl_dx = math_ops.reduce_mean(_backprop_through(df_dx), 1)
    dl_dt = math_ops.reduce_mean(_backprop_through(df_dt), 0)
    dl_db = math_ops.reduce_mean(
        array_ops.squeeze(_backprop_through(df_db), [2]), 0)

    return [dl_dx, dl_dt, dl_db]
Beispiel #43
0
def _KFeatureRoutingFunctionGradient(op, grad):
    """Computes the gradient of the k-feature routing function op.

    The raw partial derivatives come from the `k_feature_gradient` op; this
    function expands them to (batch_size, num_nodes, num_features) so that
    they broadcast, then combines them with the chain rule.

    Args:
      op: The routing function op whose gradient is being computed.
      grad: Gradient with respect to the output of that op.

    Returns:
      A list `[dl_dx, dl_dt, dl_db]`: the gradients with respect to the op's
      three inputs (input data, tree weights, tree thresholds).
    """
    compute_partials = _training_ops.k_feature_gradient

    data = op.inputs[0]
    weights = op.inputs[1]
    thresholds = op.inputs[2]
    routing_output = op.outputs[0]

    du_df_raw, df_dx_raw, df_dt_raw = compute_partials(
        data,
        weights,
        thresholds,
        routing_output,
        layer_num=op.get_attr('layer_num'),
        random_seed=op.get_attr('random_seed'))

    # dl/du: loss gradient w.r.t. the routing function output, supplied by
    # tensorflow. (batch_size, num_nodes) -> (batch_size, num_nodes, 1).
    upstream = array_ops.expand_dims(grad, 2)

    # du/df: routing output w.r.t. the decision function at each node.
    # (batch_size, num_nodes) -> (batch_size, num_nodes, 1).
    du_df = array_ops.expand_dims(du_df_raw, 2)

    # df/dx: f(x) = (-t * x + b), so df / dx = -t for the selected features
    # and zero elsewhere. The raw value is used as returned by the op.
    # (num_nodes, num_features) -> (1, num_nodes, num_features).
    df_dx = array_ops.expand_dims(df_dx_raw, 0)

    # df/dt: f(x) = (-t * x + b), so df / dt = -x[feature].
    # Already (batch_size, num_nodes, num_features); only negated here.
    df_dt = -df_dt_raw

    # df/db: f(x) = (-t * x + b), so df / db = 1.
    # (num_nodes) -> (1, num_nodes, 1).
    bias_ones = array_ops.ones_like(thresholds)
    df_db = array_ops.expand_dims(array_ops.expand_dims(bias_ones, 0), 2)

    def _backprop_through(df_dparam):
        # Chain rule: dl/dparam = dl/du * du/df * df/dparam.
        return upstream * du_df * df_dparam

    dl_dx = math_ops.reduce_mean(_backprop_through(df_dx), 1)
    dl_dt = math_ops.reduce_mean(_backprop_through(df_dt), 0)
    dl_db = math_ops.reduce_mean(
        array_ops.squeeze(_backprop_through(df_db), [2]), 0)

    return [dl_dx, dl_dt, dl_db]
 def compute(x):
     """Returns the mean of `x` over axis 0, keeping that axis in the result."""
     first_axis_mean = math_ops.reduce_mean(x, axis=0, keepdims=True)
     return first_axis_mean
Beispiel #45
0
 def forward_pass(value):
     """Increments `count` and returns (half mean squared residual, count).

     `count` and `model` come from the enclosing scope.
     """
     count.assign_add(1)
     error = value - model
     mean_squared_error = math_ops.reduce_mean(math_ops.pow(error, 2))
     # Note: count is an integer, so its doutput will be None
     return 0.5 * mean_squared_error, count
Beispiel #46
0
    def _testMVN(self,
                 base_distribution_class,
                 base_distribution_kwargs,
                 batch_shape=(),
                 event_shape=(),
                 not_implemented_message=None):
        """Checks a transformed base distribution against scipy's MVN.

        Builds the distribution returned by `self._cls()` twice around
        `base_distribution_class(**base_distribution_kwargs)` with an `Affine`
        bijector (`self._shift`, `self._tril`): once with static
        `batch_shape`/`event_shape` and once with the same shapes fed through
        placeholders. Both are compared with `scipy.stats.multivariate_normal`
        on sample moments, log_prob, prob and entropy.

        Args:
          base_distribution_class: Distribution class to wrap.
          base_distribution_kwargs: Keyword arguments for that class.
          batch_shape: Overriding batch shape.
          event_shape: Overriding event shape.
          not_implemented_message: Regex expected in the NotImplementedError
            raised by the cdf / survival-function family of methods.
        """
        with self.test_session() as sess:
            # Overriding shapes must be compatible w/bijector; most bijectors are
            # batch_shape agnostic and only care about event_ndims.
            # In the case of `Affine`, if we got it wrong then it would fire an
            # exception due to incompatible dimensions.
            batch_shape_pl = array_ops.placeholder(dtypes.int32,
                                                   name="dynamic_batch_shape")
            event_shape_pl = array_ops.placeholder(dtypes.int32,
                                                   name="dynamic_event_shape")
            feed_dict = {
                batch_shape_pl: np.array(batch_shape, dtype=np.int32),
                event_shape_pl: np.array(event_shape, dtype=np.int32)
            }
            # Variant whose shapes are only known when the placeholders are fed.
            fake_mvn_dynamic = self._cls()(
                distribution=base_distribution_class(
                    validate_args=True, **base_distribution_kwargs),
                bijector=bs.Affine(shift=self._shift, scale_tril=self._tril),
                batch_shape=batch_shape_pl,
                event_shape=event_shape_pl,
                validate_args=True)

            # Variant whose shapes are given as Python values.
            fake_mvn_static = self._cls()(distribution=base_distribution_class(
                validate_args=True, **base_distribution_kwargs),
                                          bijector=bs.Affine(
                                              shift=self._shift,
                                              scale_tril=self._tril),
                                          batch_shape=batch_shape,
                                          event_shape=event_shape,
                                          validate_args=True)

            # Reference parameters: covariance is tril @ tril^T per batch member.
            actual_mean = np.tile(self._shift,
                                  [2, 1])  # Affine elided this tile.
            actual_cov = np.matmul(self._tril,
                                   np.transpose(self._tril, [0, 2, 1]))

            def actual_mvn_log_prob(x):
                # One scipy logpdf per batch member, stacked then transposed so
                # the batch index is the last axis of the result.
                return np.concatenate([[
                    stats.multivariate_normal(actual_mean[i],
                                              actual_cov[i]).logpdf(x[:, i, :])
                ] for i in range(len(actual_cov))]).T

            actual_mvn_entropy = np.concatenate([[
                stats.multivariate_normal(actual_mean[i],
                                          actual_cov[i]).entropy()
            ] for i in range(len(actual_cov))])

            self.assertAllEqual([3], fake_mvn_static.event_shape)
            self.assertAllEqual([2], fake_mvn_static.batch_shape)

            # With placeholder shapes the static shapes are fully unknown.
            self.assertAllEqual(tensor_shape.TensorShape(None),
                                fake_mvn_dynamic.event_shape)
            self.assertAllEqual(tensor_shape.TensorShape(None),
                                fake_mvn_dynamic.batch_shape)

            # These methods are expected to raise NotImplementedError.
            x = fake_mvn_static.sample(5, seed=0).eval()
            for unsupported_fn in (fake_mvn_static.log_cdf,
                                   fake_mvn_static.cdf,
                                   fake_mvn_static.survival_function,
                                   fake_mvn_static.log_survival_function):
                with self.assertRaisesRegexp(NotImplementedError,
                                             not_implemented_message):
                    unsupported_fn(x)

            num_samples = 5e3
            # The loop header rebinds `feed_dict`; the tuple of pairs is built
            # before the first rebinding, so the dynamic case still receives
            # the placeholder feeds defined above.
            for fake_mvn, feed_dict in ((fake_mvn_static, {}),
                                        (fake_mvn_dynamic, feed_dict)):
                # Ensure sample works by checking first, second moments.
                y = fake_mvn.sample(int(num_samples), seed=0)
                x = y[0:5, ...]
                sample_mean = math_ops.reduce_mean(y, 0)
                # Move the sample axis last so the matmul below sums over it.
                centered_y = array_ops.transpose(y - sample_mean, [1, 2, 0])
                sample_cov = math_ops.matmul(
                    centered_y, centered_y, transpose_b=True) / num_samples
                [
                    sample_mean_,
                    sample_cov_,
                    x_,
                    fake_event_shape_,
                    fake_batch_shape_,
                    fake_log_prob_,
                    fake_prob_,
                    fake_entropy_,
                ] = sess.run([
                    sample_mean,
                    sample_cov,
                    x,
                    fake_mvn.event_shape_tensor(),
                    fake_mvn.batch_shape_tensor(),
                    fake_mvn.log_prob(x),
                    fake_mvn.prob(x),
                    fake_mvn.entropy(),
                ],
                             feed_dict=feed_dict)

                self.assertAllClose(actual_mean,
                                    sample_mean_,
                                    atol=0.1,
                                    rtol=0.1)
                self.assertAllClose(actual_cov, sample_cov_, atol=0., rtol=0.1)

                # Ensure all other functions work as intended.
                self.assertAllEqual([5, 2, 3], x_.shape)
                self.assertAllEqual([3], fake_event_shape_)
                self.assertAllEqual([2], fake_batch_shape_)
                self.assertAllClose(actual_mvn_log_prob(x_),
                                    fake_log_prob_,
                                    atol=0.,
                                    rtol=1e-6)
                self.assertAllClose(np.exp(actual_mvn_log_prob(x_)),
                                    fake_prob_,
                                    atol=0.,
                                    rtol=1e-5)
                self.assertAllClose(actual_mvn_entropy,
                                    fake_entropy_,
                                    atol=0.,
                                    rtol=1e-6)
Beispiel #47
0
def average(a, axis=None, weights=None, returned=False):  # pylint: disable=missing-docstring
    """Computes the optionally weighted average of `a`.

    Args:
      a: Array-like input; converted with `np_array_ops.array`.
      axis: `None` (reduce over everything) or a single integer axis.
      weights: Optional array-like weights. When `axis` is `None` the
        same-rank path is used; otherwise the path is chosen at run time by
        comparing the ranks of `a` and `weights`, and differing ranks
        contract `a`'s `axis` with axis 0 of `weights`.
      returned: If true, also return the sum of weights broadcast to the
        shape of the average.

    Returns:
      The average, or the tuple `(average, weights_sum)` when `returned`.

    Raises:
      ValueError: If `axis` is neither `None` nor an integer.
    """
    if not (axis is None or isinstance(axis, six.integer_types)):
        # TODO(wangpeng): Support tuple of ints as `axis`
        raise ValueError('Argument `axis` must be an integer. '
                         f'Received axis={axis} (of type {type(axis)})')

    a = np_array_ops.array(a)
    input_is_inexact = np.issubdtype(a.dtype.as_numpy_dtype, np.inexact)

    if weights is None:  # Treat all weights as 1
        if not input_is_inexact:
            # Non-inexact inputs are promoted with the default float type.
            promoted = np_utils.result_type(a.dtype,
                                            np_dtypes.default_float_type())
            a = a.astype(promoted)
        avg = math_ops.reduce_mean(a, axis=axis)
        if returned:
            # With unit weights the weight sum is the element count.
            if axis is None:
                element_count = array_ops.size(a)
            else:
                element_count = array_ops.shape(a)[axis]
            weights_sum = math_ops.cast(element_count, a.dtype)
    else:
        if input_is_inexact:
            out_dtype = np_utils.result_type(a.dtype, weights)
        else:
            out_dtype = np_utils.result_type(a.dtype, weights,
                                             np_dtypes.default_float_type())
        a = np_array_ops.array(a, out_dtype)
        weights = np_array_ops.array(weights, out_dtype)

        def _same_rank_average():
            # Element-wise weighting; builds an Assert that shapes are equal.
            a_shape = array_ops.shape(a)
            w_shape = array_ops.shape(weights)
            control_flow_ops.Assert(
                math_ops.reduce_all(a_shape == w_shape),
                [array_ops.shape(a),
                 array_ops.shape(weights)])
            total_weight = math_ops.reduce_sum(weights, axis=axis)
            weighted = math_ops.reduce_sum(a * weights, axis=axis)
            return weighted / total_weight, total_weight

        def _vector_weights_average():
            # Builds an Assert that `weights` has rank 1, then contracts
            # `a`'s `axis` with axis 0 of `weights`.
            control_flow_ops.Assert(
                array_ops.rank(weights) == 1, [array_ops.rank(weights)])
            total_weight = math_ops.reduce_sum(weights)
            contraction_axes = ops.convert_to_tensor([[axis], [0]])
            weighted = math_ops.tensordot(a, weights, contraction_axes)
            return weighted / total_weight, total_weight

        if axis is None:
            avg, weights_sum = _same_rank_average()
        else:
            # We condition on rank rather than shape equality, because if we do
            # the latter, when the shapes are partially unknown but the ranks
            # are known and different, np_utils.cond will run shape checking on
            # the true branch, which will raise a shape-checking error.
            ranks_match = math_ops.equal(array_ops.rank(a),
                                         array_ops.rank(weights))
            avg, weights_sum = np_utils.cond(
                ranks_match, _same_rank_average, _vector_weights_average)

    avg = np_array_ops.array(avg)
    if not returned:
        return avg
    weights_sum = np_array_ops.broadcast_to(weights_sum, array_ops.shape(avg))
    return avg, weights_sum
Beispiel #48
0
def _single_op_with_attrs():
  """Build a tiny functional graph around one op that carries attributes.

  A 10-feature Keras input is averaged over axis 1 with `keepdims=True` using
  `math_ops.reduce_mean`, and the result is fed to a 10-unit `Dense` layer.

  Returns:
    A `(inputs, outputs)` tuple holding the Keras input and the output of the
    `Dense` layer.
  """
  model_inputs = keras.Input(shape=(10,))
  averaged = math_ops.reduce_mean(model_inputs, axis=1, keepdims=True)
  dense_layer = keras.layers.Dense(10)
  return model_inputs, dense_layer(averaged)
Beispiel #49
0
 def computation(x):
     """Return the mean of `x` as computed by `math_ops.reduce_mean`."""
     mean_of_x = math_ops.reduce_mean(x)
     return mean_of_x
Beispiel #50
0
    def testSampleLarge(self):
        """Check sampled and analytical MVN statistics against NumPy values.

        A `MultivariateNormalTriL` is built from a fixed `loc` / `scale_tril`
        pair, 10,000 samples are drawn with seed 0, and the sample-based and
        analytical mean, covariance, variance, stddev, (log-)determinant of
        the covariance and dense scale are compared with values computed
        directly in NumPy. The sample estimate of the KL divergence to a
        second `MultivariateNormalTriL` is compared with `ds.kl`.
        """
        loc = np.array([-1., 1], dtype=np.float32)
        tril = np.array([[3., 0], [1, -2]], dtype=np.float32) / 3.

        # Reference quantities derived with NumPy only.
        true_mean = loc
        true_scale = tril
        true_covariance = np.matmul(true_scale, true_scale.T)
        true_variance = np.diag(true_covariance)
        true_stddev = np.sqrt(true_variance)
        true_det_covariance = np.linalg.det(true_covariance)
        true_log_det_covariance = np.log(true_det_covariance)

        with self.test_session() as sess:
            dist = ds.MultivariateNormalTriL(
                loc=loc, scale_tril=tril, validate_args=True)

            # Second distribution, used only to exercise the KL divergence.
            mvn_chol = ds.MultivariateNormalTriL(
                loc=np.array([0.5, 1.2], dtype=np.float32),
                scale_tril=np.array([[3., 0], [1, 2]], dtype=np.float32),
                validate_args=True)

            num_samples = int(10e3)
            draws = dist.sample(num_samples, seed=0)
            sample_mean = math_ops.reduce_mean(draws, 0)
            centered = draws - sample_mean
            sample_covariance = (
                math_ops.matmul(centered, centered, transpose_a=True) /
                num_samples)

            # Monte Carlo estimate of KL(dist || mvn_chol) from the draws.
            log_prob_gap = dist.log_prob(draws) - mvn_chol.log_prob(draws)
            sample_kl_chol = math_ops.reduce_mean(log_prob_gap, 0)
            analytical_kl_chol = ds.kl(dist, mvn_chol)

            scale = dist.scale.to_dense()

            fetches = [
                sample_mean,
                dist.mean(),
                sample_covariance,
                dist.covariance(),
                dist.variance(),
                dist.stddev(),
                dist.log_det_covariance(),
                dist.det_covariance(),
                sample_kl_chol,
                analytical_kl_chol,
                scale,
            ]
            (sample_mean_, analytical_mean_, sample_covariance_,
             analytical_covariance_, analytical_variance_, analytical_stddev_,
             analytical_log_det_covariance_, analytical_det_covariance_,
             sample_kl_chol_, analytical_kl_chol_,
             scale_) = sess.run(fetches)

            # Remaining sample statistics follow from the sample covariance.
            sample_variance_ = np.diag(sample_covariance_)
            sample_stddev_ = np.sqrt(sample_variance_)
            sample_det_covariance_ = np.linalg.det(sample_covariance_)
            sample_log_det_covariance_ = np.log(sample_det_covariance_)

            # (format template, value) pairs, logged in this order.
            log_entries = (
                ("true_mean:\n{}  ", true_mean),
                ("sample_mean:\n{}", sample_mean_),
                ("analytical_mean:\n{}", analytical_mean_),
                ("true_covariance:\n{}", true_covariance),
                ("sample_covariance:\n{}", sample_covariance_),
                ("analytical_covariance:\n{}", analytical_covariance_),
                ("true_variance:\n{}", true_variance),
                ("sample_variance:\n{}", sample_variance_),
                ("analytical_variance:\n{}", analytical_variance_),
                ("true_stddev:\n{}", true_stddev),
                ("sample_stddev:\n{}", sample_stddev_),
                ("analytical_stddev:\n{}", analytical_stddev_),
                ("true_log_det_covariance:\n{}", true_log_det_covariance),
                ("sample_log_det_covariance:\n{}",
                 sample_log_det_covariance_),
                ("analytical_log_det_covariance:\n{}",
                 analytical_log_det_covariance_),
                ("true_det_covariance:\n{}", true_det_covariance),
                ("sample_det_covariance:\n{}", sample_det_covariance_),
                ("analytical_det_covariance:\n{}",
                 analytical_det_covariance_),
                ("true_scale:\n{}", true_scale),
                ("scale:\n{}", scale_),
            )
            for template, value in log_entries:
                logging.vlog(2, template.format(value))
            logging.vlog(
                2, "kl_chol:      analytical:{}  sample:{}".format(
                    analytical_kl_chol_, sample_kl_chol_))

            # (first arg, second arg, rtol) triples; sample-based comparisons
            # use looser tolerances than analytical ones.
            closeness_checks = (
                (true_mean, sample_mean_, 0.03),
                (true_mean, analytical_mean_, 1e-6),
                (true_covariance, sample_covariance_, 0.03),
                (true_covariance, analytical_covariance_, 1e-6),
                (true_variance, sample_variance_, 0.02),
                (true_variance, analytical_variance_, 1e-6),
                (true_stddev, sample_stddev_, 0.01),
                (true_stddev, analytical_stddev_, 1e-6),
                (true_log_det_covariance, sample_log_det_covariance_, 0.04),
                (true_log_det_covariance, analytical_log_det_covariance_,
                 1e-6),
                (true_det_covariance, sample_det_covariance_, 0.03),
                (true_det_covariance, analytical_det_covariance_, 1e-6),
                (true_scale, scale_, 1e-6),
                (sample_kl_chol_, analytical_kl_chol_, 0.02),
            )
            for first, second, rtol in closeness_checks:
                self.assertAllClose(first, second, atol=0., rtol=rtol)
Beispiel #51
0
 def fn():
     """Return the mean of `x.value() + 1.0` stacked twice along axis 0."""
     incremented = math_ops.add(x.value(), 1.0)
     # Passing a Python list here makes sure convert_to_tensor works
     # correctly with a list of TensorNodes.
     stacked = array_ops.stack([incremented, incremented], axis=0)
     return math_ops.reduce_mean(stacked)
Beispiel #52
0
    def call(self, inputs, training=False):
        """Apply batch normalization to `inputs`.

        When `self.virtual_batch_size` is set, `inputs` is first reshaped to
        `[virtual_batch_size, -1] + inputs.shape[1:]` and the result is
        reshaped back before being returned. When `self.fused` is true the
        work is delegated to `self._fused_batch_norm`. Otherwise the batch
        moments are computed with `nn.moments`, optionally combined with the
        `self.adjustment` and renorm transforms, the moving statistics are
        updated, and `nn.batch_normalization` produces the output.

        Args:
            inputs: Tensor to normalize.
            training: Selects batch statistics (training) versus the moving
                statistics (inference). It is passed to
                `utils.constant_value` and `utils.smart_cond`, so presumably
                either a Python boolean or a boolean tensor -- TODO confirm.

        Returns:
            The normalized tensor. On the non-fused path its static shape is
            set to that of the (possibly reshaped) input before any virtual
            batching is undone.
        """
        if self.virtual_batch_size is not None:
            # Virtual batches (aka ghost batches) can be simulated by reshaping the
            # Tensor and reusing the existing batch norm implementation
            # The leading -1 lets reshape infer the batch dimension.
            original_shape = [-1] + inputs.shape.as_list()[1:]
            expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

            # Will cause errors if virtual_batch_size does not divide the batch size
            inputs = array_ops.reshape(inputs, expanded_shape)

            # Only defined (and only called) when virtual batching is enabled.
            def undo_virtual_batching(outputs):
                outputs = array_ops.reshape(outputs, original_shape)
                return outputs

        if self.fused:
            outputs = self._fused_batch_norm(inputs, training=training)
            if self.virtual_batch_size is not None:
                # Currently never reaches here since fused_batch_norm does not support
                # virtual batching
                return undo_virtual_batching(outputs)
            return outputs

        # Compute the axes along which to reduce the mean / variance
        input_shape = inputs.get_shape()
        ndims = len(input_shape)
        reduction_axes = [i for i in range(ndims) if i not in self.axis]
        if self.virtual_batch_size is not None:
            del reduction_axes[1]  # Do not reduce along virtual batch dim

        # Broadcasting only necessary for single-axis batch norm where the axis is
        # not the last dimension
        broadcast_shape = [1] * ndims
        broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value

        def _broadcast(v):
            # Reshape `v` to `broadcast_shape` unless it is None, already has
            # `ndims` dimensions, or the reduction covers every axis but the
            # last one.
            if (v is not None and len(v.get_shape()) != ndims
                    and reduction_axes != list(range(ndims - 1))):
                return array_ops.reshape(v, broadcast_shape)
            return v

        scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

        def _compose_transforms(scale, offset, then_scale, then_offset):
            # Combine `x * scale + offset` followed by
            # `* then_scale + then_offset` into a single (scale, offset)
            # pair; a None `then_*` leaves that part unchanged.
            if then_scale is not None:
                scale *= then_scale
                offset *= then_scale
            if then_offset is not None:
                offset += then_offset
            return (scale, offset)

        # Determine a boolean value for `training`: could be True, False, or None.
        training_value = utils.constant_value(training)
        if training_value is not False:
            if self.adjustment:
                adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
                # Adjust only during training.
                adj_scale = utils.smart_cond(
                    training, lambda: adj_scale,
                    lambda: array_ops.ones_like(adj_scale))
                adj_bias = utils.smart_cond(
                    training, lambda: adj_bias,
                    lambda: array_ops.zeros_like(adj_bias))
                scale, offset = _compose_transforms(adj_scale, adj_bias, scale,
                                                    offset)

            # Some of the computations here are not necessary when training==False
            # but not a constant. However, this makes the code simpler.
            keep_dims = self.virtual_batch_size is not None or len(
                self.axis) > 1
            mean, variance = nn.moments(inputs,
                                        reduction_axes,
                                        keep_dims=keep_dims)

            moving_mean = self.moving_mean
            moving_variance = self.moving_variance

            # Use the batch moments when training, else the moving statistics.
            mean = utils.smart_cond(training, lambda: mean,
                                    lambda: moving_mean)
            variance = utils.smart_cond(training, lambda: variance,
                                        lambda: moving_variance)

            if self.renorm:
                r, d, new_mean, new_variance = self._renorm_correction_and_moments(
                    mean, variance, training)
                # When training, the normalized values (say, x) will be transformed as
                # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
                # = x * (r * gamma) + (d * gamma + beta) with renorm.
                r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
                d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
                scale, offset = _compose_transforms(r, d, scale, offset)
            else:
                new_mean, new_variance = mean, variance

            if self.virtual_batch_size is not None:
                # This isn't strictly correct since in ghost batch norm, you are
                # supposed to sequentially update the moving_mean and moving_variance
                # with each sub-batch. However, since the moving statistics are only
                # used during evaluation, it is more efficient to just update in one
                # step and should not make a significant difference in the result.
                new_mean = math_ops.reduce_mean(new_mean,
                                                axis=1,
                                                keep_dims=True)
                new_variance = math_ops.reduce_mean(new_variance,
                                                    axis=1,
                                                    keep_dims=True)

            def _do_update(var, value):
                # Moving-average update of `var` towards `value` using
                # `self.momentum`, without zero-debiasing.
                return moving_averages.assign_moving_average(var,
                                                             value,
                                                             self.momentum,
                                                             zero_debias=False)

            # The moving statistics are only updated when training; otherwise
            # the current values are passed through unchanged.
            mean_update = utils.smart_cond(
                training, lambda: _do_update(self.moving_mean, new_mean),
                lambda: self.moving_mean)
            variance_update = utils.smart_cond(
                training,
                lambda: _do_update(self.moving_variance, new_variance),
                lambda: self.moving_variance)
            # The updates are registered on the layer only in graph mode.
            if context.in_graph_mode():
                self.add_update(mean_update, inputs=inputs)
                self.add_update(variance_update, inputs=inputs)

        else:
            # `training` is statically False: normalize with the moving
            # statistics and skip the update machinery entirely.
            mean, variance = self.moving_mean, self.moving_variance

        outputs = nn.batch_normalization(inputs, _broadcast(mean),
                                         _broadcast(variance), offset, scale,
                                         self.epsilon)
        # If some components of the shape got lost due to adjustments, fix that.
        outputs.set_shape(input_shape)

        if self.virtual_batch_size is not None:
            return undo_virtual_batching(outputs)

        return outputs
 def body(i, x, y):
     """Add the mean of one `[1, 1, 2048]` slice of `x` to `y`.

     The slice starts at index `[i, i, i]`. Returns the tuple
     `(i + 1, x, y + slice_mean)`.
     """
     window = array_ops.slice(x, [i, i, i], [1, 1, 2048])
     accumulated = y + math_ops.reduce_mean(window)
     next_index = i + 1
     return (next_index, x, accumulated)
Beispiel #54
0
 def loss(x, l):
     """Return softmax cross-entropy of logits `x` and labels `l`, averaged.

     The per-example losses are reduced with `math_ops.reduce_mean` over the
     axes given by the constant tensor `[0]`.
     """
     per_example = nn_ops.softmax_cross_entropy_with_logits(logits=x, labels=l)
     reduction_axes = constant_op.constant([0])
     return math_ops.reduce_mean(per_example, reduction_axes)
Beispiel #55
0
    def result(self):
        """Compute the AUC from the accumulated confusion-matrix counts.

        The PR curve with interpolation summation is delegated to
        `self.interpolate_pr_auc()`. Otherwise the curve's `x` / `y` values are
        derived from the true/false positive/negative counts, rectangle
        heights are chosen according to `self.summation_method`, and the
        Riemann terms `(x[:-1] - x[1:]) * heights` are formed.

        Returns:
            * If `self.multi_label` is false: the sum of the Riemann terms
              when `self._summarize` is true, else the terms themselves.
            * If `self.multi_label` is true: the per-label AUCs when
              `self._summarize` is false; otherwise their plain mean, or
              their `self.label_weights`-weighted mean when weights are set.
        """
        summation = self.summation_method
        methods = metrics_utils.AUCSummationMethod
        if (self.curve == metrics_utils.AUCCurve.PR
                and summation == methods.INTERPOLATION):
            # This combination needs its own treatment.
            return self.interpolate_pr_auc()

        # Pick the curve's axes: (FPR, recall) for ROC, (recall, precision)
        # for PR.
        tp = self.true_positives
        recall = math_ops.div_no_nan(tp, tp + self.false_negatives)
        if self.curve == metrics_utils.AUCCurve.ROC:
            fp = self.false_positives
            x = math_ops.div_no_nan(fp, fp + self.true_negatives)
            y = recall
        else:  # curve == 'PR'.
            x = recall
            y = math_ops.div_no_nan(tp, tp + self.false_positives)

        # Rectangle heights from adjacent `y` values.
        last = self.num_thresholds - 1
        y_left, y_right = y[:last], y[1:]
        if summation == methods.INTERPOLATION:
            # ('PR', 'interpolation') was already handled above.
            heights = (y_left + y_right) / 2.
        elif summation == methods.MINORING:
            heights = math_ops.minimum(y_left, y_right)
        else:  # MAJORING
            heights = math_ops.maximum(y_left, y_right)

        # Area of each rectangle.
        riemann_terms = math_ops.multiply(x[:last] - x[1:], heights)

        if not self.multi_label:
            if self._summarize:
                return math_ops.reduce_sum(riemann_terms, name=self.name)
            return riemann_terms

        by_label_auc = math_ops.reduce_sum(riemann_terms,
                                           name=self.name + '_by_label',
                                           axis=0)
        if not self._summarize:
            return by_label_auc
        if self.label_weights is None:
            # Unweighted average of the label AUCs.
            return math_ops.reduce_mean(by_label_auc, name=self.name)

        # Weighted average of the label AUCs.
        weighted_total = math_ops.reduce_sum(
            math_ops.multiply(by_label_auc, self.label_weights))
        weight_total = math_ops.reduce_sum(self.label_weights)
        return math_ops.div_no_nan(weighted_total,
                                   weight_total,
                                   name=self.name)
 def call(self, inputs):
     """Apply `self.bias` to `inputs[0]` and register two losses.

     The first loss is `MAE()` called with `inputs[1]`, the bias output and
     `inputs[2]`; the second is the mean of `inputs[2] * mae(inputs[1], out)`.
     Returns the bias output.
     """
     predictions = self.bias(inputs[0])
     object_loss = MAE()(inputs[1], predictions, inputs[2])
     self.add_loss(object_loss)
     function_loss = math_ops.reduce_mean(
         inputs[2] * mae(inputs[1], predictions))
     self.add_loss(function_loss)
     return predictions
Beispiel #57
0
def ngrams(data,
           width,
           axis=-1,
           reduction_type=None,
           string_separator=" ",
           name=None):
    """Build a tensor of n-grams by reducing sliding windows over `data`.

    Windows of `width` adjacent elements are taken along `axis` with
    `sliding_window` and each window is collapsed using `reduction_type`.
    This op covers the basic use cases; more elaborate combinations can be
    assembled directly from the sliding_window op.

    Args:
        data: The data to reduce.
        width: The width of the ngram window. If there is not sufficient data
            to fill out the ngram window, the resulting ngram will be empty.
        axis: The axis to create ngrams along. String-join reductions only
            support axis `-1`; other reductions accept any axis. Should be a
            constant.
        reduction_type: A member of the Reduction enum. Should be a constant.
            Handled values:

            * `Reduction.SUM`: Add values in the window.
            * `Reduction.MEAN`: Average values in the window.
            * `Reduction.STRING_JOIN`: Join strings in the window (axis must
              be -1).

        string_separator: The separator string used for
            `Reduction.STRING_JOIN`; ignored otherwise. Must be a string
            constant, not a Tensor.
        name: The op name.

    Returns:
        A tensor of ngrams.

    Raises:
        InvalidArgumentError: if `reduction_type` is either None or not a
            Reduction, or if `reduction_type` is STRING_JOIN and `axis` is
            not -1.
    """
    with ops.name_scope(name, "NGrams", [data, width]):
        # Argument validation.
        if reduction_type is None:
            raise errors.InvalidArgumentError(
                None, None, "reduction_type must be specified.")
        if not isinstance(reduction_type, Reduction):
            raise errors.InvalidArgumentError(
                None, None, "reduction_type must be a Reduction.")

        # TODO(b/122967921): Lift this restriction after ragged_reduce_join is done.
        joining_strings = reduction_type is Reduction.STRING_JOIN
        if joining_strings and axis != -1:
            raise errors.InvalidArgumentError(
                None, None,
                "%s requires that ngrams' 'axis' parameter be -1." %
                Reduction.STRING_JOIN.name)

        windows = sliding_window(data, width, axis)

        # A non-negative axis is shifted by one for the reduction; a negative
        # axis is used as given.
        reduction_axis = axis if axis < 0 else axis + 1

        # Ragged reduction ops accept both Tensor and RaggedTensor, so the
        # type of `windows` does not matter for SUM / MEAN.
        if reduction_type is Reduction.SUM:
            return math_ops.reduce_sum(windows, reduction_axis)
        if reduction_type is Reduction.MEAN:
            return math_ops.reduce_mean(windows, reduction_axis)
        if joining_strings:
            join_kwargs = dict(axis=axis, separator=string_separator)
            if isinstance(data, ragged_tensor.RaggedTensor):
                return ragged_functional_ops.map_flat_values(
                    string_ops.reduce_join, windows, **join_kwargs)
            return string_ops.reduce_join(windows, **join_kwargs)
        # Any other Reduction member yields no result.
        return None
 def testEmptyGradients(self):
     """Gradient check of `reduce_mean` on a `[0, 3]` input reports 0 error."""
     with self.session(use_gpu=True):
         empty_input = array_ops.zeros([0, 3])
         row_means = math_ops.reduce_mean(empty_input, [1])
         grad_error = gradient_checker.compute_gradient_error(
             empty_input, [0, 3], row_means, [0])
         self.assertEqual(grad_error, 0)
Beispiel #59
0
 def test_loss():
     """Return the mean squared error of `line_template` on the test data."""
     residual = line_template(test_input) - test_output
     squared_error = math_ops.square(residual)
     return math_ops.reduce_mean(squared_error)
 def normal_function():
   """Return the mean of the product of two random-uniform matrices.

   A `(2, 10)` matrix is multiplied by a `(10, 2)` matrix, both drawn with
   `random_ops.random_uniform`, and the product is averaged.
   """
   lhs = random_ops.random_uniform((2, 10))
   rhs = random_ops.random_uniform((10, 2))
   product = math_ops.matmul(lhs, rhs)
   return math_ops.reduce_mean(product)