Ejemplo n.º 1
 def f(a, b):
     """Return an element-wise "is close" result for `a` and `b`.

     `rtol`, `atol` and `equal_nan` are not parameters; they are read from
     the enclosing scope.

     Args:
       a: First operand. Must expose a `.dtype` whose `as_numpy_dtype` can be
         classified by NumPy.
       b: Second operand, combined with `a` through `-` and `==`.

     Returns:
       For inexact dtypes (NumPy's `np.inexact` family), the result of
       `|a - b| <= atol + rtol * |b|`, OR-ed with "both are NaN" when
       `equal_nan` is truthy. For every other dtype, the result of `a == b`.
     """
     dtype = a.dtype
     if np.issubdtype(dtype.as_numpy_dtype, np.inexact):
         # Tolerances are converted to the real counterpart of the dtype so
         # they can be compared against the magnitudes returned by `abs`.
         rtol_ = ops.convert_to_tensor(rtol, dtype.real_dtype)
         atol_ = ops.convert_to_tensor(atol, dtype.real_dtype)
         result = (math_ops.abs(a - b) <= atol_ + rtol_ * math_ops.abs(b))
         if equal_nan:
             result = result | (math_ops.is_nan(a) & math_ops.is_nan(b))
         return result
     # Exact dtypes have no rounding error, so tolerances do not apply. This
     # return used to sit (unreachable) inside the branch above, which made
     # the function return None for these dtypes.
     return a == b
Ejemplo n.º 2
  def testUniformNans(self):
    """Checks that `Uniform.prob` is NaN exactly where its input is NaN."""
    a = 10.0
    b = [11.0, 100.0]
    uniform = uniform_lib.Uniform(low=a, high=b)

    no_nans = constant_op.constant(1.0)
    # 0 / 0 produces a NaN tensor without needing a NaN literal.
    nans = constant_op.constant(0.0) / constant_op.constant(0.0)
    with_nans = array_ops.stack([no_nans, nans])

    pdf = uniform.prob(with_nans)

    is_nan = self.evaluate(math_ops.is_nan(pdf))
    # Previously the mask was computed and then discarded, so the test could
    # never fail. The finite input must give a non-NaN density and the NaN
    # input must stay NaN.
    self.assertFalse(is_nan[0])
    self.assertTrue(is_nan[1])
Ejemplo n.º 4
  def check_grads(grads_and_vars):
    """Check whether the gradients contain Inf or NaN.

    Args:
      grads_and_vars: list of tuple (grad, var),
        normally the output of opt.compute_gradients. Bare gradients (not
        wrapped in a tuple) are accepted too, and `None` gradients are
        skipped.

    Returns:
      has_nan: boolean tensor, True if any gradient contains a NaN,
        otherwise False.
      amax: tensor denoting the maximum absolute value in the gradients.
    """
    has_nan_ops = []
    amax_ops = []

    for grad in grads_and_vars:
      if isinstance(grad, tuple):
        grad = grad[0]
      if grad is None:
        continue

      # Sparse gradients keep their numbers in `.values`. Without the
      # `else`, the dense assignment used to overwrite this unconditionally.
      if isinstance(grad, ops.IndexedSlices):
        x = grad.values
      else:
        x = grad

      if x.dtype != dtypes.float32:
        x = math_ops.cast(x, dtypes.float32)

      # Record the per-gradient statistics. These appends were missing, so
      # the final reductions always ran over empty lists.
      has_nan_ops.append(math_ops.reduce_any(math_ops.is_nan(x)))
      amax_ops.append(math_ops.reduce_max(math_ops.abs(x)))

    has_nan = math_ops.reduce_any(has_nan_ops)
    amax = math_ops.reduce_max(amax_ops)
    return has_nan, amax
Ejemplo n.º 10
    def _apply_transform(self, input_tensors):
        """Applies the transformation to the `transform_input`.

        Every element equal to `self.strip_value` (or every NaN, when
        `self.strip_value` is NaN) is dropped and the remaining elements are
        returned as a sparse tensor.

        Args:
          input_tensors: a list of Tensors representing the input to
            the Transform. Only the first entry is used.
            NOTE(review): the scalar reshape of its shape below only works
            for a 1-D tensor -- confirm against callers.

        Returns:
          A namedtuple of Tensors representing the transformed output.
        """
        d = input_tensors[0]

        strip_value = self.strip_value
        # NaN never compares equal to itself, so it needs `is_nan` rather
        # than `equal`. An identity test against `np.nan` misses NaNs that
        # are not that exact object (e.g. `float('nan')`), so test the value.
        strip_is_nan = (isinstance(strip_value, (float, np.floating))
                        and np.isnan(strip_value))
        if strip_is_nan:
            strip_hot = math_ops.is_nan(d)
        else:
            strip_hot = math_ops.equal(
                d, array_ops.constant([strip_value], dtype=d.dtype))
        keep_hot = math_ops.logical_not(strip_hot)

        # Positions and values of the entries that survive the strip.
        length = array_ops.reshape(array_ops.shape(d), [])
        indices = array_ops.boolean_mask(math_ops.range(length), keep_hot)
        values = array_ops.boolean_mask(d, keep_hot)

        # The sparse index matrix has one int64 column per dimension.
        sparse_indices = array_ops.reshape(
            math_ops.cast(indices, dtypes.int64), [-1, 1])
        shape = math_ops.cast(array_ops.shape(d), dtypes.int64)

        # pylint: disable=not-callable
        return self.return_type(ops.SparseTensor(sparse_indices, values,
                                                 shape))
Ejemplo n.º 18
def _calculate_acceptance_probabilities(init_probs, target_probs):
  """Calculate the per-class acceptance rates.

  Args:
    init_probs: The class probabilities of the data.
    target_probs: The desired class proportion in minibatches.

  Returns:
    A list of the per-class acceptance probabilities.

  This method is based on solving the following analysis:

  Let F be the probability of a rejection (on any example).
  Let p_i be the proportion of examples in the data in class i (init_probs)
  Let a_i be the rate the rejection sampler should *accept* class i
  Let t_i be the target proportion in the minibatches for class i
  (target_probs)

  F = sum_i(p_i * (1-a_i))
    = 1 - sum_i(p_i * a_i)     using sum_i(p_i) = 1

  An example with class `i` will be accepted if `k` rejections occur, then an
  example with class `i` is seen by the rejector, and it is accepted. This can
  be written as follows:

  t_i = sum_k=0^inf(F^k * p_i * a_i)
      = p_i * a_i / (1 - F)    using geometric series identity, since 0 <= F < 1
      = p_i * a_i / sum_j(p_j * a_j)        using F from above

  Note that the following constraints hold:
  0 <= p_i <= 1, sum_i(p_i) = 1
  0 <= a_i <= 1
  0 <= t_i <= 1, sum_i(t_i) = 1

  A solution for a_i in terms of the other variables is the following:
    ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
  """
  # Make list of t_i / p_i.
  ratio_l = target_probs / init_probs

  # Replace NaNs with 0s. A NaN ratio arises from 0 / 0, i.e. a class that is
  # both absent from the data and not requested in the minibatches.
  ratio_l = math_ops.select(math_ops.is_nan(ratio_l),
                            array_ops.zeros_like(ratio_l),
                            ratio_l)

  # Calculate list of acceptance probabilities.
  max_ratio = math_ops.reduce_max(ratio_l)
  return ratio_l / max_ratio
Ejemplo n.º 23
def sparsemax_loss(logits, sparsemax, labels, name=None):
    """Computes sparsemax loss function [1].

    [1]: https://arxiv.org/abs/1602.02068

    Args:
      logits: A floating-point `Tensor` (e.g. `half`, `float32`).
      sparsemax: A `Tensor`. Must have the same type as `logits`.
      labels: A `Tensor`. Must have the same type as `logits`.
      name: A name for the operation (optional).

    Returns:
      A `Tensor`. Has the same type as `logits`. The per-element loss terms
      are summed over axis 1.
    """

    with ops.name_scope(name, "sparsemax_loss",
                        [logits, sparsemax, labels]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
        labels = ops.convert_to_tensor(labels, name="labels")

        # In the paper, they call the logits z.
        # A constant can be subtracted from logits to make the algorithm
        # more numerically stable in theory. However, there is really no
        # major source of numerical instability in this algorithm.
        z = logits

        # sum over support
        # Use a conditional where instead of a multiplication to support
        # z = -inf. If z = -inf, and there is no support (sparsemax = 0), a
        # multiplication would cause 0 * -inf = nan, which is not correct in
        # this case.
        sum_s = array_ops.where(
            math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
            sparsemax * (z - 0.5 * sparsemax), array_ops.zeros_like(sparsemax))

        # - z_k + ||q||^2
        q_part = labels * (0.5 * labels - z)
        # Fix the case where labels = 0 and z = -inf, where q_part would
        # otherwise be 0 * -inf = nan. But since the labels = 0, no cost for
        # z = -inf should be considered.
        # The code below also covers the case where z = inf. However, in this
        # case the sparsemax will be nan, which means the sum_s will also be
        # nan, therefore this case doesn't need additional special treatment.
        q_part_safe = array_ops.where(
            math_ops.logical_and(math_ops.equal(labels, 0),
                                 math_ops.is_inf(z)),
            array_ops.zeros_like(z), q_part)

        return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
Ejemplo n.º 25
def sparsemax_loss(logits, sparsemax, labels, name=None):
  """Computes sparsemax loss function [1].

  [1]: https://arxiv.org/abs/1602.02068

  Args:
    logits: A floating-point `Tensor` (e.g. `half`, `float32`).
    sparsemax: A `Tensor`. Must have the same type as `logits`.
    labels: A `Tensor`. Must have the same type as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type as `logits`. The per-element loss terms are
    summed over axis 1.
  """

  with ops.name_scope(name, "sparsemax_loss",
                      [logits, sparsemax, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
    labels = ops.convert_to_tensor(labels, name="labels")

    # In the paper, they call the logits z.
    # A constant can be subtracted from logits to make the algorithm
    # more numerically stable in theory. However, there is really no major
    # source of numerical instability in this algorithm.
    z = logits

    # sum over support
    # Use a conditional where instead of a multiplication to support z = -inf.
    # If z = -inf, and there is no support (sparsemax = 0), a multiplication
    # would cause 0 * -inf = nan, which is not correct in this case.
    sum_s = array_ops.where(
        math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
        sparsemax * (z - 0.5 * sparsemax), array_ops.zeros_like(sparsemax))

    # - z_k + ||q||^2
    q_part = labels * (0.5 * labels - z)
    # Fix the case where labels = 0 and z = -inf, where q_part would
    # otherwise be 0 * -inf = nan. But since the labels = 0, no cost for
    # z = -inf should be considered.
    # The code below also covers the case where z = inf. However, in this
    # case the sparsemax will be nan, which means the sum_s will also be nan,
    # therefore this case doesn't need additional special treatment.
    q_part_safe = array_ops.where(
        math_ops.logical_and(math_ops.equal(labels, 0), math_ops.is_inf(z)),
        array_ops.zeros_like(z), q_part)

    return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
Ejemplo n.º 30
def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
           x_log_prob=None, x_grad=None, skip_metropolis_step=False, name=None):
  """Runs one iteration of Hamiltonian Monte Carlo.

  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
  algorithm that takes a series of gradient-informed steps to produce
  a Metropolis proposal. This function applies one step of HMC to
  randomly update the variable `x`.

  This function can update multiple chains in parallel. It assumes
  that all dimensions of `x` not specified in `event_dims` are
  independent, and should therefore be updated independently. The
  output of `target_log_prob_fn()` should sum log-probabilities across
  all event dimensions. Slices along dimensions not in `event_dims`
  may have different target distributions; for example, if
  `event_dims == (1,)`, then `x[0, :]` could have a different target
  distribution from `x[1, :]`. This is up to `target_log_prob_fn()`.

  Args:
    step_size: Scalar step size or array of step sizes for the
      leapfrog integrator. Broadcasts to the shape of
      `x`. Larger step sizes lead to faster progress, but
      too-large step sizes make rejection exponentially more likely.
      When possible, it's often helpful to match per-variable step
      sizes to the standard deviations of the target distribution in
      each variable.
    n_leapfrog_steps: Integer number of steps to run the leapfrog
      integrator for. Total progress per HMC step is roughly
      proportional to step_size * n_leapfrog_steps.
    x: Tensor containing the value(s) of the random variable(s) to update.
    target_log_prob_fn: Python callable which takes an argument like `x`
      and returns its (possibly unnormalized) log-density under the target
      distribution.
    event_dims: List of dimensions that should not be treated as
      independent. This allows for multiple chains to be run independently
      in parallel. Default is (), i.e., all dimensions are independent.
    x_log_prob (optional): Tensor containing the cached output of a previous
      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
      Ignored (with a warning) unless `x_grad` is provided as well.
    x_grad (optional): Tensor containing the cached gradient of
      `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
      Ignored (with a warning) unless `x_log_prob` is provided as well.
    skip_metropolis_step (optional): boolean specifying whether to skip the
      Metropolis-Hastings step and directly return the newly proposed values
      by the integrator. The acceptance probabilities returned remain unchanged.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    updated_x: The updated variable(s) x. Has shape matching `x`.
    acceptance_probs: Tensor with the acceptance probabilities for the final
      iteration. This is useful for diagnosing step size problems etc. Has
      shape matching `target_log_prob_fn(x)`.
    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
      `updated_x`.

  #### Examples:

  ```python
  # Tuning acceptance rates:
  target_accept_rate = 0.631
  def target_log_prob(x):
    # Standard normal
    return tf.reduce_sum(-0.5 * tf.square(x))
  initial_x = tf.zeros([10])
  initial_log_prob = target_log_prob(initial_x)
  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
  # Algorithm state
  x = tf.Variable(initial_x, name='x')
  step_size = tf.Variable(1., name='step_size')
  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
  last_grad = tf.Variable(initial_grad, name='last_grad')
  # Compute updates
  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
                                                      target_log_prob,
                                                      event_dims=[0],
                                                      x_log_prob=last_log_prob,
                                                      x_grad=last_grad)
  x_update = tf.assign(x, new_x)
  log_prob_update = tf.assign(last_log_prob, log_prob)
  grad_update = tf.assign(last_grad, grad)
  step_size_update = tf.assign(step_size,
                               tf.where(acceptance_prob > target_accept_rate,
                                        step_size * 1.01, step_size / 1.01))
  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
  sampling_updates = [x_update, log_prob_update, grad_update]

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  # Warm up the sampler and adapt the step size
  for i in xrange(500):
    sess.run(adaptive_updates)
  # Collect samples without adapting step size
  samples = np.zeros([500, 10])
  for i in xrange(500):
    x_val, _ = sess.run([new_x, sampling_updates])
    samples[i] = x_val
  ```

  ```python
  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:

  # Problem setup
  N = 150
  D = 10
  x = np.random.randn(N, D).astype(np.float32)
  true_sigma = 0.5
  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)

  def log_prior(beta, log_sigma):
    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
                         log_sigma)
  def regression_log_joint(beta, log_sigma, x, y):
    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
    means = tf.squeeze(means)
    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
    return log_prior(beta, log_sigma) + log_likelihood
  def log_joint_partial(beta):
    return regression_log_joint(beta, log_sigma, x, y)
  # Our estimate of log(sigma)
  log_sigma = tf.Variable(0., name='log_sigma')
  # The state of the Markov chain
  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
                                 event_dims=[0])
  beta_update = tf.assign(beta, new_beta)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  with tf.control_dependencies([beta_update]):
    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
                                          var_list=[log_sigma])

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  log_sigma_history = np.zeros(1000)
  for i in xrange(1000):
    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
    log_sigma_history[i] = log_sigma_val
  # Should converge to something close to true_sigma
  ```
  """
  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
    x = ops.convert_to_tensor(x, name='x')

    # Draw a fresh momentum with the same shape and dtype as the state.
    x_shape = array_ops.shape(x)
    m = random_ops.random_normal(x_shape, dtype=x.dtype)

    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)

    # The potential is the negated log-probability. Reuse the cached values
    # only when both are available; otherwise warn about the one that was
    # supplied and recompute both.
    if (x_log_prob is not None) and (x_grad is not None):
      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
    else:
      if x_log_prob is not None:
        logging.warn('x_log_prob was provided, but x_grad was not,'
                     ' so x_log_prob was not used.')
      if x_grad is not None:
        logging.warn('x_grad was provided, but x_log_prob was not,'
                     ' so x_grad was not used.')
      log_potential_0, grad_0 = potential_and_grad(x)

    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)

    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)

    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
    # Treat NaN as infinite energy (and therefore guaranteed rejection).
    energy_change = array_ops.where(
        math_ops.is_nan(energy_change),
        array_ops.fill(array_ops.shape(energy_change),
                       energy_change.dtype.as_numpy_dtype(float('inf'))),
        energy_change)
    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
    # If we are skipping the MH step directly return
    if skip_metropolis_step:
      return new_x, acceptance_probs, -log_potential_1, -grad_1
    # Metropolis-Hastings test: accept when a uniform draw falls below the
    # acceptance probability.
    accepted = (
        random_ops.random_uniform(
            array_ops.shape(acceptance_probs), dtype=x.dtype)
        < acceptance_probs)
    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)

    # TODO(b/65738010): This should work, but it doesn't for now.
    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
    # The event dimensions are kept (as size 1) so that `accepted` can be
    # broadcast against `x` below. NOTE(review): newer TensorFlow releases
    # spell this argument `keepdims` -- confirm against the version in use.
    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
                                                        keep_dims=True))
    accepted = array_ops.reshape(accepted, reduced_shape)
    # OR-ing with an all-False tensor shaped like `x` broadcasts the
    # per-chain decision to every element of the state.
    accepted = math_ops.logical_or(
        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
    new_x = array_ops.where(accepted, new_x, x)
    new_grad = -array_ops.where(accepted, grad_1, grad_0)

  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
  # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate).  This
  # should be fixed.
  return new_x, acceptance_probs, new_log_prob, new_grad
