Example 1
    def _cdf(self, counts):
        counts = self._maybe_assert_valid_sample(counts)
        probs = self._probs_parameter_no_checks()
        if not (tensorshape_util.is_fully_defined(counts.shape)
                and tensorshape_util.is_fully_defined(probs.shape)
                and tensorshape_util.is_compatible_with(
                    counts.shape, probs.shape)):
            # Broadcast probs against counts (and vice versa). If both shapes
            # are statically known and compatible, this broadcasting is skipped.
            probs = probs + tf.zeros_like(counts)
            counts = counts + tf.zeros_like(probs)

        return _bdtr(k=counts,
                     n=tf.convert_to_tensor(self.total_count),
                     p=probs)
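For a quick check of what a `bdtr`-style routine computes, the binomial CDF can be written with the regularized incomplete beta function. A minimal sketch, assuming NumPy and SciPy are available (this is not the TFP `_bdtr` helper, just the same identity):

import numpy as np
from scipy import special, stats

# P[Binomial(n, p) <= k] = I_{1-p}(n - k, k + 1), the regularized incomplete beta.
n, p = 10, 0.3
k = np.arange(0, n, dtype=np.float64)
cdf_via_betainc = special.betainc(n - k, k + 1., 1. - p)
np.testing.assert_allclose(cdf_via_betainc, stats.binom.cdf(k, n, p))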
 def _mean(self):
     # Derivation: https://sachinruk.github.io/blog/von-Mises-Fisher/
     event_dim = tf.compat.dimension_value(self.event_shape[0])
     if event_dim is None:
         raise ValueError(
             'event shape must be statically known for _bessel_ive')
     safe_conc = tf.where(self.concentration > 0, self.concentration,
                          tf.ones_like(self.concentration))
     safe_mean = self.mean_direction * (
         _bessel_ive(event_dim / 2, safe_conc) /
         _bessel_ive(event_dim / 2 - 1, safe_conc))[..., tf.newaxis]
     return tf.where(
         self.concentration[..., tf.newaxis] > tf.zeros_like(safe_mean),
         safe_mean, tf.zeros_like(safe_mean))
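The Bessel-function ratio above is the mean resultant length of a von Mises-Fisher distribution. A minimal non-TF sketch, assuming SciPy is available (`scipy.special.ive` is the exponentially scaled Bessel I, so the scaling cancels in the ratio); for the 3-dimensional case the ratio reduces to the Langevin function coth(kappa) - 1/kappa:

import numpy as np
from scipy import special

event_dim = 3
concentration = np.array([0.5, 2., 10.])
mean_direction = np.array([0., 0., 1.])

# Mean resultant length h = I_{d/2}(kappa) / I_{d/2 - 1}(kappa), always < 1.
h = (special.ive(event_dim / 2., concentration) /
     special.ive(event_dim / 2. - 1., concentration))
mean = mean_direction * h[..., np.newaxis]
# Known closed form for d = 3: coth(kappa) - 1/kappa.
np.testing.assert_allclose(h, 1. / np.tanh(concentration) - 1. / concentration)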
Example 3
    def _log_normalization(self, concentration=None, name='log_normalization'):
        """Returns the log normalization of an LKJ distribution.

    Args:
      concentration: `float` or `double` `Tensor`. The positive concentration
        parameter of the LKJ distributions.
      name: Python `str` name prefixed to Ops created by this function.

    Returns:
      log_z: A Tensor of the same shape and dtype as `concentration`, containing
        the corresponding log normalizers.
    """
        # The formula is from D. Lewandowski et al [1], p. 1999, from the
        # proof that eqs 16 and 17 are equivalent.
        with tf.name_scope(name or 'log_normalization_lkj'):
            concentration = (tf.convert_to_tensor(
                self.concentration if concentration is None else concentration)
                             )
            logpi = np.log(np.pi)
            ans = tf.zeros_like(concentration)
            for k in range(1, self.dimension):
                ans = ans + logpi * (k / 2.)
                ans = ans + tf.math.lgamma(concentration +
                                           (self.dimension - 1 - k) / 2.)
                ans = ans - tf.math.lgamma(concentration +
                                           (self.dimension - 1) / 2.)
            return ans
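The same normalizer can be written down outside TF by transcribing the loop. A minimal NumPy sketch, assuming SciPy's `gammaln` is available (the helper name below is illustrative, not part of TFP):

import numpy as np
from scipy.special import gammaln

def lkj_log_normalization(concentration, dimension):
    # Direct transcription of the loop above.
    ans = np.zeros_like(np.asarray(concentration, dtype=np.float64))
    for k in range(1, dimension):
        ans = ans + np.log(np.pi) * (k / 2.)
        ans = ans + gammaln(concentration + (dimension - 1 - k) / 2.)
        ans = ans - gammaln(concentration + (dimension - 1) / 2.)
    return ans

# At concentration = 1 the LKJ density is flat, so the normalizer equals the
# volume of the set of 3x3 correlation matrices, pi**2 / 2.
np.testing.assert_allclose(lkj_log_normalization(1., 3), np.log(np.pi**2 / 2.))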
Example 4
    def _survival_function(self, y):
        low = self._low
        high = self._high

        # Recall the promise:
        # survival_function(y) := P[Y > y]
        #                       = 0, if y >= high,
        #                       = 1, if y < low,
        #                       = P[X > y], otherwise.

        # P[Y > j] = P[ceiling(Y) > j] since mass is only at integers, not in
        # between.
        j = tf.math.ceil(y)

        # P[X > j], used when low < X < high.
        result_so_far = self.distribution.survival_function(j)

        # Re-define values at the cutoffs.
        if low is not None:
            result_so_far = tf.where(j < low, tf.ones_like(result_so_far),
                                     result_so_far)
        if high is not None:
            result_so_far = tf.where(j >= high, tf.zeros_like(result_so_far),
                                     result_so_far)

        return result_so_far
Example 5
    def _cdf(self, y):
        low = self._low
        high = self._high

        # Recall the promise:
        # cdf(y) := P[Y <= y]
        #         = 1, if y >= high,
        #         = 0, if y < low,
        #         = P[X <= y], otherwise.

        # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in
        # between.
        j = tf.floor(y)

        # P[X <= j], used when low < X < high.
        result_so_far = self.distribution.cdf(j)

        # Re-define values at the cutoffs.
        if low is not None:
            result_so_far = tf.where(j < low, tf.zeros_like(result_so_far),
                                     result_so_far)
        if high is not None:
            result_so_far = tf.where(j >= high, tf.ones_like(result_so_far),
                                     result_so_far)

        return result_so_far
Example 6
    def _log_cdf(self, y):
        low = self._low
        high = self._high

        # Recall the promise:
        # cdf(y) := P[Y <= y]
        #         = 1, if y >= high,
        #         = 0, if y < low,
        #         = P[X <= y], otherwise.

        # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in
        # between.
        j = tf.floor(y)

        result_so_far = self.distribution.log_cdf(j)

        # Re-define values at the cutoffs.
        if low is not None:
            result_so_far = tf.where(
                j < low,
                dtype_util.as_numpy_dtype(self.dtype)(-np.inf), result_so_far)
        if high is not None:
            result_so_far = tf.where(j >= high, tf.zeros_like(result_so_far),
                                     result_so_far)

        return result_so_far
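The three methods above share the same clamping promise. A minimal NumPy sketch of that promise for a quantized standard normal (Y = ceiling(X) clipped to [low, high]), checked against brute-force sampling; this assumes NumPy/SciPy and is illustrative only:

import numpy as np
from scipy.stats import norm

low, high = -2., 2.

def quantized_normal_cdf(y):
    j = np.floor(y)
    result = norm.cdf(j)                      # P[X <= j], used when low <= j < high
    result = np.where(j < low, 0., result)    # left of the lower cutoff
    result = np.where(j >= high, 1., result)  # upper cutoff absorbs the right tail
    return result

samples = np.clip(np.ceil(norm.rvs(size=200_000, random_state=0)), low, high)
y = np.array([-3., -1.5, 0., 1., 5.])
np.testing.assert_allclose(quantized_normal_cdf(y),
                           (samples[:, None] <= y).mean(axis=0), atol=0.01)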
 def _sample_3d(self, n, seed=None):
     """Specialized inversion sampler for 3D."""
     seed = SeedStream(seed, salt='von_mises_fisher_3d')
     u_shape = tf.concat([[n], self._batch_shape_tensor()], axis=0)
     z = tf.random.uniform(u_shape, seed=seed(), dtype=self.dtype)
     # TODO(bjp): Higher-order odd dim analytic CDFs are available in [1], could
     # be bisected for bounded sampling runtime (i.e. not rejection sampling).
     # [1]: Inversion sampler via: https://ieeexplore.ieee.org/document/7347705/
     # The inversion is: u = 1 + log(z + (1-z)*exp(-2*kappa)) / kappa
     # We must protect against both kappa and z being zero.
     safe_conc = tf.where(self.concentration > 0, self.concentration,
                          tf.ones_like(self.concentration))
     safe_z = tf.where(z > 0, z, tf.ones_like(z))
     safe_u = 1 + tf.reduce_logsumexp(
         [tf.math.log(safe_z),
          tf.math.log1p(-safe_z) - 2 * safe_conc],
         axis=0) / safe_conc
     # Limit of the above expression as kappa->0 is 2*z-1
     u = tf.where(self.concentration > tf.zeros_like(safe_u), safe_u,
                  2 * z - 1)
     # Limit of the expression as z->0 is -1.
     u = tf.where(tf.equal(z, 0), -tf.ones_like(u), u)
     if not self._allow_nan_stats:
         u = tf.debugging.check_numerics(u, 'u in _sample_3d')
     return u[..., tf.newaxis]
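The log-sum-exp expression above is a stable rewrite of the inversion formula in the comment, and the `tf.where` branches recover its limits. A minimal NumPy/SciPy sketch of both facts (illustrative only):

import numpy as np
from scipy.special import logsumexp

z = np.array([0.1, 0.5, 0.9])

def u_from_z(z, kappa):
    # 1 + log(z + (1 - z) * exp(-2 * kappa)) / kappa, written via logsumexp.
    return 1. + logsumexp([np.log(z), np.log1p(-z) - 2. * kappa], axis=0) / kappa

# Same value as the naive formula where the latter is well conditioned.
np.testing.assert_allclose(u_from_z(z, kappa=2.),
                           1. + np.log(z + (1. - z) * np.exp(-4.)) / 2.)
# As kappa -> 0 the inverse CDF tends to 2*z - 1, the branch taken by tf.where.
np.testing.assert_allclose(u_from_z(z, kappa=1e-6), 2. * z - 1., atol=1e-4)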
 def _covariance(self):
     # Derivation: https://sachinruk.github.io/blog/von-Mises-Fisher/
     event_dim = tf.compat.dimension_value(self.event_shape[0])
     if event_dim is None:
         raise ValueError(
             'event shape must be statically known for _bessel_ive')
     # TODO(bjp): Enable this; numerically unstable.
     if event_dim > 2:
         raise ValueError(
             'vMF covariance is numerically unstable for dim>2')
     concentration = self.concentration[..., tf.newaxis]
     safe_conc = tf.where(concentration > 0, concentration,
                          tf.ones_like(concentration))
     h = (_bessel_ive(event_dim / 2, safe_conc) /
          _bessel_ive(event_dim / 2 - 1, safe_conc))
     intermediate = (
         tf.matmul(self.mean_direction[..., :, tf.newaxis],
                   self.mean_direction[..., tf.newaxis, :]) *
         (1 - event_dim * h / safe_conc - h**2)[..., tf.newaxis])
     cov = tf.linalg.set_diag(
         intermediate,
         tf.linalg.diag_part(intermediate) + (h / safe_conc))
     return tf.where(
         concentration[..., tf.newaxis] > tf.zeros_like(cov), cov,
         tf.linalg.eye(event_dim, batch_shape=self.batch_shape_tensor()) /
         event_dim)
Example 9
 def _cdf(self, x):
   # CDF is the probability that the Poisson variable is less than or equal
   # to x.
   # For fractional x, the CDF is equal to the CDF at n = floor(x).
   # For negative x, the CDF is zero, but tf.igammac gives NaNs, so we impute
   # the values and handle this case explicitly.
   safe_x = tf.maximum(x if self.interpolate_nondiscrete else tf.floor(x), 0.)
   cdf = tf.math.igammac(1. + safe_x, self._rate_parameter_no_checks())
   return tf.where(x < 0., tf.zeros_like(cdf), cdf)
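The `igammac` call encodes the standard identity between the Poisson CDF and the regularized upper incomplete gamma function. A minimal SciPy cross-check (illustrative only):

import numpy as np
from scipy import special, stats

# P[Poisson(rate) <= k] = Q(k + 1, rate) for integer k >= 0.
rate = 3.5
k = np.arange(0, 10, dtype=np.float64)
np.testing.assert_allclose(special.gammaincc(k + 1., rate),
                           stats.poisson.cdf(k, rate))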
Example 10
def _log_ndtr_asymptotic_series(x, series_order):
  """Calculates the asymptotic series used in log_ndtr."""
  npdt = dtype_util.as_numpy_dtype(x.dtype)
  if series_order <= 0:
    return npdt(1)
  x_2 = tf.square(x)
  even_sum = tf.zeros_like(x)
  odd_sum = tf.zeros_like(x)
  x_2n = x_2  # Start with x^{2*1} = x^{2*n} with n = 1.
  for n in range(1, series_order + 1):
    y = npdt(_double_factorial(2 * n - 1)) / x_2n
    if n % 2:
      odd_sum += y
    else:
      even_sum += y
    x_2n *= x_2
  return 1. + even_sum - odd_sum
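In `log_ndtr` this series multiplies the leading Gaussian-tail term. A minimal NumPy sketch of how an order-3 series is used for large negative x, checked against `scipy.special.log_ndtr` (illustrative only; the helper names below are not TFP's):

import numpy as np
from scipy import special

def double_factorial(n):
    return np.prod(np.arange(n, 1, -2))

def asymptotic_series(x, series_order=3):
    # 1 - 1/x**2 + 3/x**4 - 15/x**6 + ..., matching even_sum - odd_sum above.
    x_2 = x**2
    total, x_2n, sign = 1., x_2, -1.
    for n in range(1, series_order + 1):
        total += sign * double_factorial(2 * n - 1) / x_2n
        x_2n *= x_2
        sign = -sign
    return total

x = -12.
# log(ndtr(x)) ~= log(pdf(x)) - log(-x) + log(series) in the lower tail.
approx = (-0.5 * x**2 - 0.5 * np.log(2. * np.pi) - np.log(-x)
          + np.log(asymptotic_series(x)))
np.testing.assert_allclose(approx, special.log_ndtr(x))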
Example 11
def cholesky_concat(chol, cols, name=None):
    """Concatenates `chol @ chol.T` with additional rows and columns.

  This operation is conceptually identical to:
  ```python
  def cholesky_concat_slow(chol, cols):  # cols shaped (n + m) x m = z x m
    mat = tf.matmul(chol, chol, adjoint_b=True)  # batch of n x n
    # Concat columns.
    mat = tf.concat([mat, cols[..., :tf.shape(mat)[-2], :]], axis=-1)  # n x z
    # Concat rows.
    mat = tf.concat([mat, tf.linalg.matrix_transpose(cols)], axis=-2)  # z x z
    return tf.linalg.cholesky(mat)
  ```
  but whereas `cholesky_concat_slow` would cost `O(z**3)` work,
  `cholesky_concat` only costs `O(z**2 + m**3)` work.

  The resulting (implicit) matrix must be symmetric and positive definite.
  Thus, the bottom right `m x m` must be self-adjoint, and we do not require a
  separate `rows` argument (which can be inferred from `conj(cols.T)`).

  Args:
    chol: Cholesky decomposition of `mat = chol @ chol.T`.
    cols: The new columns whose first `n` rows we would like concatenated to the
      right of `mat = chol @ chol.T`, and whose conjugate transpose we would
      like concatenated to the bottom of `concat(mat, cols[:n,:])`. A `Tensor`
      with final dims `(n+m, m)`. The first `n` rows are the top right rectangle
      (their conjugate transpose forms the bottom left), and the bottom `m x m`
      is self-adjoint.
    name: Optional name for this op.

  Returns:
    chol_concat: The Cholesky decomposition of:
      ```
      [ [ mat  cols[:n, :] ]
        [   conj(cols.T)   ] ]
      ```
  """
    with tf.name_scope(name or 'cholesky_extend'):
        dtype = dtype_util.common_dtype([chol, cols], dtype_hint=tf.float32)
        chol = tf.convert_to_tensor(chol, name='chol', dtype=dtype)
        cols = tf.convert_to_tensor(cols, name='cols', dtype=dtype)
        n = prefer_static.shape(chol)[-1]
        mat_nm, mat_mm = cols[..., :n, :], cols[..., n:, :]
        solved_nm = linear_operator_util.matrix_triangular_solve_with_broadcast(
            chol, mat_nm)
        lower_right_mm = tf.linalg.cholesky(
            mat_mm - tf.matmul(solved_nm, solved_nm, adjoint_a=True))
        lower_left_mn = tf.math.conj(tf.linalg.matrix_transpose(solved_nm))
        out_batch = prefer_static.shape(solved_nm)[:-2]
        chol = tf.broadcast_to(
            chol,
            tf.concat([out_batch, prefer_static.shape(chol)[-2:]], axis=0))
        top_right_zeros_nm = tf.zeros_like(solved_nm)
        return tf.concat(
            [tf.concat([chol, top_right_zeros_nm], axis=-1),
             tf.concat([lower_left_mn, lower_right_mm], axis=-1)],
            axis=-2)
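The body above is the standard block-Cholesky update. A minimal NumPy sketch of the same algebra, assuming only NumPy (a sanity check, not the TFP op): with L = chol(A), new columns [B; C] and S = solve(L, B), the factor of the enlarged matrix is [[L, 0], [S^T, chol(C - S^T S)]].

import numpy as np

rng = np.random.default_rng(0)
n, m = 4, 2
full = rng.normal(size=(n + m, n + m))
full = full @ full.T + (n + m) * np.eye(n + m)   # symmetric positive definite

chol = np.linalg.cholesky(full[:n, :n])          # existing factor of the n x n block
cols = full[:, n:]                               # (n + m, m); bottom m x m is self-adjoint

solved = np.linalg.solve(chol, cols[:n])         # S = L^{-1} B
lower_right = np.linalg.cholesky(cols[n:] - solved.T @ solved)
chol_concat = np.block([[chol, np.zeros((n, m))],
                        [solved.T, lower_right]])
np.testing.assert_allclose(chol_concat, np.linalg.cholesky(full), atol=1e-10)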
Example 12
 def _cdf(self, x):
     with tf.control_dependencies(self._maybe_assert_valid_sample(x)):
         probs = self._probs_parameter_no_checks()
         if not self.validate_args:
             # Whether or not x is integer-form, the following is well-defined.
             # However, scipy takes the floor, so we do too.
             x = tf.floor(x)
         return tf.where(x < 0., tf.zeros_like(x), -tf.math.expm1(
             (1. + x) * tf.math.log1p(-probs)))
Example 13
 def _log_prob(self, x):
     with tf.control_dependencies(self._maybe_assert_valid_sample(x)):
         probs = self._probs_parameter_no_checks()
         if not self.validate_args:
             # For consistency with cdf, we take the floor.
             x = tf.floor(x)
         safe_domain = tf.where(tf.equal(x, 0.), tf.zeros_like(probs),
                                probs)
         return x * tf.math.log1p(-safe_domain) + tf.math.log(probs)
Example 14
 def _prob(self, x):
     low = tf.convert_to_tensor(self.low)
     high = tf.convert_to_tensor(self.high)
     return tf.where(
         tf.math.is_nan(x),
         x,
         tf.where(
             # This > is only sound for continuous uniform
             (x < low) | (x > high),
             tf.zeros_like(x),
             tf.ones_like(x) / self._range(low=low, high=high)))
Example 15
 def _assertions(self, x):
   if not self.validate_args:
     return []
   shape = tf.shape(x)
   is_matrix = assert_util.assert_rank_at_least(
       x, 2, message="Input must have rank at least 2.")
   is_square = assert_util.assert_equal(
       shape[-2], shape[-1], message="Input must be a square matrix.")
   above_diagonal = tf.linalg.band_part(
       tf.linalg.set_diag(x, tf.zeros(shape[:-1], dtype=tf.float32)), 0, -1)
   is_lower_triangular = assert_util.assert_equal(
       above_diagonal,
       tf.zeros_like(above_diagonal),
       message="Input must be lower triangular.")
   # A lower triangular matrix is nonsingular iff all its diagonal entries are
   # nonzero.
   diag_part = tf.linalg.diag_part(x)
   is_nonsingular = assert_util.assert_none_equal(
       diag_part,
       tf.zeros_like(diag_part),
       message="Input must have all diagonal entries nonzero.")
   return [is_matrix, is_square, is_lower_triangular, is_nonsingular]
Example 16
    def _cdf(self, x):
        # CDF(x) at positive integer x is the probability that the Zipf
        # variable is less than or equal to x, and is given by the formula:
        #     CDF(x) = 1 - (zeta(power, x + 1) / Z)
        # For fractional x, the CDF is equal to the CDF at n = floor(x).
        # For x < 1, the CDF is zero.

        # If interpolate_nondiscrete is True, we return a continuous relaxation
        # which agrees with the CDF at integer points.
        power = tf.convert_to_tensor(self.power)
        x = tf.cast(x, power.dtype)
        safe_x = tf.maximum(x if self.interpolate_nondiscrete else tf.floor(x),
                            0.)

        cdf = 1. - (tf.math.zeta(power, safe_x + 1.) / tf.math.zeta(power, 1.))
        return tf.where(x < 1., tf.zeros_like(cdf), cdf)
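The zeta-function form of the CDF can be checked against SciPy's Zipf distribution; `scipy.special.zeta(power, q)` is the same Hurwitz zeta. A minimal sketch (illustrative only):

import numpy as np
from scipy import special, stats

power = 3.
x = np.arange(1, 8, dtype=np.float64)
cdf = 1. - special.zeta(power, x + 1.) / special.zeta(power, 1.)
np.testing.assert_allclose(cdf, stats.zipf.cdf(x, power))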
Example 17
 def _sample_n(self, n, seed=None):
     concentration = tf.convert_to_tensor(self.concentration)
     mixing_concentration = tf.convert_to_tensor(self.mixing_concentration)
     mixing_rate = tf.convert_to_tensor(self.mixing_rate)
     seed = SeedStream(seed, 'gamma_gamma')
     rate = tf.random.gamma(
         shape=[n],
         # Be sure to draw enough rates for the fully-broadcasted gamma-gamma.
         alpha=mixing_concentration + tf.zeros_like(concentration),
         beta=mixing_rate,
         dtype=self.dtype,
         seed=seed())
     return tf.random.gamma(shape=[],
                            alpha=concentration,
                            beta=rate,
                            dtype=self.dtype,
                            seed=seed())
Example 18
 def _sample_n(self, n, seed=None):
     # Need to create logits corresponding to [p, 1 - p].
     # Note that for this distribution, logits corresponds to the inverse
     # sigmoid of p, while in multivariate distributions such as the
     # multinomial it corresponds to log(p).
     # Because of this, when we construct the logits for the multinomial
     # sampler, we'll have to be careful.
     # log(p) = log(sigmoid(logits)) = logits - softplus(logits)
     # log(1 - p) = log(1 - sigmoid(logits)) = -softplus(logits)
     # Because softmax is invariant to a constant shift in all inputs,
     # we can offset the logits by softplus(logits) so that we can use
     # [logits, 0.] as our input.
     orig_logits = self._logits_parameter_no_checks()
     logits = tf.stack([orig_logits, tf.zeros_like(orig_logits)], axis=-1)
     return multinomial.draw_sample(num_samples=n,
                                    num_classes=2,
                                    logits=logits,
                                    num_trials=tf.cast(self.total_count,
                                                       dtype=tf.int32),
                                    dtype=self.dtype,
                                    seed=seed)[..., 0]
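The logits trick in the comment is easy to verify numerically: softmax([logits, 0]) recovers [p, 1 - p] with p = sigmoid(logits), because both entries are shifted by the same softplus(logits). A minimal sketch, assuming a SciPy recent enough to provide `scipy.special.softmax` (illustrative only):

import numpy as np
from scipy.special import expit, softmax

logits = np.array([-3., 0., 2.5])
p = expit(logits)                                   # sigmoid
stacked = np.stack([logits, np.zeros_like(logits)], axis=-1)
np.testing.assert_allclose(softmax(stacked, axis=-1),
                           np.stack([p, 1. - p], axis=-1))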
Example 19
    def _cdf(self, x):
        low = tf.convert_to_tensor(self.low)
        high = tf.convert_to_tensor(self.high)
        peak = tf.convert_to_tensor(self.peak)

        interval_length = high - low
        # Due to the PDF being not smooth at the peak, we have to treat each side
        # somewhat differently. The PDF is two line segments, and thus we get
        # quadratics here for the CDF.
        result_inside_interval = tf.where(
            (x >= low) & (x <= peak),
            # (x - low) ** 2 / ((high - low) * (peak - low))
            tf.math.squared_difference(x, low) / (interval_length *
                                                  (peak - low)),
            # 1 - (high - x) ** 2 / ((high - low) * (high - peak))
            1. - tf.math.squared_difference(high, x) / (interval_length *
                                                        (high - peak)))

        # We now add that the left tail is 0 and the right tail is 1.
        result_if_not_big = tf.where(x < low, tf.zeros_like(x),
                                     result_inside_interval)

        return tf.where(x >= high, tf.ones_like(x), result_if_not_big)
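Each branch of the `tf.where` above is one of the two quadratic pieces of the triangular CDF. A minimal SciPy cross-check, illustrative only (`scipy.stats.triang` parameterizes the mode as c = (peak - low) / (high - low)):

import numpy as np
from scipy import stats

low, high, peak = 1., 5., 2.
interval_length = high - low
x = np.linspace(low, high, 7)
cdf = np.where(x <= peak,
               (x - low)**2 / (interval_length * (peak - low)),
               1. - (high - x)**2 / (interval_length * (high - peak)))
np.testing.assert_allclose(
    cdf,
    stats.triang.cdf(x, c=(peak - low) / interval_length,
                     loc=low, scale=interval_length))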
Example 20
    def _prob(self, x):
        low = tf.convert_to_tensor(self.low)
        high = tf.convert_to_tensor(self.high)
        peak = tf.convert_to_tensor(self.peak)

        if self.validate_args:
            with tf.control_dependencies([
                    assert_util.assert_greater_equal(x, low),
                    assert_util.assert_less_equal(x, high)
            ]):
                x = tf.identity(x)

        interval_length = high - low
        # This is the pdf when low <= x <= high. The pdf looks like a
        # triangle, so we have to treat each line segment separately.
        result_inside_interval = tf.where(
            (x >= low) & (x <= peak),
            # Line segment from (low, 0) to (peak, 2 / (high - low)).
            2. * (x - low) / (interval_length * (peak - low)),
            # Line segment from (peak, 2 / (high - low)) to (high, 0).
            2. * (high - x) / (interval_length * (high - peak)))

        return tf.where((x < low) | (x > high), tf.zeros_like(x),
                        result_inside_interval)
    def _observation_log_probs(self, observations, mask):
        """Compute and shape tensor of log probs associated with observations.."""

        # Let E be the underlying event shape
        #     M the number of steps in the HMM
        #     N the number of states of the HMM
        #
        # Then the incoming observations have shape
        #
        # observations : batch_o [M] E
        #
        # and the mask (if present) has shape
        #
        # mask : batch_m [M]
        #
        # Let this HMM distribution have batch shape batch_d
        # We need to broadcast all three of these batch shapes together
        # into the shape batch.
        #
        # We need to move the step dimension to the first dimension to make
        # them suitable for folding or scanning over.
        #
        # When we call `log_prob` for our observations we need to
        # do this for each state the observation could correspond to.
        # We do this by expanding the dimensions by 1 so we end up with:
        #
        # observations : [M] batch [1] [E]
        #
        # After calling `log_prob` we get
        #
        # observation_log_probs : [M] batch [N]
        #
        # We wish to use `mask` to select from this so we also
        # reshape and broadcast it up to shape
        #
        # mask : [M] batch [N]

        observation_tensor_shape = tf.shape(observations)
        observation_batch_shape = observation_tensor_shape[
            :-1 - self._underlying_event_rank]
        observation_event_shape = observation_tensor_shape[
            -1 - self._underlying_event_rank:]

        if mask is not None:
            mask_tensor_shape = tf.shape(mask)
            mask_batch_shape = mask_tensor_shape[:-1]

        batch_shape = tf.broadcast_dynamic_shape(observation_batch_shape,
                                                 self.batch_shape_tensor())

        if mask is not None:
            batch_shape = tf.broadcast_dynamic_shape(batch_shape,
                                                     mask_batch_shape)
        observations = tf.broadcast_to(
            observations,
            tf.concat([batch_shape, observation_event_shape], axis=0))
        observation_rank = tf.rank(observations)
        underlying_event_rank = self._underlying_event_rank
        observations = distribution_util.move_dimension(
            observations, observation_rank - underlying_event_rank - 1, 0)
        observations = tf.expand_dims(observations,
                                      observation_rank - underlying_event_rank)
        observation_log_probs = self._observation_distribution.log_prob(
            observations)

        if mask is not None:
            mask = tf.broadcast_to(
                mask, tf.concat([batch_shape, [self._num_steps]], axis=0))
            mask = distribution_util.move_dimension(mask, -1, 0)
            observation_log_probs = tf.where(
                mask[..., tf.newaxis], tf.zeros_like(observation_log_probs),
                observation_log_probs)

        return observation_log_probs
 def _mode(self):
     """The mode of the von Mises-Fisher distribution is the mean direction."""
     return (self.mean_direction +
             tf.zeros_like(self.concentration)[..., tf.newaxis])
 def _log_unnormalized_prob(self, samples):
     samples = self._maybe_assert_valid_sample(samples)
     bcast_mean_dir = (self.mean_direction +
                       tf.zeros_like(self.concentration)[..., tf.newaxis])
     inner_product = tf.reduce_sum(samples * bcast_mean_dir, axis=-1)
     return self.concentration * inner_product
 def _mode(self):
     total_count = tf.convert_to_tensor(self.total_count)
     adjusted_count = tf.where(1. < total_count, total_count - 1.,
                               tf.zeros_like(total_count))
     return tf.floor(adjusted_count *
                     tf.exp(self._logits_parameter_no_checks()))
Example 25
def pivoted_cholesky(matrix, max_rank, diag_rtol=1e-3, name=None):
    """Computes the (partial) pivoted cholesky decomposition of `matrix`.

  The pivoted Cholesky is a low rank approximation of the Cholesky decomposition
  of `matrix`, i.e. as described in [(Harbrecht et al., 2012)][1]. The
  currently-worst-approximated diagonal element is selected as the pivot at each
  iteration. This yields from a `[B1...Bn, N, N]` shaped `matrix` a `[B1...Bn,
  N, K]` shaped rank-`K` approximation `lr` such that `lr @ lr.T ~= matrix`.
  Note that, unlike the Cholesky decomposition, `lr` is not triangular even in
  a rectangular-matrix sense. However, under a permutation it could be made
  triangular (it has one more zero in each column as you move to the right).

  Such a matrix can be useful as a preconditioner for conjugate gradient
  optimization, i.e. as in [(Wang et al. 2019)][2], as matmuls and solves can be
  cheaply done via the Woodbury matrix identity, as implemented by
  `tf.linalg.LinearOperatorLowRankUpdate`.

  Args:
    matrix: Floating point `Tensor` batch of symmetric, positive definite
      matrices.
    max_rank: Scalar `int` `Tensor`, the rank at which to truncate the
      approximation.
    diag_rtol: Scalar floating point `Tensor` (same dtype as `matrix`). If the
      errors of all diagonal elements of `lr @ lr.T` are each lower than
      `element * diag_rtol`, iteration is permitted to terminate early.
    name: Optional name for the op.

  Returns:
    lr: Low rank pivoted Cholesky approximation of `matrix`.

  #### References

  [1]: H Harbrecht, M Peters, R Schneider. On the low-rank approximation by the
       pivoted Cholesky decomposition. _Applied numerical mathematics_,
       62(4):428-440, 2012.

  [2]: K. A. Wang et al. Exact Gaussian Processes on a Million Data Points.
       _arXiv preprint arXiv:1903.08114_, 2019. https://arxiv.org/abs/1903.08114
  """
    with tf.name_scope(name or 'pivoted_cholesky'):
        dtype = dtype_util.common_dtype([matrix, diag_rtol],
                                        dtype_hint=tf.float32)
        matrix = tf.convert_to_tensor(matrix, name='matrix', dtype=dtype)
        if tensorshape_util.rank(matrix.shape) is None:
            raise NotImplementedError(
                'Rank of `matrix` must be known statically')

        max_rank = tf.convert_to_tensor(max_rank,
                                        name='max_rank',
                                        dtype=tf.int64)
        max_rank = tf.minimum(
            max_rank,
            prefer_static.shape(matrix, out_type=tf.int64)[-1])
        diag_rtol = tf.convert_to_tensor(diag_rtol,
                                         dtype=dtype,
                                         name='diag_rtol')
        matrix_diag = tf.linalg.diag_part(matrix)
        # matrix is P.D., therefore all matrix_diag > 0, so we don't need abs.
        orig_error = tf.reduce_max(matrix_diag, axis=-1)

        def cond(m, pchol, perm, matrix_diag):
            """Condition for `tf.while_loop` continuation."""
            del pchol
            del perm
            error = tf.linalg.norm(matrix_diag, ord=1, axis=-1)
            max_err = tf.reduce_max(error / orig_error)
            return (m < max_rank) & (tf.equal(m, 0) | (max_err > diag_rtol))

        batch_dims = tensorshape_util.rank(matrix.shape) - 2

        def batch_gather(params, indices, axis=-1):
            return tf.gather(params, indices, axis=axis, batch_dims=batch_dims)

        def body(m, pchol, perm, matrix_diag):
            """Body of a single `tf.while_loop` iteration."""
            # Here is roughly a numpy, non-batched version of what's going to happen.
            # (See also Algorithm 1 of Harbrecht et al.)
            # 1: maxi = np.argmax(matrix_diag[perm[m:]]) + m
            # 2: maxval = matrix_diag[perm][maxi]
            # 3: perm[m], perm[maxi] = perm[maxi], perm[m]
            # 4: row = matrix[perm[m]][perm[m + 1:]]
            # 5: row -= np.sum(pchol[:m][perm[m + 1:]] * pchol[:m][perm[m]], axis=-2)
            # 6: pivot = np.sqrt(maxval); row /= pivot
            # 7: row = np.concatenate([[[pivot]], row], -1)
            # 8: matrix_diag[perm[m:]] -= row**2
            # 9: pchol[m, perm[m:]] = row

            # Find the maximal position of the (remaining) permuted diagonal.
            # Steps 1, 2 above.
            permuted_diag = batch_gather(matrix_diag, perm[..., m:])
            maxi = tf.argmax(permuted_diag, axis=-1,
                             output_type=tf.int64)[..., tf.newaxis]
            maxval = batch_gather(permuted_diag, maxi)
            maxi = maxi + m
            maxval = maxval[..., 0]
            # Update perm: Swap perm[...,m] with perm[...,maxi]. Step 3 above.
            perm = _swap_m_with_i(perm, m, maxi)
            # Step 4.
            row = batch_gather(matrix, perm[..., m:m + 1], axis=-2)
            row = batch_gather(row, perm[..., m + 1:])
            # Step 5.
            prev_rows = pchol[..., :m, :]
            prev_rows_perm_m_onward = batch_gather(
                prev_rows, perm[..., m + 1:])
            prev_rows_pivot_col = batch_gather(prev_rows, perm[..., m:m + 1])
            row -= tf.reduce_sum(prev_rows_perm_m_onward * prev_rows_pivot_col,
                                 axis=-2)[..., tf.newaxis, :]
            # Step 6.
            pivot = tf.sqrt(maxval)[..., tf.newaxis, tf.newaxis]
            # Step 7.
            row = tf.concat([pivot, row / pivot], axis=-1)
            # TODO(b/130899118): Pad grad fails with int64 paddings.
            # Step 8.
            paddings = tf.concat(
                [tf.zeros([prefer_static.rank(pchol) - 1, 2], dtype=tf.int32),
                 [[tf.cast(m, tf.int32), 0]]],
                axis=0)
            diag_update = tf.pad(row**2, paddings=paddings)[..., 0, :]
            reverse_perm = _invert_permutation(perm)
            matrix_diag -= batch_gather(diag_update, reverse_perm)
            # Step 9.
            row = tf.pad(row, paddings=paddings)
            # TODO(bjp): Defer the reverse permutation all-at-once at the end?
            row = batch_gather(row, reverse_perm)
            pchol_shape = pchol.shape
            pchol = tf.concat([pchol[..., :m, :], row, pchol[..., m + 1:, :]],
                              axis=-2)
            tensorshape_util.set_shape(pchol, pchol_shape)
            return m + 1, pchol, perm, matrix_diag

        m = np.int64(0)
        pchol = tf.zeros_like(matrix[..., :max_rank, :])
        matrix_shape = prefer_static.shape(matrix, out_type=tf.int64)
        perm = tf.broadcast_to(prefer_static.range(matrix_shape[-1]),
                               matrix_shape[:-1])
        _, pchol, _, _ = tf.while_loop(cond=cond,
                                       body=body,
                                       loop_vars=(m, pchol, perm, matrix_diag))
        pchol = tf.linalg.matrix_transpose(pchol)
        tensorshape_util.set_shape(
            pchol, tensorshape_util.concatenate(matrix_diag.shape, [None]))
        return pchol
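The numbered comments in `body` sketch the per-iteration work in pseudocode. Below is a hedged, runnable NumPy transcription of those steps for a single (non-batched) matrix; it is not the TFP implementation, just the same algorithm without the `tf.while_loop` plumbing:

import numpy as np

def pivoted_cholesky_np(matrix, max_rank, diag_rtol=1e-3):
    """Non-batched reference following steps 1-9 in the comments above."""
    matrix = np.asarray(matrix, dtype=np.float64)
    n = matrix.shape[-1]
    max_rank = min(max_rank, n)
    diag = np.diag(matrix).copy()
    orig_error = diag.max()
    perm = np.arange(n)
    pchol = np.zeros((max_rank, n))
    for m in range(max_rank):
        maxi = np.argmax(diag[perm[m:]]) + m                         # steps 1-2
        perm[m], perm[maxi] = perm[maxi], perm[m]                    # step 3
        row = matrix[perm[m], perm[m + 1:]]                          # step 4
        row = row - pchol[:m, perm[m + 1:]].T @ pchol[:m, perm[m]]   # step 5
        pivot = np.sqrt(diag[perm[m]])                               # step 6
        pchol[m, perm[m]] = pivot                                    # steps 7, 9
        pchol[m, perm[m + 1:]] = row / pivot
        diag[perm[m:]] -= pchol[m, perm[m:]]**2                      # step 8
        if np.linalg.norm(diag, ord=1) / orig_error < diag_rtol:     # early exit
            break
    return pchol.T                                                   # (n, max_rank)

rng = np.random.default_rng(0)
a = rng.normal(size=(5, 5))
a = a @ a.T + 5. * np.eye(5)
lr = pivoted_cholesky_np(a, max_rank=5, diag_rtol=0.)
np.testing.assert_allclose(lr @ lr.T, a, atol=1e-10)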
Example 26
 def _create_polynomial(var, coeffs):
   """Compute n_th order polynomial via Horner's method."""
   coeffs = np.array(coeffs, dtype_util.as_numpy_dtype(var.dtype))
   if not coeffs.size:
     return tf.zeros_like(var)
   return coeffs[0] + _create_polynomial(var, coeffs[1:]) * var
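A tiny illustration of the recursion (NumPy instead of TF, illustrative only): for ascending coefficients [c0, c1, c2] it evaluates c0 + x*(c1 + x*c2).

import numpy as np

def create_polynomial(var, coeffs):
    # Same recursion as above, with numpy in place of tf.
    if not len(coeffs):
        return np.zeros_like(var)
    return coeffs[0] + create_polynomial(var, coeffs[1:]) * var

x = np.array([0.5, 2.0])
np.testing.assert_allclose(create_polynomial(x, [1., -3., 2.]),
                           1. - 3. * x + 2. * x**2)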
Example 27
 def _variance(self):
     return tf.zeros_like(self.loc)
Example 28
 def _mean(self):
     # Shape is broadcasted with + tf.zeros_like().
     return self.loc + tf.zeros_like(self.concentration)
    def posterior_marginals(self, observations, mask=None, name=None):
        """Compute marginal posterior distribution for each state.

    This function computes, for each time step, the marginal
    conditional probability that the hidden Markov model was in
    each possible state given the observations that were made
    at each time step.
    So if the hidden states are `z[0],...,z[num_steps - 1]` and
    the observations are `x[0], ..., x[num_steps - 1]`, then
    this function computes `P(z[i] | x[0], ..., x[num_steps - 1])`
    for all `i` from `0` to `num_steps - 1`.

    This operation is sometimes called smoothing. It uses a form
    of the forward-backward algorithm.

    Note: the behavior of this function is undefined if the
    `observations` argument represents impossible observations
    from the model.

    Args:
      observations: A tensor representing a batch of observations
        made on the hidden Markov model.  The rightmost dimension of this tensor
        gives the steps in a sequence of observations from a single sample from
        the hidden Markov model. The size of this dimension should match the
        `num_steps` parameter of the hidden Markov model object. The other
        dimensions are the dimensions of the batch and these are broadcast with
        the hidden Markov model's parameters.
      mask: optional bool-type `tensor` with rightmost dimension matching
        `num_steps` indicating which observations the result of this
        function should be conditioned on. When the mask has value
        `True` the corresponding observations aren't used.
        If `mask` is `None` then all of the observations are used.
        The `mask` dimensions left of the last are broadcast with the
        HMM batch as well as with the observations.
      name: Python `str` name prefixed to Ops created by this class.
        Default value: "HiddenMarkovModel".

    Returns:
      posterior_marginal: A `Categorical` distribution object representing the
        marginal probability of the hidden Markov model being in each state at
        each step. The rightmost dimension of the `Categorical` distributions
        batch will equal the `num_steps` parameter providing one marginal
        distribution for each step. The other dimensions are the dimensions
        corresponding to the batch of observations.

    Raises:
      ValueError: if rightmost dimension of `observations` does not
      have size `num_steps`.
    """

        with tf.name_scope(name or "posterior_marginals"):
            with tf.control_dependencies(self._runtime_assertions):
                observation_tensor_shape = tf.shape(observations)
                mask_tensor_shape = (
                    tf.shape(mask) if mask is not None else None)

                with self._observation_mask_shape_preconditions(
                        observation_tensor_shape, mask_tensor_shape):
                    observation_log_probs = self._observation_log_probs(
                        observations, mask)
                    log_prob = self._log_init + observation_log_probs[0]
                    log_transition = self._log_trans
                    log_adjoint_prob = tf.zeros_like(log_prob)

                    def _scan_multiple_steps_forwards():
                        def forward_step(log_previous_step,
                                         log_prob_observation):
                            return _log_vector_matrix(
                                log_previous_step,
                                log_transition) + log_prob_observation

                        forward_log_probs = tf.scan(forward_step,
                                                    observation_log_probs[1:],
                                                    initializer=log_prob,
                                                    name="forward_log_probs")
                        return tf.concat([[log_prob], forward_log_probs],
                                         axis=0)

                    forward_log_probs = prefer_static.cond(
                        self._num_steps > 1, _scan_multiple_steps_forwards,
                        lambda: tf.convert_to_tensor([log_prob]))

                    total_log_prob = tf.reduce_logsumexp(forward_log_probs[-1],
                                                         axis=-1)

                    def _scan_multiple_steps_backwards():
                        """Perform `scan` operation when `num_steps` > 1."""
                        def backward_step(log_previous_step,
                                          log_prob_observation):
                            return _log_matrix_vector(
                                log_transition,
                                log_prob_observation + log_previous_step)

                        backward_log_adjoint_probs = tf.scan(
                            backward_step,
                            observation_log_probs[1:],
                            initializer=log_adjoint_prob,
                            reverse=True,
                            name="backward_log_adjoint_probs")

                        return tf.concat(
                            [backward_log_adjoint_probs, [log_adjoint_prob]],
                            axis=0)

                    backward_log_adjoint_probs = prefer_static.cond(
                        self._num_steps > 1, _scan_multiple_steps_backwards,
                        lambda: tf.convert_to_tensor([log_adjoint_prob]))

                    log_likelihoods = forward_log_probs + backward_log_adjoint_probs

                    marginal_log_probs = distribution_util.move_dimension(
                        log_likelihoods - total_log_prob[..., tf.newaxis], 0,
                        -2)

                    return categorical.Categorical(logits=marginal_log_probs)
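The forward and backward scans above implement the classic forward-backward recursions in log space. A minimal NumPy sketch for a tiny two-state HMM, with `logsumexp` standing in for `_log_vector_matrix` / `_log_matrix_vector` (illustrative only, not the TFP code path):

import numpy as np
from scipy.special import logsumexp

# Toy model: 2 hidden states, 3 steps.
log_init = np.log([0.6, 0.4])
log_trans = np.log([[0.7, 0.3],
                    [0.2, 0.8]])            # log P(z[t+1]=j | z[t]=i)
obs_log_probs = np.log([[0.9, 0.2],         # log P(x[t] | z[t]=k)
                        [0.1, 0.8],
                        [0.5, 0.5]])
num_steps, num_states = obs_log_probs.shape

# Forward pass: forward[t, k] = log P(x[0..t], z[t]=k).
forward = np.zeros((num_steps, num_states))
forward[0] = log_init + obs_log_probs[0]
for t in range(1, num_steps):
    forward[t] = (logsumexp(forward[t - 1][:, None] + log_trans, axis=0)
                  + obs_log_probs[t])

# Backward pass: backward[t, k] = log P(x[t+1..] | z[t]=k), with the last row
# all zeros, mirroring log_adjoint_prob above.
backward = np.zeros((num_steps, num_states))
for t in range(num_steps - 2, -1, -1):
    backward[t] = logsumexp(
        log_trans + (obs_log_probs[t + 1] + backward[t + 1])[None, :], axis=1)

total_log_prob = logsumexp(forward[-1])
posterior = np.exp(forward + backward - total_log_prob)   # each row sums to 1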
Example 30
    def _sample_n(self, num_samples, seed=None, name=None):
        """Returns a Tensor of samples from an LKJ distribution.

    Args:
      num_samples: Python `int`. The number of samples to draw.
      seed: Python integer seed for RNG
      name: Python `str` name prefixed to Ops created by this function.

    Returns:
      samples: A Tensor of correlation matrices with shape `[n, B, D, D]`,
        where `B` is the shape of the `concentration` parameter, and `D`
        is the `dimension`.

    Raises:
      ValueError: If `dimension` is negative.
    """
        if self.dimension < 0:
            raise ValueError(
                'Cannot sample negative-dimension correlation matrices.')
        # Notation below: B is the batch shape, i.e., tf.shape(concentration)
        seed = SeedStream(seed, 'sample_lkj')
        with tf.name_scope(name or 'sample_lkj'):
            concentration = tf.convert_to_tensor(self.concentration)
            if not dtype_util.is_floating(concentration.dtype):
                raise TypeError(
                    'The concentration argument should have floating type, not '
                    '{}'.format(dtype_util.name(concentration.dtype)))

            concentration = _replicate(num_samples, concentration)
            concentration_shape = tf.shape(concentration)
            if self.dimension <= 1:
                # For any dimension <= 1, there is only one possible correlation matrix.
                shape = tf.concat(
                    [concentration_shape, [self.dimension, self.dimension]],
                    axis=0)
                return tf.ones(shape=shape, dtype=concentration.dtype)
            beta_conc = concentration + (self.dimension - 2.) / 2.
            beta_dist = beta.Beta(concentration1=beta_conc,
                                  concentration0=beta_conc)

            # Note that the sampler below deviates from [1], by doing the sampling in
            # cholesky space. This does not change the fundamental logic of the
            # sampler, but does speed up the sampling.

            # This is the correlation coefficient between the first two dimensions.
            # This is also `r` in reference [1].
            corr12 = 2. * beta_dist.sample(seed=seed()) - 1.

            # Below we construct the Cholesky of the initial 2x2 correlation matrix,
            # which is of the form:
            # [[1, 0], [r, sqrt(1 - r**2)]], where r is the correlation between the
            # first two dimensions.
            # This is the top-left corner of the cholesky of the final sample.
            first_row = tf.concat(
                [tf.ones_like(corr12)[..., tf.newaxis],
                 tf.zeros_like(corr12)[..., tf.newaxis]],
                axis=-1)
            second_row = tf.concat(
                [corr12[..., tf.newaxis],
                 tf.sqrt(1 - corr12**2)[..., tf.newaxis]],
                axis=-1)

            chol_result = tf.concat(
                [first_row[..., tf.newaxis, :], second_row[..., tf.newaxis, :]],
                axis=-2)

            for n in range(2, self.dimension):
                # Loop invariant: on entry, result has shape B + [n, n]
                beta_conc = beta_conc - 0.5
                # norm is y in reference [1].
                norm = beta.Beta(concentration1=n / 2.,
                                 concentration0=beta_conc).sample(seed=seed())
                # distance shape: B + [1] for broadcast
                distance = tf.sqrt(norm)[..., tf.newaxis]
                # direction is u in reference [1].
                # direction shape: B + [n]
                direction = _uniform_unit_norm(n, concentration_shape,
                                               concentration.dtype, seed)
                # raw_correlation is w in reference [1].
                raw_correlation = distance * direction  # shape: B + [n]

                # This is the next row in the cholesky of the result,
                # which differs from the construction in reference [1].
                # In the reference, the new row `z` = chol_result @ raw_correlation^T
                # = C @ raw_correlation^T (where as short hand we use C = chol_result).
                # We prove that the below equation is the right row to add to the
                # cholesky, by showing equality with reference [1].
                # Let S be the sample constructed so far, and let `z` be as in
                # reference [1]. Then at this iteration, the new sample S' will be
                # [[S z^T]
                #  [z 1]]
                # In our case we have the cholesky decomposition factor C, so
                # we want our new row x (same size as z) to satisfy:
                #  [[S z^T]  [[C 0]    [[C^T  x^T]         [[CC^T  Cx^T]
                #   [z 1]] =  [x k]]    [0     k]]  =       [xC^T   xx^T + k**2]]
                # Since C @ raw_correlation^T = z = C @ x^T, and C is invertible,
                # we have that x = raw_correlation. Also 1 = xx^T + k**2, so k
                # = sqrt(1 - xx^T) = sqrt(1 - |raw_correlation|**2) = sqrt(1 -
                # distance**2).
                new_row = tf.concat(
                    [raw_correlation,
                     tf.sqrt(1. - norm[..., tf.newaxis])],
                    axis=-1)

                # Finally add this new row, by growing the cholesky of the result.
                chol_result = tf.concat(
                    [chol_result,
                     tf.zeros_like(chol_result[..., 0][..., tf.newaxis])],
                    axis=-1)

                chol_result = tf.concat(
                    [chol_result, new_row[..., tf.newaxis, :]], axis=-2)

            if self.input_output_cholesky:
                return chol_result

            result = tf.matmul(chol_result, chol_result, transpose_b=True)
            # The diagonal for a correlation matrix should always be ones. Due to
            # numerical instability the matmul might not achieve that, so manually set
            # these to ones.
            result = tf.linalg.set_diag(
                result, tf.ones(shape=tf.shape(result)[:-1],
                                dtype=result.dtype))
            # This sampling algorithm can produce near-PSD matrices on which standard
            # algorithms such as `tf.cholesky` or `tf.linalg.self_adjoint_eigvals`
            # fail. Specifically, as documented in b/116828694, around 2% of trials
            # of 900,000 5x5 matrices (distributed according to 9 different
            # concentration parameter values) contained at least one matrix on which
            # the Cholesky decomposition failed.
            return result
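The block identity in the comments, used to justify growing the Cholesky factor row by row, can be checked directly with NumPy. A minimal sketch, illustrative only: with S = C @ C.T and a new row x satisfying k = sqrt(1 - x @ x), the matrix [[C, 0], [x, k]] is a Cholesky factor of [[S, z^T], [z, 1]] where z = C @ x.

import numpy as np

rng = np.random.default_rng(0)
n = 3
corr = np.corrcoef(rng.normal(size=(n, 50)))   # some n x n correlation matrix S
chol = np.linalg.cholesky(corr)                # C
x = rng.normal(size=n)
x *= 0.9 / np.linalg.norm(x)                   # any row with |x| < 1
k = np.sqrt(1. - x @ x)

new_chol = np.block([[chol, np.zeros((n, 1))],
                     [x[None, :], np.array([[k]])]])
grown = new_chol @ new_chol.T
np.testing.assert_allclose(np.diag(grown), 1., atol=1e-12)   # still a correlation matrix
np.testing.assert_allclose(grown[:n, :n], corr)              # top-left block unchanged
np.testing.assert_allclose(grown[n, :n], chol @ x)           # the new correlations z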