Example #1
    def _sample_n(self, n, seed=None):
        seed = seed_stream.SeedStream(seed, salt='vom_mises_fisher')
        # The sampling strategy relies on the fact that vMF variates are symmetric
        # about the mean direction. Accordingly, if we have a sampling strategy for
        # the away-from-mean angle, then we can uniformly sample the remaining
        # dimensions on the S^{dim-2} sphere for free, and rotate these samples from a
        # (1, 0, 0, ..., 0)-mode distribution into the target orientation.
        #
        # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a
        # von-Mises distributed `x` value in [-1, 1], then uniformly select what
        # amounts to an "up" or "down" additional degree of freedom after unit
        # normalizing, followed by a final rotation to the desired mean direction
        # from a basis of (1, 0).
        #
        # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the
        # unit sphere over which the distribution is uniform; specifically, the
        # circle where the plane x = \hat{x} intersects the sphere. We pick a point on
        # that circle, then rotate to the desired mean direction from a basis of
        # (1, 0, 0).
        event_dim = (tf.dimension_value(self.event_shape[0])
                     or self._event_shape_tensor()[0])

        sample_batch_shape = tf.concat([[n], self._batch_shape_tensor()],
                                       axis=0)
        dim = tf.cast(event_dim - 1, self.dtype)
        if event_dim == 3:
            samples_dim0 = self._sample_3d(n, seed=seed)
        else:
            # Wood'94 provides a rejection algorithm to sample the x coordinate.
            # Wood'94 definition of b:
            # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim
            # https://stats.stackexchange.com/questions/156729 suggests:
            b = dim / (2 * self.concentration +
                       tf.sqrt(4 * self.concentration**2 + dim**2))
            # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE
            #     https://github.com/nicola-decao/s-vae-tf/
            x = (1 - b) / (1 + b)
            c = self.concentration * x + dim * tf.log1p(-x**2)
            beta = beta_lib.Beta(dim / 2, dim / 2)

            def cond_fn(w, should_continue):
                del w
                return tf.reduce_any(should_continue)

            def body_fn(w, should_continue):
                z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
                w = tf.where(should_continue,
                             (1 - (1 + b) * z) / (1 - (1 - b) * z), w)
                w = tf.check_numerics(w, 'w')
                should_continue = tf.logical_and(
                    should_continue,
                    self.concentration * w + dim * tf.log1p(-x * w) - c <
                    tf.log(
                        tf.random_uniform(sample_batch_shape,
                                          seed=seed(),
                                          dtype=self.dtype)))
                return w, should_continue

            w = tf.zeros(sample_batch_shape, dtype=self.dtype)
            should_continue = tf.ones(sample_batch_shape, dtype=tf.bool)
            samples_dim0 = tf.while_loop(cond_fn, body_fn,
                                         (w, should_continue))[0]
            samples_dim0 = samples_dim0[..., tf.newaxis]
        if not self._allow_nan_stats:
            # Verify samples lie within [-1, 1], with useful error output tensors
            # (reporting the top value rather than all values).
            with tf.control_dependencies([
                    tf.assert_less_equal(
                        samples_dim0,
                        self.dtype.as_numpy_dtype(1.01),
                        data=[tf.nn.top_k(tf.reshape(samples_dim0, [-1]))[0]]),
                    tf.assert_greater_equal(
                        samples_dim0,
                        self.dtype.as_numpy_dtype(-1.01),
                        data=[
                            -tf.nn.top_k(tf.reshape(-samples_dim0, [-1]))[0]
                        ])
            ]):
                samples_dim0 = tf.identity(samples_dim0)
        samples_otherdims_shape = tf.concat(
            [sample_batch_shape, [event_dim - 1]], axis=0)
        unit_otherdims = tf.nn.l2_normalize(tf.random_normal(
            samples_otherdims_shape, seed=seed(), dtype=self.dtype),
                                            axis=-1)
        samples = tf.concat(
            [
                samples_dim0,  # we must avoid sqrt(1 - (>1)**2)
                tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims
            ],
            axis=-1)
        samples = tf.nn.l2_normalize(samples, axis=-1)
        if not self._allow_nan_stats:
            samples = tf.check_numerics(samples, 'samples')

        # Runtime assert that samples are unit length.
        if not self._allow_nan_stats:
            worst, idx = tf.nn.top_k(
                tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1]))
            with tf.control_dependencies([
                    tf.assert_near(self.dtype.as_numpy_dtype(0),
                                   worst,
                                   data=[
                                       worst, idx,
                                       tf.gather(
                                           tf.reshape(samples,
                                                      [-1, event_dim]), idx)
                                   ],
                                   atol=1e-4,
                                   summarize=100)
            ]):
                samples = tf.identity(samples)
        # The samples generated are symmetric around a mode at (1, 0, 0, ..., 0).
        # Now, we move the mode to `self.mean_direction` using a rotation matrix.
        if not self._allow_nan_stats:
            # Assert that the basis vector rotates to the mean direction, as expected.
            basis = tf.cast(
                tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0),
                self.dtype)
            with tf.control_dependencies([
                    tf.assert_less(
                        tf.linalg.norm(self._rotate(basis) -
                                       self.mean_direction,
                                       axis=-1),
                        self.dtype.as_numpy_dtype(1e-5))
            ]):
                return self._rotate(samples)
        return self._rotate(samples)
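A minimal usage sketch of the public distribution this sampler backs (assuming `tensorflow_probability` exposes `VonMisesFisher` with `mean_direction` and `concentration` arguments, as in released versions):

import tensorflow as tf
import tensorflow_probability as tfp

# Sample from a vMF distribution on S^2, concentrated around a unit direction.
vmf = tfp.distributions.VonMisesFisher(
    mean_direction=tf.math.l2_normalize(tf.constant([1., 1., 0.])),
    concentration=10.)
draws = vmf.sample(5, seed=42)  # shape [5, 3]; each row has unit norm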
Example #2
    def testSampleMarginals(self):
        # Verify that the marginals of the LKJ distribution are distributed
        # according to a (scaled) Beta distribution. The LKJ distributed samples are
        # obtained by sampling a CholeskyLKJ distribution using HMC and the
        # CorrelationCholesky bijector.
        dim = 4
        concentration = np.array(2.5, dtype=np.float64)
        beta_concentration = np.array(.5 * dim + concentration - 1, np.float64)
        beta_dist = beta.Beta(concentration0=beta_concentration,
                              concentration1=beta_concentration)

        inner_kernel = hmc.HamiltonianMonteCarlo(
            target_log_prob_fn=cholesky_lkj.CholeskyLKJ(
                dimension=dim, concentration=concentration).log_prob,
            num_leapfrog_steps=3,
            step_size=0.3,
            seed=test_util.test_seed())

        kernel = transformed_kernel.TransformedTransitionKernel(
            inner_kernel=inner_kernel, bijector=tfb.CorrelationCholesky())

        num_chains = 10
        num_total_samples = 30000

        # Make sure that we have enough samples to catch a wrong sampler to within
        # a small enough discrepancy.
        self.assertLess(
            self.evaluate(
                st.min_num_samples_for_dkwm_cdf_test(discrepancy=0.04,
                                                     false_fail_rate=1e-9,
                                                     false_pass_rate=1e-9)),
            num_total_samples)

        @tf.function  # Ensure that MCMC sampling is done efficiently.
        def sample_mcmc_chain():
            return sample.sample_chain(
                num_results=num_total_samples // num_chains,
                num_burnin_steps=1000,
                current_state=tf.eye(dim,
                                     batch_shape=[num_chains],
                                     dtype=tf.float64),
                trace_fn=lambda _, pkr: pkr.inner_results.is_accepted,
                kernel=kernel,
                parallel_iterations=1)

        # Draw samples from the HMC chains.
        chol_lkj_samples, is_accepted = self.evaluate(sample_mcmc_chain())

        # Ensure that the per-chain acceptance rate is high enough.
        self.assertAllGreater(np.mean(is_accepted, axis=0), 0.8)

        # Transform from Cholesky LKJ samples to LKJ samples.
        lkj_samples = tf.matmul(chol_lkj_samples,
                                chol_lkj_samples,
                                adjoint_b=True)
        lkj_samples = tf.reshape(lkj_samples,
                                 shape=[num_total_samples, dim, dim])

        # Only look at the entries strictly below the diagonal, which is achieved by
        # the OutputToUnconstrained bijector. Also scale the marginals from the
        # range [-1,1] to [0,1].
        scaled_lkj_samples = .5 * (
            OutputToUnconstrained().forward(lkj_samples) + 1)

        # Each of the off-diagonal marginals should be distributed according to a
        # Beta distribution.
        for i in range(dim * (dim - 1) // 2):
            self.evaluate(
                st.assert_true_cdf_equal_by_dkwm(scaled_lkj_samples[..., i],
                                                 cdf=beta_dist.cdf,
                                                 false_fail_rate=1e-9))
Example #3
    def _sample_n(self, num_samples, seed=None, name=None):
        """Returns a Tensor of samples from an LKJ distribution.

    Args:
      num_samples: Python `int`. The number of samples to draw.
      seed: Python integer seed for RNG
      name: Python `str` name prefixed to Ops created by this function.

    Returns:
      samples: A Tensor of correlation matrices with shape `[n, B, D, D]`,
        where `B` is the shape of the `concentration` parameter, and `D`
        is the `dimension`.

    Raises:
      ValueError: If `dimension` is negative.
    """
        if self.dimension < 0:
            raise ValueError(
                'Cannot sample negative-dimension correlation matrices.')
        # Notation below: B is the batch shape, i.e., tf.shape(concentration)
        seed = seed_stream.SeedStream(seed, 'sample_lkj')
        with tf.name_scope(name or 'sample_lkj'):
            if not dtype_util.is_floating(self.concentration.dtype):
                raise TypeError(
                    'The concentration argument should have floating type, not '
                    '{}'.format(dtype_util.name(self.concentration.dtype)))

            concentration = _replicate(num_samples, self.concentration)
            concentration_shape = tf.shape(input=concentration)
            if self.dimension <= 1:
                # For any dimension <= 1, there is only one possible correlation matrix.
                shape = tf.concat(
                    [concentration_shape, [self.dimension, self.dimension]],
                    axis=0)
                return tf.ones(shape=shape, dtype=self.concentration.dtype)
            beta_conc = concentration + (self.dimension - 2.) / 2.
            beta_dist = beta.Beta(concentration1=beta_conc,
                                  concentration0=beta_conc)

            # Note that the sampler below deviates from [1], by doing the sampling in
            # cholesky space. This does not change the fundamental logic of the
            # sampler, but does speed up the sampling.

            # This is the correlation coefficient between the first two dimensions.
            # This is also `r` in reference [1].
            corr12 = 2. * beta_dist.sample(seed=seed()) - 1.

            # Below we construct the Cholesky of the initial 2x2 correlation matrix,
            # which is of the form:
            # [[1, 0], [r, sqrt(1 - r**2)]], where r is the correlation between the
            # first two dimensions.
            # This is the top-left corner of the cholesky of the final sample.
            first_row = tf.concat([
                tf.ones_like(corr12)[..., tf.newaxis],
                tf.zeros_like(corr12)[..., tf.newaxis]
            ],
                                  axis=-1)
            second_row = tf.concat([
                corr12[..., tf.newaxis],
                tf.sqrt(1 - corr12**2)[..., tf.newaxis]
            ],
                                   axis=-1)

            chol_result = tf.concat([
                first_row[..., tf.newaxis, :], second_row[..., tf.newaxis, :]
            ],
                                    axis=-2)

            for n in range(2, self.dimension):
                # Loop invariant: on entry, result has shape B + [n, n]
                beta_conc -= 0.5
                # norm is y in reference [1].
                norm = beta.Beta(concentration1=n / 2.,
                                 concentration0=beta_conc).sample(seed=seed())
                # distance shape: B + [1] for broadcast
                distance = tf.sqrt(norm)[..., tf.newaxis]
                # direction is u in reference [1].
                # direction shape: B + [n]
                direction = _uniform_unit_norm(n, concentration_shape,
                                               self.concentration.dtype, seed)
                # raw_correlation is w in reference [1].
                raw_correlation = distance * direction  # shape: B + [n]

                # This is the next row in the cholesky of the result,
                # which differs from the construction in reference [1].
                # In the reference, the new row `z` = chol_result @ raw_correlation^T
                # = C @ raw_correlation^T (where as short hand we use C = chol_result).
                # We prove that the below equation is the right row to add to the
                # cholesky, by showing equality with reference [1].
                # Let S be the sample constructed so far, and let `z` be as in
                # reference [1]. Then at this iteration, the new sample S' will be
                # [[S z^T]
                #  [z 1]]
                # In our case we have the cholesky decomposition factor C, so
                # we want our new row x (same size as z) to satisfy:
                #  [[S z^T]  [[C 0]    [[C^T  x^T]         [[CC^T  Cx^T]
                #   [z 1]] =  [x k]]    [0     k]]  =       [xC^T   xx^T + k**2]]
                # Since C @ raw_correlation^T = z = C @ x^T, and C is invertible,
                # we have that x = raw_correlation. Also 1 = xx^T + k**2, so k
                # = sqrt(1 - xx^T) = sqrt(1 - |raw_correlation|**2) = sqrt(1 -
                # distance**2).
                new_row = tf.concat(
                    [raw_correlation,
                     tf.sqrt(1. - norm[..., tf.newaxis])],
                    axis=-1)

                # Finally add this new row, by growing the cholesky of the result.
                chol_result = tf.concat([
                    chol_result,
                    tf.zeros_like(chol_result[..., 0][..., tf.newaxis])
                ],
                                        axis=-1)

                chol_result = tf.concat(
                    [chol_result, new_row[..., tf.newaxis, :]], axis=-2)

            if self.input_output_cholesky:
                return chol_result

            result = tf.matmul(chol_result, chol_result, transpose_b=True)
            # The diagonal for a correlation matrix should always be ones. Due to
            # numerical instability the matmul might not achieve that, so manually set
            # these to ones.
            result = tf.linalg.set_diag(
                result,
                tf.ones(shape=tf.shape(input=result)[:-1], dtype=result.dtype))
            # This sampling algorithm can produce near-PSD matrices on which standard
            # algorithms such as `tf.cholesky` or `tf.linalg.self_adjoint_eigvals`
            # fail. Specifically, as documented in b/116828694, around 2% of trials
            # of 900,000 5x5 matrices (distributed according to 9 different
            # concentration parameter values) contained at least one matrix on which
            # the Cholesky decomposition failed.
            return result
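A minimal usage sketch of the public distribution this method implements (assuming `tfp.distributions.LKJ` with `dimension` and `concentration` arguments, as in released versions):

import tensorflow_probability as tfp

# Draw 10 correlation matrices of shape [5, 5]; each has a unit diagonal and,
# per the note above, may be only near-PSD in rare cases.
lkj = tfp.distributions.LKJ(dimension=5, concentration=2.)
mats = lkj.sample(10, seed=42)  # shape [10, 5, 5]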
Example #4
    if inspect.isclass(condition):
        condition = lambda distribution, cls=condition: isinstance(  # pylint: disable=g-long-lambda
            distribution, cls)
    ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn


# Default substitutions attempt to express distributions using the most
# flexible available parameterization.
# pylint: disable=g-long-lambda
register_asvi_substitution_rule(
    half_normal.HalfNormal, lambda dist: truncated_normal.TruncatedNormal(
        loc=0., scale=dist.scale, low=0., high=dist.scale * 10.))
register_asvi_substitution_rule(
    uniform.Uniform, lambda dist: shift.Shift(dist.low)
    (scale_lib.Scale(dist.high - dist.low)
     (beta.Beta(concentration0=tf.ones_like(dist.mean()), concentration1=1.))))
register_asvi_substitution_rule(
    exponential.Exponential,
    lambda dist: gamma.Gamma(concentration=1., rate=dist.rate))
register_asvi_substitution_rule(
    chi2.Chi2, lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5))

# pylint: enable=g-long-lambda
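A hypothetical user-defined rule following the same pattern (the `laplace` and `normal` modules are assumed to be imported like the distribution modules above; any surrogate with matching support would do):

# Hypothetical sketch, not one of the default rules: substitute Laplace priors
# with a Normal surrogate over the same support (the whole real line).
register_asvi_substitution_rule(
    laplace.Laplace,
    lambda dist: normal.Normal(loc=dist.loc, scale=dist.scale))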


# TODO(kateslin): Add support for models with prior+likelihood written as
# a single JointDistribution.
def build_asvi_surrogate_posterior(prior,
                                   mean_field=False,
                                   initial_prior_weight=0.5,
                                   seed=None,
Example #5
 def testBetaProperty(self):
     a = [[1., 2, 3]]
     b = [[2., 4, 3]]
     dist = beta_lib.Beta(a, b)
     self.assertEqual([1, 3], dist.concentration0.shape)
     self.assertAllClose(b, self.evaluate(dist.concentration0))
Example #6
 def testLogPdfOnBoundaryIsFiniteWhenAlphaIsOne(self):
     b = [[0.01, 0.1, 1., 2], [5., 10., 2., 3]]
     pdf = self.evaluate(beta_lib.Beta(1., b).prob(0.))
     self.assertAllEqual(np.ones_like(pdf, dtype=np.bool_), np.isfinite(pdf))
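A quick independent check of the boundary behavior asserted above (a sketch assuming SciPy): with alpha = 1 the density is (1 - x)^(b - 1) / B(1, b), so at x = 0 it equals b, which is finite.

from scipy import stats

# Beta(1, b).pdf(0) == b, e.g. 2.0 for b = 2, so the (log-)pdf is finite there.
print(stats.beta(a=1., b=2.).pdf(0.))  # 2.0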
Example #7
def sample_lkj(
    num_samples,
    dimension,
    concentration,
    cholesky_space=False,
    seed=None,
    name=None):
  """Returns a Tensor of samples from an LKJ distribution.

  Args:
    num_samples: Python `int`. The number of samples to draw.
    dimension: Python `int`. The dimension of correlation matrices.
    concentration: `Tensor` representing the concentration of the LKJ
      distribution.
    cholesky_space: Python `bool`. Whether to take samples from LKJ or
      Chol(LKJ).
    seed: PRNG seed; see `tfp.random.sanitize_seed` for details.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    samples: A Tensor of correlation matrices (or Cholesky factors of
      correlation matrices if `cholesky_space = True`) with shape
      `[n] + B + [D, D]`, where `B` is the shape of the `concentration`
      parameter, and `D` is the `dimension`.

  Raises:
    ValueError: If `dimension` is negative.
  """
  if dimension < 0:
    raise ValueError(
        'Cannot sample negative-dimension correlation matrices.')
  # Notation below: B is the batch shape, i.e., tf.shape(concentration)

  with tf.name_scope(name or 'sample_lkj'):
    concentration = tf.convert_to_tensor(concentration)
    if not dtype_util.is_floating(concentration.dtype):
      raise TypeError(
          'The concentration argument should have floating type, not '
          '{}'.format(dtype_util.name(concentration.dtype)))

    batch_shape = ps.concat([[num_samples], ps.shape(concentration)], axis=0)
    dtype = concentration.dtype
    if dimension <= 1:
      # For any dimension <= 1, there is only one possible correlation matrix.
      shape = ps.concat([batch_shape, [dimension, dimension]], axis=0)
      return tf.ones(shape=shape, dtype=dtype)

    # We need 1 seed for beta and 1 seed for tril_spherical_uniform.
    beta_seed, tril_spherical_uniform_seed = samplers.split_seed(
        seed, n=2, salt='sample_lkj')

    # Note that the sampler below deviates from [1], by doing the sampling in
    # cholesky space. This does not change the fundamental logic of the
    # sampler, but does speed up the sampling.
    # In addition, we also vectorize the computation to make the sampler
    # more feasible to use in problems where `dimension` is large.

    beta_conc = concentration + (dimension - 2.) / 2.
    dimension_range = np.arange(
        1., dimension, dtype=dtype_util.as_numpy_dtype(dtype))
    beta_conc1 = dimension_range / 2.
    beta_conc0 = beta_conc[..., tf.newaxis] - (dimension_range - 1) / 2.
    beta_dist = beta.Beta(concentration1=beta_conc1, concentration0=beta_conc0)
    # norm is y in reference [1].
    norm = beta_dist.sample(sample_shape=[num_samples], seed=beta_seed)
    # distance shape: B + [dimension - 1, 1] for broadcast
    distance = tf.sqrt(norm)[..., tf.newaxis]

    # direction is u in reference [1].
    # direction follows the spherical uniform distribution and will be stored
    # in a lower triangular matrix, hence it will have shape:
    # B + [dimension - 1, dimension - 1]
    direction = _tril_spherical_uniform(dimension - 1, batch_shape, dtype,
                                        tril_spherical_uniform_seed)

    # raw_correlation is w in reference [1].
    # shape: B + [dimension - 1, dimension - 1]
    raw_correlation = distance * direction

    # These are the rows in the cholesky of the result,
    # which differ from the construction in reference [1].
    # In the reference, the new row `z` = chol_result @ raw_correlation^T
    # = C @ raw_correlation^T (where as short hand we use C = chol_result).
    # We prove that the below equation is the right row to add to the
    # cholesky, by showing equality with reference [1].
    # Let S be the sample constructed so far, and let `z` be as in
    # reference [1]. Then at this iteration, the new sample S' will be
    # [[S z^T]
    #  [z 1]]
    # In our case we have the cholesky decomposition factor C, so
    # we want our new row x (same size as z) to satisfy:
    #  [[S z^T]  [[C 0]    [[C^T  x^T]         [[CC^T  Cx^T]
    #   [z 1]] =  [x k]]    [0     k]]  =       [xC^T   xx^T + k**2]]
    # Since C @ raw_correlation^T = z = C @ x^T, and C is invertible,
    # we have that x = raw_correlation. Also 1 = xx^T + k**2, so k
    # = sqrt(1 - xx^T) = sqrt(1 - |raw_correlation|**2) = sqrt(1 -
    # distance**2).
    paddings_prepend = [[0, 0]] * len(batch_shape)
    diag = tf.pad(
        tf.sqrt(1. - norm), paddings_prepend + [[1, 0]], constant_values=1.)
    chol_result = tf.pad(
        raw_correlation,
        paddings_prepend + [[1, 0], [0, 1]],
        constant_values=0.)
    chol_result = tf.linalg.set_diag(chol_result, diag)

    if cholesky_space:
      return chol_result

    result = tf.matmul(chol_result, chol_result, transpose_b=True)
    # The diagonal for a correlation matrix should always be ones. Due to
    # numerical instability the matmul might not achieve that, so manually set
    # these to ones.
    result = tf.linalg.set_diag(
        result, tf.ones(shape=ps.shape(result)[:-1], dtype=result.dtype))
    # This sampling algorithm can produce near-PSD matrices on which standard
    # algorithms such as `tf.linalg.cholesky` or
    # `tf.linalg.self_adjoint_eigvals` fail. Specifically, as documented in
    # b/116828694, around 2% of trials of 900,000 5x5 matrices (distributed
    # according to 9 different concentration parameter values) contained at
    # least one matrix on which the Cholesky decomposition failed.
    return result
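A hypothetical call sketch for the function above (argument values are illustrative; the seed format follows `tfp.random.sanitize_seed`, as the docstring notes):

# Draw 3 Cholesky factors of 5x5 correlation matrices from LKJ(concentration=1.5).
chol = sample_lkj(
    num_samples=3,
    dimension=5,
    concentration=1.5,
    cholesky_space=True,
    seed=[0, 1])  # shape [3, 5, 5]; lower triangular with positive diagonal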
Example #8
 def testAlphaProperty(self):
     a = [[1., 2, 3]]
     b = [[2., 4, 3]]
     dist = beta_lib.Beta(a, b)
     self.assertEqual([1, 3], dist.concentration1.get_shape())
     self.assertAllClose(a, self.evaluate(dist.concentration1))
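A small clarifying sketch (assuming the same `beta_lib` import as the tests above): `concentration1` is the Beta alpha parameter and `concentration0` is beta, so the positional `Beta(a, b)` used in these tests maps to keywords as follows.

# Equivalent keyword form of the constructor used in the two tests above.
dist = beta_lib.Beta(concentration1=[[1., 2, 3]], concentration0=[[2., 4, 3]])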