Example 1
    def entropy(self, a, b):
        """Entropy of probability distribution.

        Parameters
        ----------
        a : tf.Tensor
            A n-D tensor with all elements constrained to :math:`a >
            0`.
        b : tf.Tensor
            A n-D tensor with all elements constrained to :math:`b >
            0`.

        Returns
        -------
        tf.Tensor
            A tensor of same shape as input.
        """
        a = tf.cast(tf.squeeze(a), dtype=tf.float32)
        b = tf.cast(tf.squeeze(b), dtype=tf.float32)
        if len(a.get_shape()) == 0:
            return tf.lbeta(tf.stack([a, b])) - \
                   (a - 1.0) * tf.digamma(a) - \
                   (b - 1.0) * tf.digamma(b) + \
                   (a + b - 2.0) * tf.digamma(a+b)
        else:
            return tf.lbeta(tf.concat(
                         [tf.expand_dims(a, 1), tf.expand_dims(b, 1)], 1)) - \
                   (a - 1.0) * tf.digamma(a) - \
                   (b - 1.0) * tf.digamma(b) + \
                   (a + b - 2.0) * tf.digamma(a+b)
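For reference (an addition, not part of the original example), the closed form above can be checked numerically with SciPy, assuming SciPy is available:

import numpy as np
from scipy.special import betaln, digamma
from scipy.stats import beta as beta_dist

# Entropy of Beta(a, b): ln B(a, b) - (a-1)*digamma(a) - (b-1)*digamma(b) + (a+b-2)*digamma(a+b)
a, b = 2.0, 3.0
closed_form = (betaln(a, b)
               - (a - 1.0) * digamma(a)
               - (b - 1.0) * digamma(b)
               + (a + b - 2.0) * digamma(a + b))
assert np.isclose(closed_form, beta_dist(a, b).entropy())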
Example 2
 def test_two_dimensional_arg(self):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
     with self.test_session(use_gpu=self._use_gpu):
         self.assertAllClose([0.5, 0.5],
                             tf.exp(tf.lbeta(x_one_half)).eval())
         self.assertEqual((2, ), tf.lbeta(x_one_half).get_shape())
Example 3
    def entropy(self, alpha):
        """Entropy of probability distribution.

        Parameters
        ----------
        alpha : tf.Tensor
            A n-D tensor with each :math:`\\alpha` constrained to
            :math:`\\alpha_i > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of one dimension less than the input.
        """
        alpha = tf.cast(alpha, dtype=tf.float32)
        multivariate_idx = len(get_dims(alpha)) - 1
        K = get_dims(alpha)[multivariate_idx]
        if multivariate_idx == 0:
            a = tf.reduce_sum(alpha)
            return tf.lbeta(alpha) + \
                   (a - K) * tf.digamma(a) - \
                   tf.reduce_sum((alpha-1.0) * tf.digamma(alpha))
        else:
            a = tf.reduce_sum(alpha, multivariate_idx)
            return tf.lbeta(alpha) + \
                   (a - K) * tf.digamma(a) - \
                   tf.reduce_sum((alpha-1.0) * tf.digamma(alpha), multivariate_idx)
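As a sanity check (added here with illustrative values, assuming SciPy), the Dirichlet entropy formula used above can be compared against scipy.stats.dirichlet:

import numpy as np
from scipy.special import digamma, gammaln
from scipy.stats import dirichlet

alpha = np.array([1.5, 2.0, 3.0])
a0, K = alpha.sum(), alpha.size
lbeta = gammaln(alpha).sum() - gammaln(a0)  # ln B(alpha)
closed_form = lbeta + (a0 - K) * digamma(a0) - ((alpha - 1.0) * digamma(alpha)).sum()
assert np.isclose(closed_form, dirichlet(alpha).entropy())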
Example 4
 def test_two_dimensional_proper_shape(self):
   # Should evaluate to 1/2.
   x_one_half = [[2, 1.], [2, 1.]]
   with self.test_session(use_gpu=self._use_gpu):
     self.assertAllClose([0.5, 0.5], tf.exp(tf.lbeta(x_one_half)).eval())
     self.assertEqual((2,), tf.shape(tf.lbeta(x_one_half)).eval())
     self.assertEqual(tf.TensorShape([2]), tf.lbeta(x_one_half).get_shape())
Example 5
    def kl_divergence(self,
                      alpha,
                      alpha_prior,
                      i_perm=None,
                      wrt='Dirichlet-Marginals'):
        """
        Computes the KL divergence between the Kumaraswamy q distributions and the Dirichlet prior's Beta marginals.
        :param alpha: posterior approximation Dirichlet parameters
        :param alpha_prior: prior Dirichlet parameters
        :param i_perm: random permutation indices used during sampling procedure
        :param wrt: what the KL divergence is taken with respect to, either the Dirichlet marginals or the Beta stick breaks
        :return: KL divergence of marginal Beta distributions of shape [batch size x K]
        """
        assert wrt in {'Dirichlet-Marginals', 'Beta-Sticks'}

        # apply permutation if one was provided
        if i_perm is not None:
            alpha_prior = self.__parameter_rank_check(alpha_prior)
            alpha_prior = tf.tile(alpha_prior, tf.stack(
                (tf.shape(alpha)[0], 1)))
            alpha = tf.batch_gather(alpha, i_perm)
            alpha_prior = tf.batch_gather(alpha_prior, i_perm)

        # take KL divergence w.r.t. the Dirichlet's marginal Betas
        if wrt == 'Dirichlet-Marginals':

            # compute marginal q(pi; a', b') approximation parameters
            a_prime = self.__parameter_rank_check(alpha)
            b_prime = tf.reduce_sum(a_prime, axis=1, keepdims=True) - a_prime

            # compute marginal p(pi; a, b) prior parameters
            a_prior = self.__parameter_rank_check(alpha_prior)
            b_prior = tf.reduce_sum(a_prior, axis=1, keepdims=True) - a_prior

        # take KL divergence w.r.t. the stick-breaking marginal Betas
        else:

            # compute marginal q(pi; a', b') approximation parameters
            a_prime, b_prime = self.__stick_break_parameters(alpha)

            # compute marginal p(pi; a, b) prior parameters
            a_prior, b_prior = self.__stick_break_parameters(alpha_prior)

        # KL-Divergence
        kl = (a_prime - a_prior) / a_prime * (-np.euler_gamma - tf.digamma(b_prime) - 1 / b_prime) \
            + (tf.log(a_prime * b_prime)) \
            + (tf.lbeta(tf.stack((a_prior, b_prior), axis=-1))) \
            - (b_prime - 1) / b_prime
        for m in range(1, self.M + 1):
            B = tf.exp(
                tf.lbeta(
                    tf.concat((tf.expand_dims(m / a_prime, axis=-1),
                               tf.expand_dims(b_prime, axis=-1)),
                              axis=-1)))
            kl += (b_prior - 1) * b_prime / (m + a_prime * b_prime) * B

        # sum over the dimensions
        kl = tf.reduce_sum(kl, axis=1)

        return kl
Example 6
 def _log_prob(self, counts):
   counts = self._maybe_assert_valid_sample(counts)
   ordered_prob = (
       tf.lbeta(self.concentration + counts)
       - tf.lbeta(self.concentration))
   return ordered_prob + distribution_util.log_combinations(
       self.total_count, counts)
Example 7
 def test_empty_rank2_or_greater_input_gives_empty_output_dynamic_alloc(
         self):
     with self.test_session(use_gpu=self._use_gpu):
         ph = tf.placeholder(tf.float32)
         self.assertAllEqual([], tf.lbeta(ph).eval(feed_dict={ph: [[]]}))
         self.assertAllEqual([[]],
                             tf.lbeta(ph).eval(feed_dict={ph: [[[]]]}))
Example 8
    def entropy(self, alpha):
        """Entropy of probability distribution.

        Parameters
        ----------
        alpha : tf.Tensor
            A n-D tensor with each :math:`\\alpha` constrained to
            :math:`\\alpha_i > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of one dimension less than the input.
        """
        alpha = tf.cast(alpha, dtype=tf.float32)
        multivariate_idx = len(get_dims(alpha)) - 1
        K = get_dims(alpha)[multivariate_idx]
        if multivariate_idx == 0:
            a = tf.reduce_sum(alpha)
            return tf.lbeta(alpha) + \
                   (a - K) * tf.digamma(a) - \
                   tf.reduce_sum((alpha-1.0) * tf.digamma(alpha))
        else:
            a = tf.reduce_sum(alpha, multivariate_idx)
            return tf.lbeta(alpha) + \
                   (a - K) * tf.digamma(a) - \
                   tf.reduce_sum((alpha-1.0) * tf.digamma(alpha), multivariate_idx)
Example 9
    def logpdf(self, x, alpha):
        """Log of the probability density function.

        Parameters
        ----------
        x : tf.Tensor
            A n-D tensor for n > 1, where the inner (right-most)
            dimension represents the multivariate dimension.
        alpha : tf.Tensor
            A tensor of same shape as ``x``, and with each
            :math:`\\alpha` constrained to :math:`\\alpha_i > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of one dimension less than the input.
        """
        x = tf.cast(x, dtype=tf.float32)
        alpha = tf.cast(alpha, dtype=tf.float32)
        multivariate_idx = len(get_dims(x)) - 1
        if multivariate_idx == 0:
            return -tf.lbeta(alpha) + tf.reduce_sum((alpha-1.0) * tf.log(x))
        else:
            return -tf.lbeta(alpha) + \
                   tf.reduce_sum((alpha-1.0) * tf.log(x), multivariate_idx)
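A quick numerical cross-check of this Dirichlet log density (added for illustration, assuming SciPy; the values are arbitrary):

import numpy as np
from scipy.special import gammaln
from scipy.stats import dirichlet

alpha = np.array([1.5, 2.0, 3.0])
x = np.array([0.2, 0.3, 0.5])                        # must lie on the simplex
lbeta = gammaln(alpha).sum() - gammaln(alpha.sum())  # ln B(alpha)
log_pdf = -lbeta + ((alpha - 1.0) * np.log(x)).sum()
assert np.isclose(log_pdf, dirichlet(alpha).logpdf(x))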
Example 10
    def entropy(self, a, b):
        """Entropy of probability distribution.

        Parameters
        ----------
        a : tf.Tensor
            A n-D tensor with all elements constrained to :math:`a >
            0`.
        b : tf.Tensor
            A n-D tensor with all elements constrained to :math:`b >
            0`.

        Returns
        -------
        tf.Tensor
            A tensor of same shape as input.
        """
        a = tf.cast(tf.squeeze(a), dtype=tf.float32)
        b = tf.cast(tf.squeeze(b), dtype=tf.float32)
        if len(a.get_shape()) == 0:
            return tf.lbeta(tf.stack([a, b])) - \
                   (a - 1.0) * tf.digamma(a) - \
                   (b - 1.0) * tf.digamma(b) + \
                   (a + b - 2.0) * tf.digamma(a+b)
        else:
            return tf.lbeta(tf.concat(
                         [tf.expand_dims(a, 1), tf.expand_dims(b, 1)], 1)) - \
                   (a - 1.0) * tf.digamma(a) - \
                   (b - 1.0) * tf.digamma(b) + \
                   (a + b - 2.0) * tf.digamma(a+b)
Example 11
    def logpdf(self, x, alpha):
        """Log of the probability density function.

        Parameters
        ----------
        x : tf.Tensor
            A n-D tensor for n > 1, where the inner (right-most)
            dimension represents the multivariate dimension.
        alpha : tf.Tensor
            A tensor of same shape as ``x``, and with each
            :math:`\\alpha` constrained to :math:`\\alpha_i > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of one dimension less than the input.
        """
        x = tf.cast(x, dtype=tf.float32)
        alpha = tf.cast(alpha, dtype=tf.float32)
        multivariate_idx = len(get_dims(x)) - 1
        if multivariate_idx == 0:
            return -tf.lbeta(alpha) + tf.reduce_sum((alpha - 1.0) * tf.log(x))
        else:
            return -tf.lbeta(alpha) + \
                   tf.reduce_sum((alpha-1.0) * tf.log(x), multivariate_idx)
Example 12
 def test_one_dimensional_arg(self):
     # Should evaluate to 1 and 1/2.
     x_one = [1, 1.]
     x_one_half = [2, 1.]
     with self.test_session(use_gpu=self._use_gpu):
         self.assertAllClose(1, tf.exp(tf.lbeta(x_one)).eval())
         self.assertAllClose(0.5, tf.exp(tf.lbeta(x_one_half)).eval())
         self.assertEqual([], tf.lbeta(x_one).get_shape())
Example 13
 def test_one_dimensional_arg(self):
   # Should evaluate to 1 and 1/2.
   x_one = [1, 1.]
   x_one_half = [2, 1.]
   with self.test_session(use_gpu=self._use_gpu):
     self.assertAllClose(1, tf.exp(tf.lbeta(x_one)).eval())
     self.assertAllClose(0.5, tf.exp(tf.lbeta(x_one_half)).eval())
     self.assertEqual([], tf.lbeta(x_one).get_shape())
Example 14
 def test_length_1_last_dimension_results_in_one(self):
   # If there is only one coefficient, the formula still works, and we get one
   # as the answer, always.
   x_a = [5.5]
   x_b = [0.1]
   with self.test_session(use_gpu=self._use_gpu):
     self.assertAllClose(1, tf.exp(tf.lbeta(x_a)).eval())
     self.assertAllClose(1, tf.exp(tf.lbeta(x_b)).eval())
     self.assertEqual((), tf.lbeta(x_a).get_shape())
Example 15
 def test_length_1_last_dimension_results_in_one(self):
     # If there is only one coefficient, the formula still works, and we get one
     # as the answer, always.
     x_a = [5.5]
     x_b = [0.1]
     with self.test_session(use_gpu=self._use_gpu):
         self.assertAllClose(1, tf.exp(tf.lbeta(x_a)).eval())
         self.assertAllClose(1, tf.exp(tf.lbeta(x_b)).eval())
         self.assertEqual((), tf.lbeta(x_a).get_shape())
Example 16
 def build_annot_KL(self):
     alpha_diff = self.alpha_tilde - self.alpha
     KL_annot = (tf.reduce_sum(
         tf.multiply(alpha_diff, tf.digamma(self.alpha_tilde))) -
                 tf.reduce_sum(
                     tf.digamma(tf.reduce_sum(self.alpha_tilde, 1)) *
                     tf.reduce_sum(alpha_diff, 1)) + tf.reduce_sum(
                         tf.lbeta(tf.matrix_transpose(self.alpha)) -
                         tf.lbeta(tf.matrix_transpose(self.alpha_tilde))))
     return KL_annot
Example 17
    def logp(self, Zs, X):
        memberships, edgecounts, notedgecounts = self.suffstats(Zs, X)
        lnprior = tf.reduce_sum([tf.lbeta(a+m) - tf.lbeta(a)
                                 for a, m in zip(self.alphas, memberships)])


        lnlink = tf.reduce_sum(tf.lbeta(tf.stack([self.a + edgecounts,
                                                  self.b + notedgecounts],
                                                  axis=2)) -
                               tf.lbeta(tf.stack([[[self.a]], [[self.b]]], axis=2)))
        return lnprior + lnlink
Example 18
    def logpdf(self, x, a, b):
        """Log of the probability density function.

        Parameters
        ----------
        x : tf.Tensor
            A n-D tensor.
        a : tf.Tensor
            A tensor of same shape as ``x``, and with all elements
            constrained to :math:`a > 0`.
        b : tf.Tensor
            A tensor of same shape as ``x``, and with all elements
            constrained to :math:`b > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of same shape as input.
        """
        x = tf.cast(x, dtype=tf.float32)
        a = tf.cast(tf.squeeze(a), dtype=tf.float32)
        b = tf.cast(tf.squeeze(b), dtype=tf.float32)
        return (a - 1.0) * tf.log(x) + \
               (b - 1.0) * tf.log(1.0-x) - \
               tf.lbeta(tf.stack([a, b]))
Example 19
 def test_two_dimensional_arg_dynamic_alloc(self):
   # Should evaluate to 1/2.
   x_one_half = [[2, 1.], [2, 1.]]
   with self.test_session(use_gpu=self._use_gpu):
     ph = tf.placeholder(tf.float32)
     beta_ph = tf.exp(tf.lbeta(ph))
     self.assertAllClose([0.5, 0.5], beta_ph.eval(feed_dict={ph: x_one_half}))
Example 20
 def test_two_dimensional_arg_dynamic_alloc(self):
   # Should evaluate to 1/2.
   x_one_half = [[2, 1.], [2, 1.]]
   with self.test_session(use_gpu=self._use_gpu):
     ph = tf.placeholder(tf.float32)
     beta_ph = tf.exp(tf.lbeta(ph))
     self.assertAllClose([0.5, 0.5], beta_ph.eval(feed_dict={ph: x_one_half}))
Example 21
    def logpdf(self, x, a, b):
        """Log of the probability density function.

        Parameters
        ----------
        x : tf.Tensor
            A n-D tensor.
        a : tf.Tensor
            A tensor of same shape as ``x``, and with all elements
            constrained to :math:`a > 0`.
        b : tf.Tensor
            A tensor of same shape as ``x``, and with all elements
            constrained to :math:`b > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of same shape as input.
        """
        x = tf.cast(x, dtype=tf.float32)
        a = tf.cast(tf.squeeze(a), dtype=tf.float32)
        b = tf.cast(tf.squeeze(b), dtype=tf.float32)
        return (a - 1.0) * tf.log(x) + \
               (b - 1.0) * tf.log(1.0-x) - \
               tf.lbeta(tf.stack([a, b]))
Example 22
 def _entropy(self):
     v = tf.ones(self.batch_shape_tensor(), dtype=self.dtype)[...,
                                                              tf.newaxis]
     u = v * self.df[..., tf.newaxis]
     beta_arg = tf.concat([u, v], -1) / 2.
     return (tf.log(tf.abs(self.scale)) + 0.5 * tf.log(self.df) +
             tf.lbeta(beta_arg) + 0.5 * (self.df + 1.) *
             (tf.digamma(0.5 * (self.df + 1.)) - tf.digamma(0.5 * self.df)))
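The expression above is the standard Student-t entropy with a scale term; a minimal NumPy/SciPy sketch of the same closed form (illustrative values, not from the original code):

import numpy as np
from scipy.special import betaln, digamma
from scipy.stats import t as student_t

df, scale = 5.0, 2.0
entropy = (np.log(abs(scale)) + 0.5 * np.log(df)
           + betaln(0.5 * df, 0.5)  # lbeta([df/2, 1/2])
           + 0.5 * (df + 1.) * (digamma(0.5 * (df + 1.)) - digamma(0.5 * df)))
assert np.isclose(entropy, student_t(df, scale=scale).entropy())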
Example 23
 def _moment(self, n):
   """Compute the n'th (uncentered) moment."""
   total_concentration = self.concentration1 + self.concentration0
   expanded_concentration1 = tf.ones_like(
       total_concentration, dtype=self.dtype) * self.concentration1
   expanded_concentration0 = tf.ones_like(
       total_concentration, dtype=self.dtype) * self.concentration0
   beta_arg0 = 1 + n / expanded_concentration1
   beta_arg = tf.stack([beta_arg0, expanded_concentration0], -1)
   log_moment = tf.log(expanded_concentration0) + tf.lbeta(beta_arg)
   return tf.exp(log_moment)
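The log-moment above corresponds to E[X^n] = b * B(1 + n/a, b) for a Kumaraswamy(a, b) variable; a small verification by numerical integration (added as an illustration, assuming SciPy):

import numpy as np
from scipy.integrate import quad
from scipy.special import betaln

a, b, n = 2.0, 3.0, 2  # Kumaraswamy(a, b), second raw moment
closed_form = np.exp(np.log(b) + betaln(1.0 + n / a, b))
kuma_pdf = lambda x: a * b * x**(a - 1) * (1 - x**a)**(b - 1)
numeric, _ = quad(lambda x: x**n * kuma_pdf(x), 0.0, 1.0)
assert np.isclose(closed_form, numeric)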
Example 24
 def _entropy(self):
   v = tf.ones(self.batch_shape_tensor(),
               dtype=self.dtype)[..., tf.newaxis]
   u = v * self.df[..., tf.newaxis]
   beta_arg = tf.concat([u, v], -1) / 2.
   return (tf.log(tf.abs(self.scale)) +
           0.5 * tf.log(self.df) +
           tf.lbeta(beta_arg) +
           0.5 * (self.df + 1.) *
           (tf.digamma(0.5 * (self.df + 1.)) -
            tf.digamma(0.5 * self.df)))
Example 25
 def _moment(self, n):
     """Compute the n'th (uncentered) moment."""
     total_concentration = self.concentration1 + self.concentration0
     expanded_concentration1 = tf.ones_like(
         total_concentration, dtype=self.dtype) * self.concentration1
     expanded_concentration0 = tf.ones_like(
         total_concentration, dtype=self.dtype) * self.concentration0
     beta_arg0 = 1 + n / expanded_concentration1
     beta_arg = tf.stack([beta_arg0, expanded_concentration0], -1)
     log_moment = tf.log(expanded_concentration0) + tf.lbeta(beta_arg)
     return tf.exp(log_moment)
Example 26
def KL(kuma_a, kuma_b, beta_a, beta_b, terms=10):
    """
    Here I give you an example of how to code the KL between Kumaraswamy and Beta distributions.

    In the theory notebook: Kuma(alpha, beta), Beta(a, b)
    Here: Kuma(kuma_a, kuma_b), Beta(beta_a, beta_b).

    I am assuming at this point you have already clipped the parameters of 
        Kuma and Beta between 0.001 and 10 (for example), 
        to make sure we do not have 0s and to make sure we do not have large numbers.

    Note that:
        kuma_a, kuma_b, beta_a and beta_b should all have the same shape
    
    I am not doing it here, but I suggest you clip the resulting KL from below at 0;
        this prevents your optimiser from opportunistically exploiting numerical instabilities
        due to the truncated Taylor expansion.

    I hope this helps :)
    """
    kl = (kuma_a - beta_a) / kuma_a * (-np.euler_gamma - tf.digamma(kuma_b) -
                                       1.0 / kuma_b)
    kl += tf.log(kuma_a * kuma_b) + tf.lbeta(
        tf.concat([tf.expand_dims(beta_a, -1),
                   tf.expand_dims(beta_b, -1)], -1))
    kl += -(kuma_b - 1) / kuma_b
    # A useful identity:
    #   B(a,b) = exp(log Gamma(a) + log Gamma(b) - log Gamma(a+b))
    # but here we simply exponentiate tf.lbeta instead, feel free to use whichever version you prefer
    betafn = lambda a, b: tf.exp(
        tf.lbeta(tf.concat([tf.expand_dims(a, -1),
                            tf.expand_dims(b, -1)], -1)))
    # Truncated Taylor expansion around 1
    taylor = tf.zeros(tf.shape(kuma_a))
    for m in range(1, terms +
                   1):  # m should start from 1 (otherwise betafn will be inf)!
        taylor += betafn(m / kuma_a, kuma_b) / (m + kuma_a * kuma_b)
    kl += (beta_b - 1) * kuma_b * taylor
    return kl
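A hypothetical usage sketch for the function above (the tensor values and the clipping range are illustrative, following the docstring's advice; the final clamp at zero is the suggested guard against the truncated Taylor expansion):

import tensorflow as tf

kuma_a = tf.clip_by_value(tf.constant([[1.3, 0.8]]), 0.001, 10.0)
kuma_b = tf.clip_by_value(tf.constant([[2.0, 1.5]]), 0.001, 10.0)
beta_a = tf.ones_like(kuma_a)  # Beta(1, 1), i.e. a uniform prior
beta_b = tf.ones_like(kuma_b)
kl = tf.maximum(0.0, KL(kuma_a, kuma_b, beta_a, beta_b, terms=10))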
Example 27
    def batch_logp(self, Z, X, observed=None):
        #unpack from singleton list
        if len(Z.shape)==4:
            Z = Z[0]
        if observed is None:
            observed = self._defaultobserved
        else:
            observed = tf.convert_to_tensor(observed, dtype)
        membership = tf.reduce_sum(Z, axis=1, keepdims=True)
        edgecounts = tf.einsum('bmk,mn,bnl->bkl', Z, observed*X, Z) #Z^T*X*Z
        notedgecounts = tf.einsum('bmk,mn,bnl->bkl', Z, observed, Z) - edgecounts


        lnprior = tf.squeeze(tf.lbeta(self.alpha + membership) -
                             tf.lbeta(self.alpha + tf.zeros_like(membership)))
        lnlink = tf.reduce_sum(tf.lbeta(tf.stack([self.a + edgecounts,
                                                  self.b + notedgecounts],
                                                  axis=3)) -
                               tf.lbeta(tf.stack([self.a + tf.zeros_like(edgecounts),
                                                  self.b + tf.zeros_like(notedgecounts)],
                                                  axis=3)), axis=[1,2])
        return lnprior + lnlink
Example 28
 def _log_prob(self, given):
     given, alpha = maybe_explicit_broadcast(given, self.alpha, 'given',
                                             'alpha')
     lbeta_alpha = tf.lbeta(alpha)
     # fix of no static shape inference for tf.lbeta
     if alpha.get_shape():
         lbeta_alpha.set_shape(alpha.get_shape()[:-1])
     log_given = tf.log(given)
     if self._check_numerics:
         lbeta_alpha = tf.check_numerics(lbeta_alpha, "lbeta(alpha)")
         log_given = tf.check_numerics(log_given, "log(given)")
     log_p = -lbeta_alpha + tf.reduce_sum((alpha - 1) * log_given, -1)
     return log_p
Example 29
    def logp(self, Z, X, observed=None):
        #unpack from singleton list
        if len(Z.shape)==3:
            Z = Z[0]

        if observed is None:
            observed = tf.convert_to_tensor(np.triu(np.ones((self.N, self.N), dtype=dtype), 1))

        membership = tf.reduce_sum(Z, axis=0, keepdims=True)
        edgecounts = tf.einsum('mk,mn,nl', Z, observed*X, Z) #Z^T*X*Z
        notedgecounts = tf.einsum('mk,mn,nl', Z, observed, Z) - edgecounts

        lnprior = tf.reduce_sum(tf.lbeta(self.alpha + membership) -
                                tf.lbeta(self.alpha + tf.zeros_like(membership)))
        lnlink = tf.reduce_sum(tf.lbeta(tf.stack([self.a + edgecounts,
                                                  self.b + notedgecounts],
                                                  axis=2)) -
                               tf.lbeta(tf.stack([self.a + tf.zeros_like(edgecounts),
                                                  self.b + tf.zeros_like(notedgecounts)],
                                                  axis=2)))
        return lnprior + lnlink
Example 30
    def kl(self, params_i: list, params_j: list):
        """
        KL(Kuma(a', b') || Beta(a, b))

        :param params_i: [kuma_a, kuma_b]
        :param params_j: [beta_a, beta_b]
        :return:
        """
        kuma_a, kuma_b = params_i
        beta_a, beta_b = params_j
        term1 = (kuma_a - beta_a) / kuma_a * (- np.euler_gamma - tf.digamma(kuma_b) - 1.0 / kuma_b)
        term1 += tf.log(kuma_a * kuma_b) + tf.lbeta(
            tf.concat([tf.expand_dims(beta_a, -1), tf.expand_dims(beta_b, -1)], -1))
        term1 += - (kuma_b - 1) / kuma_b
        # Truncated Taylor expansion around 1
        taylor = tf.zeros(tf.shape(kuma_a))
        for m in range(1, self._num_terms + 1):  # m should start from 1 (otherwise betafn will be inf)!
            taylor += tf_beta_fn(m / kuma_a, kuma_b) / (m + kuma_a * kuma_b)
        term2 = (beta_b - 1) * kuma_b * taylor
        return term1 + term2  # tf.maximum(0., term1 + term2)
Example 31
 def kl(self, other: 'Kuma'):
     # TODO: Kuma || Kuma rather than Kuma || Beta
     if not isinstance(other, Kuma):
         raise ValueError(
             'I expected another kuma distribution: KL[self || other]')
     kuma_a, kuma_b = self._alpha, self._beta
     beta_a, beta_b = other._alpha, other._beta
     term1 = (kuma_a - beta_a) / kuma_a * (
         -np.euler_gamma - tf.digamma(kuma_b) - 1.0 / kuma_b)
     term1 += tf.log(kuma_a * kuma_b) + tf.lbeta(
         tf.concat([tf.expand_dims(beta_a, -1),
                    tf.expand_dims(beta_b, -1)], -1))
     term1 += -(kuma_b - 1) / kuma_b
     # Truncated Taylor expansion around 1
     taylor = tf.zeros(tf.shape(kuma_a))
     for m in range(
             1, self._num_terms +
             1):  # m should start from 1 (otherwise betafn will be inf)!
         taylor += dist.tf_beta_fn(m / kuma_a,
                                   kuma_b) / (m + kuma_a * kuma_b)
     term2 = (beta_b - 1) * kuma_b * taylor
     return term1 + term2  # tf.maximum(0., term1 + term2)
Example 32
 def test_empty_rank2_or_greater_input_gives_empty_output(self):
     with self.test_session(use_gpu=self._use_gpu):
         self.assertAllEqual([], tf.lbeta([[]]).eval())
         self.assertEqual((0, ), tf.lbeta([[]]).get_shape())
         self.assertAllEqual([[]], tf.lbeta([[[]]]).eval())
         self.assertEqual((1, 0), tf.lbeta([[[]]]).get_shape())
Example 33
import tensorflow as tf

"""tf.lbeta(x,name=None)
功能:计算`ln(|Beta(x)|)`,并以最末尺度进行归纳。
          最末尺度`z = [z_0,...,z_{K-1}]`,则Beta(z) = \prod_j Gamma(z_j) / Gamma(\sum_j z_j)
输入:x为秩为n+1的张量,可以为'float','double'类型。"""
x = tf.constant([[4, 3, 3], [2, 3, 2]], tf.float64)
z = tf.lbeta(x)

# ln(gamma(4)*gamma(3)*gamma(3)/gamma(4+3+3))=ln(6*2*2/362880)=-9.62377365
# ln(gamma(2)*gamma(3)*gamma(2)/gamma(2+3+2))=ln(2/720)=-5.88610403
sess = tf.Session()
print(sess.run(z))
sess.close()
# z==>[-9.62377365 -5.88610403]
# This is how the Beta function is computed: it is built from the Gamma function, where Gamma(n) = (n-1)!
# For example: gamma(4) = 3*2*1 = 6
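The same values can be reproduced outside TensorFlow via ln B(z) = sum_j lgamma(z_j) - lgamma(sum_j z_j); a small SciPy cross-check (added for illustration):

import numpy as np
from scipy.special import gammaln

x_np = np.array([[4, 3, 3], [2, 3, 2]], dtype=np.float64)
lbeta_np = gammaln(x_np).sum(axis=-1) - gammaln(x_np.sum(axis=-1))
print(lbeta_np)  # [-9.62377365 -5.88610403]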
Example 34
def _kl_dirichlet_dirichlet(d1, d2, name=None):
    """Batchwise KL divergence KL(d1 || d2) with d1 and d2 Dirichlet.

  Args:
    d1: instance of a Dirichlet distribution object.
    d2: instance of a Dirichlet distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_dirichlet_dirichlet".

  Returns:
    Batchwise KL(d1 || d2)
  """
    with tf.name_scope(name,
                       "kl_dirichlet_dirichlet",
                       values=[d1.concentration, d2.concentration]):
        # The KL between Dirichlet distributions can be derived as follows. We have
        #
        #   Dir(x; a) = 1 / B(a) * prod_i[x[i]^(a[i] - 1)]
        #
        # where B(a) is the multivariate Beta function:
        #
        #   B(a) = Gamma(a[1]) * ... * Gamma(a[n]) / Gamma(a[1] + ... + a[n])
        #
        # The KL is
        #
        #   KL(Dir(x; a), Dir(x; b)) = E_Dir(x; a){log(Dir(x; a) / Dir(x; b))}
        #
        # so we'll need to know the log density of the Dirichlet. This is
        #
        #   log(Dir(x; a)) = sum_i[(a[i] - 1) log(x[i])] - log B(a)
        #
        # The only term that matters for the expectations is the log(x[i]). To
        # compute the expectation of this term over the Dirichlet density, we can
        # use the following facts about the Dirichlet in exponential family form:
        #   1. log(x[i]) is a sufficient statistic
        #   2. expected sufficient statistics (of any exp family distribution) are
        #      equal to derivatives of the log normalizer with respect to
        #      corresponding natural parameters: E{T[i](x)} = dA/d(eta[i])
        #
        # To proceed, we can rewrite the Dirichlet density in exponential family
        # form as follows:
        #
        #   Dir(x; a) = exp{eta(a) . T(x) - A(a)}
        #
        # where '.' is the dot product of vectors eta and T, and A is a scalar:
        #
        #   eta[i](a) = a[i] - 1
        #     T[i](x) = log(x[i])
        #        A(a) = log B(a)
        #
        # Now, we can use fact (2) above to write
        #
        #   E_Dir(x; a)[log(x[i])]
        #       = dA(a) / da[i]
        #       = d/da[i] log B(a)
        #       = d/da[i] (sum_j lgamma(a[j])) - lgamma(sum_j a[j])
        #       = digamma(a[i])) - digamma(sum_j a[j])
        #
        # Putting it all together, we have
        #
        # KL[Dir(x; a) || Dir(x; b)]
        #     = E_Dir(x; a){log(Dir(x; a) / Dir(x; b)}
        #     = E_Dir(x; a){sum_i[(a[i] - b[i]) log(x[i])} - (lbeta(a) - lbeta(b))
        #     = sum_i[(a[i] - b[i]) * E_Dir(x; a){log(x[i])}] - lbeta(a) + lbeta(b)
        #     = sum_i[(a[i] - b[i]) * (digamma(a[i]) - digamma(sum_j a[j]))]
        #          - lbeta(a) + lbeta(b))

        digamma_sum_d1 = tf.digamma(
            tf.reduce_sum(d1.concentration, axis=-1, keepdims=True))
        digamma_diff = tf.digamma(d1.concentration) - digamma_sum_d1
        concentration_diff = d1.concentration - d2.concentration

        return (tf.reduce_sum(concentration_diff * digamma_diff, axis=-1) -
                tf.lbeta(d1.concentration) + tf.lbeta(d2.concentration))
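For context (an addition, not part of the original snippet): this appears to be the KL registered for a pair of Dirichlet distributions in the TF 1.x distributions API, so it can be reached through tf.distributions.kl_divergence. A minimal usage sketch, assuming that API:

import tensorflow as tf

d1 = tf.distributions.Dirichlet(concentration=[1.0, 2.0, 3.0])
d2 = tf.distributions.Dirichlet(concentration=[2.0, 2.0, 2.0])
kl = tf.distributions.kl_divergence(d1, d2)  # batchwise KL(d1 || d2); a scalar here
with tf.Session() as sess:
    print(sess.run(kl))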
Example 35
 def test_empty_rank1_input_raises_value_error(self):
     with self.test_session(use_gpu=self._use_gpu):
         with self.assertRaisesRegexp(ValueError, 'rank'):
             tf.lbeta([])
Example 36
def beta_fn(a, b):
    # A useful identity:
    #   B(a,b) = exp(log Gamma(a) + log Gamma(b) - log Gamma(a+b))
    # but here we simply exponentiate tf.lbeta instead, feel free to use whichever version you prefer
    return tf.exp(tf.lbeta(tf.concat([tf.expand_dims(a, -1), tf.expand_dims(b, -1)], -1)))
Example 37
 def test_complicated_shape(self):
   with self.test_session(use_gpu=self._use_gpu):
     x = tf.convert_to_tensor(np.random.rand(3, 2, 2))
     self.assertAllEqual((3, 2), tf.shape(tf.lbeta(x)).eval())
     self.assertEqual(tf.TensorShape([3, 2]), tf.lbeta(x).get_shape())
Example 38
  def _build_model(self):
    """Builds the computational graph for our model."""

    x_embeddings = tf.get_variable(
      name="x_embeddings", initializer=tf.random_uniform_initializer(),
      shape=[self.x_vocabulary_size, self.emb_dim])
    y_embeddings = tf.get_variable(
        name="y_embeddings", initializer=tf.random_uniform_initializer(),
        shape=[self.y_vocabulary_size, self.emb_dim])

    batch_size = tf.shape(self.x)[0]
    longest_x = tf.shape(self.x)[1]  # longest M
    longest_y = tf.shape(self.y)[1]  # longest N

    x_embedded = tf.nn.embedding_lookup(x_embeddings, self.x)
    y_embedded = tf.nn.embedding_lookup(y_embeddings, self.y)
    padding = tf.zeros((batch_size, 1), dtype=tf.int32)
    padding = tf.nn.embedding_lookup(y_embeddings, padding)
    y_prev_embedded = tf.concat([padding, y_embedded[:, :-1, :]], axis=1)

    # ========== Getting a sample s_j for all f_j ============

    # Take a weighted average of the previous french word and the current french word.
    r_ff = (1.0 - self.prev_f_weight) * y_embedded + self.prev_f_weight * y_prev_embedded
    r_ff = tf.reshape(r_ff, [batch_size * longest_y, self.emb_dim])

    # Compute Kumaraswamy param alpha.
    ha = tf.matmul(r_ff, self.phi_W_ha) + self.phi_b_ha
    ha = tf.tanh(ha)
    alpha = tf.exp(tf.matmul(ha, self.phi_W_a) + self.phi_b_a)  # [B * N, 1]

    # Compute Kumaraswamy param beta.
    hb = tf.matmul(r_ff, self.phi_W_hb) + self.phi_b_hb    # affine transformation
    hb = tf.tanh(hb)                                       # non-linearity
    beta = tf.exp(tf.matmul(hb, self.phi_W_b) + self.phi_b_b)      # affine transformation [B * N, 1]

    # Numerical stability
    alpha = tf.clip_by_value(alpha, 0.001, 10)
    beta = tf.clip_by_value(beta, 0.001, 10)

    # Sample some random uniform numbers. Then calculate s using a and b
    # which is then Kumaraswamy distributed.
    u = tf.random_uniform(tf.shape(alpha), minval=0., maxval=1.)
    s = tf.pow((1.0 - tf.pow(u, tf.pow(beta + self.eps, -1))), tf.pow(alpha + self.eps, -1)) # [B * N, 1]

    # ========== Compute a and b for the Beta distribution. ===========
    y_prev_embedded = tf.reshape(y_prev_embedded, [batch_size * longest_y, self.emb_dim])
    ha = tf.matmul(y_prev_embedded, self.th_W_ha) + self.th_b_ha    # affine transformation
    ha = tf.tanh(ha)                                                # non-linearity
    a = tf.exp(tf.matmul(ha, self.th_W_a) + self.th_b_a)            # affine transformation [B * N, 1]

    # Compute Kumaraswamy param beta.
    hb = tf.matmul(y_prev_embedded, self.th_W_hb) + self.th_b_hb               # affine transformation
    hb = tf.tanh(hb)                                                # non-linearity
    b = tf.exp(tf.matmul(hb, self.th_W_b) + self.th_b_b)            # affine transformation [B * N, 1]

    # Numerical stability
    a = tf.clip_by_value(a, 0.001, 10)
    b = tf.clip_by_value(b, 0.001, 10)

    # Change s to the Beta mean if we're not training
    s = tf.cond(self.is_training, lambda: s, lambda: a / (a + b))

    x_mask = tf.cast(tf.sign(self.x), tf.float32)  # Shape: [B, M]
    y_mask = tf.cast(tf.sign(self.y), tf.float32)  # Shape: [B, N]
    x_len = tf.reduce_sum(tf.sign(self.x), axis=1)  # Shape: [B]
    y_len = tf.reduce_sum(tf.sign(self.y), axis=1)  # Shape: [B]

    lengths = tf.expand_dims(x_len, -1)  # Shape: [B, 1]
    pa_x = tf.div(x_mask, tf.cast(lengths, tf.float32))  # Shape: [B, M]
    pa_x = tf.expand_dims(pa_x, 2)  # Shape: [B, M, 1]
    pa_x = tf.expand_dims(pa_x, 3)  # Shape: [B, M, 1, 1]

    s = tf.reshape(s, [batch_size, longest_y, 1])
    s = tf.expand_dims(s, 1)
    s = tf.tile(s, [1, longest_x, 1, self.emb_dim])

    r_ff = tf.reshape(r_ff, [batch_size, longest_y, self.emb_dim])
    r_ff = tf.expand_dims(tf.tanh(r_ff), 1)  # [B, 1, N, emb]
    r_ff = tf.tile(r_ff, [1, longest_x, 1, 1])  # [B, M, N, emb]

    # expand source embeddings
    x_embedded = tf.expand_dims(tf.tanh(x_embedded), 2)  # [B, M, 1, emb]
    x_embedded = tf.tile(x_embedded, [1, 1, longest_y, 1])  # [B, M, N, emb]

    h = s * r_ff + (1. - s) * x_embedded

    h = tf.reshape(h, [batch_size * longest_x * longest_y, self.emb_dim])
    h = tf.matmul(h, self.mlp_W_ref) + self.mlp_b_ref        # affine transformation [B * M, N, Vy]
    h = tf.tanh(h)                                           # non-linearity
    h = tf.matmul(h, self.mlp_W_t) + self.mlp_b_t            # affine transformation [B * M, Vy]

    # Now we perform a softmax which operates on a per-row basis.
    py_xa = tf.nn.softmax(h)
    py_xa = tf.reshape(py_xa, [batch_size, longest_x, longest_y, self.y_vocabulary_size])
    py_x = tf.reduce_sum(tf.multiply(pa_x, py_xa), axis=1)

    # This calculates the accuracy, i.e. how many predictions we got right.
    predictions = tf.argmax(py_x, axis=2)
    acc = tf.equal(predictions, self.y)
    acc = tf.cast(acc, tf.float32) * y_mask
    acc_correct = tf.reduce_sum(acc)
    acc_total = tf.reduce_sum(y_mask)
    acc = acc_correct / acc_total

    # =========== KL Part ==============
    KL = ((alpha - a) / (alpha)) * (-np.euler_gamma - tf.digamma(beta) - (1.0 / beta))
    KL += tf.log(alpha * beta)
    KL += tf.lbeta(tf.concat([tf.expand_dims(a , -1), tf.expand_dims(b, -1)], axis=-1))
    KL -= (beta - 1.) / (beta)

    # Taylor approximation
    taylor_approx = tf.zeros(tf.shape(a))
    for m in range(1, 1 + 10):
        taylor_approx += (1.0 / (m + alpha * beta)) * tf.exp(tf.lbeta(tf.concat([tf.expand_dims(m/alpha, -1), \
                tf.expand_dims(beta, -1)], axis=-1)))
    KL += (b - 1.0) * beta * taylor_approx

    KL = tf.reshape(KL, [batch_size, longest_y])
    KL = tf.reduce_sum(KL * y_mask, axis=1)
    KL = tf.reduce_mean(KL, axis=0)
    KL = tf.maximum(KL, 0)
    self.KL = KL

    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=tf.reshape(self.y, [-1]),
      logits=tf.log(tf.reshape(py_x,[batch_size * longest_y, self.y_vocabulary_size])),
      name="logits"
    )
    cross_entropy = tf.reshape(cross_entropy, [batch_size, longest_y])
    cross_entropy = tf.reduce_sum(cross_entropy * y_mask, axis=1)
    cross_entropy = tf.reduce_mean(cross_entropy, axis=0)

    ELBO = cross_entropy + KL

    self.pa_x = pa_x
    self.py_x = py_x
    self.py_xa = py_xa
    self.loss = ELBO
    self.predictions = predictions
    self.accuracy = acc
    self.accuracy_correct = tf.cast(acc_correct, tf.int64)
    self.accuracy_total = tf.cast(acc_total, tf.int64)
Example 39
 def test_empty_rank1_dynamic_alloc_input_raises_op_error(self):
   with self.test_session(use_gpu=self._use_gpu):
     ph = tf.placeholder(tf.float32)
     with self.assertRaisesOpError('rank'):
       tf.lbeta(ph).eval(feed_dict={ph: []})
Example 40
 def test_empty_rank1_dynamic_alloc_input_raises_op_error(self):
     with self.test_session(use_gpu=self._use_gpu):
         ph = tf.placeholder(tf.float32)
         with self.assertRaisesOpError('rank'):
             tf.lbeta(ph).eval(feed_dict={ph: []})
Example 41
 def test_complicated_shape(self):
     with self.test_session(use_gpu=self._use_gpu):
         x = tf.convert_to_tensor(np.random.rand(3, 2, 2))
         self.assertAllEqual((3, 2), tf.shape(tf.lbeta(x)).eval())
         self.assertEqual(tf.TensorShape([3, 2]), tf.lbeta(x).get_shape())
Example 42
def _kl_dirichlet_dirichlet(d1, d2, name=None):
  """Batchwise KL divergence KL(d1 || d2) with d1 and d2 Dirichlet.

  Args:
    d1: instance of a Dirichlet distribution object.
    d2: instance of a Dirichlet distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_dirichlet_dirichlet".

  Returns:
    Batchwise KL(d1 || d2)
  """
  with tf.name_scope(name, "kl_dirichlet_dirichlet", values=[
      d1.concentration, d2.concentration]):
    # The KL between Dirichlet distributions can be derived as follows. We have
    #
    #   Dir(x; a) = 1 / B(a) * prod_i[x[i]^(a[i] - 1)]
    #
    # where B(a) is the multivariate Beta function:
    #
    #   B(a) = Gamma(a[1]) * ... * Gamma(a[n]) / Gamma(a[1] + ... + a[n])
    #
    # The KL is
    #
    #   KL(Dir(x; a), Dir(x; b)) = E_Dir(x; a){log(Dir(x; a) / Dir(x; b))}
    #
    # so we'll need to know the log density of the Dirichlet. This is
    #
    #   log(Dir(x; a)) = sum_i[(a[i] - 1) log(x[i])] - log B(a)
    #
    # The only term that matters for the expectations is the log(x[i]). To
    # compute the expectation of this term over the Dirichlet density, we can
    # use the following facts about the Dirichlet in exponential family form:
    #   1. log(x[i]) is a sufficient statistic
    #   2. expected sufficient statistics (of any exp family distribution) are
    #      equal to derivatives of the log normalizer with respect to
    #      corresponding natural parameters: E{T[i](x)} = dA/d(eta[i])
    #
    # To proceed, we can rewrite the Dirichlet density in exponential family
    # form as follows:
    #
    #   Dir(x; a) = exp{eta(a) . T(x) - A(a)}
    #
    # where '.' is the dot product of vectors eta and T, and A is a scalar:
    #
    #   eta[i](a) = a[i] - 1
    #     T[i](x) = log(x[i])
    #        A(a) = log B(a)
    #
    # Now, we can use fact (2) above to write
    #
    #   E_Dir(x; a)[log(x[i])]
    #       = dA(a) / da[i]
    #       = d/da[i] log B(a)
    #       = d/da[i] (sum_j lgamma(a[j])) - lgamma(sum_j a[j])
    #       = digamma(a[i])) - digamma(sum_j a[j])
    #
    # Putting it all together, we have
    #
    # KL[Dir(x; a) || Dir(x; b)]
    #     = E_Dir(x; a){log(Dir(x; a) / Dir(x; b)}
    #     = E_Dir(x; a){sum_i[(a[i] - b[i]) log(x[i])} - (lbeta(a) - lbeta(b))
    #     = sum_i[(a[i] - b[i]) * E_Dir(x; a){log(x[i])}] - lbeta(a) + lbeta(b)
    #     = sum_i[(a[i] - b[i]) * (digamma(a[i]) - digamma(sum_j a[j]))]
    #          - lbeta(a) + lbeta(b))

    digamma_sum_d1 = tf.digamma(
        tf.reduce_sum(d1.concentration, axis=-1, keepdims=True))
    digamma_diff = tf.digamma(d1.concentration) - digamma_sum_d1
    concentration_diff = d1.concentration - d2.concentration

    return (tf.reduce_sum(concentration_diff * digamma_diff, axis=-1) -
            tf.lbeta(d1.concentration) +
            tf.lbeta(d2.concentration))
Example 43
 def test_empty_rank2_or_greater_input_gives_empty_output_dynamic_alloc(self):
   with self.test_session(use_gpu=self._use_gpu):
     ph = tf.placeholder(tf.float32)
     self.assertAllEqual([], tf.lbeta(ph).eval(feed_dict={ph: [[]]}))
     self.assertAllEqual([[]], tf.lbeta(ph).eval(feed_dict={ph: [[[]]]}))
Example 44
 def test_empty_rank2_or_greater_input_gives_empty_output(self):
   with self.test_session(use_gpu=self._use_gpu):
     self.assertAllEqual([], tf.lbeta([[]]).eval())
     self.assertEqual((0,), tf.lbeta([[]]).get_shape())
     self.assertAllEqual([[]], tf.lbeta([[[]]]).eval())
     self.assertEqual((1, 0), tf.lbeta([[[]]]).get_shape())
Example 45
 def test_empty_rank1_input_raises_value_error(self):
   with self.test_session(use_gpu=self._use_gpu):
     with self.assertRaisesRegexp(ValueError, 'rank'):
       tf.lbeta([])
Example 46
 def test_lbeta(self):
     t = tf.lbeta(self.random(4, 3))
     self.check(t)
Example 47
 def _log_normalization(self):
   return tf.lbeta(self.concentration)