Example 1
def compute_prob(dist, n_clusters, batch_size, gamma_0, curr_pop_nk):
    """Computes KL loss """
    likelihood = 1.0 / dist
    likelihood = likelihood / tf.reduce_sum(likelihood, axis=1, keepdims=True)

    prior_weights = compute_pi(curr_pop_pi_k=curr_pop_nk, n_clusters=n_clusters, batch_size=batch_size, gamma_0=gamma_0)

    p_mle = likelihood * curr_pop_nk
    p_mle = p_mle / tf.reduce_sum(p_mle, axis=1, keepdims=True)

    alpha_c = likelihood * curr_pop_nk
    alpha_c = tf.contrib.framework.sort(alpha_c, direction='DESCENDING')  # sort in descending order
    alpha_0 = tf.reduce_sum(alpha_c, axis=1, keepdims=True)

    beta_c = likelihood * prior_weights
    beta_c = tf.contrib.framework.sort(beta_c, direction='DESCENDING')  # sort in descending order
    beta_0 = tf.reduce_sum(beta_c, axis=1, keepdims=True)

    digamma_diff = tf.digamma(alpha_c) - tf.digamma(alpha_0)
    geometric_mean = tf.reduce_sum((alpha_c - beta_c) * digamma_diff, axis=1)

    conc_diff = tf.log(tf.lgamma(alpha_0)) - tf.log(tf.lgamma(beta_0))

    mean_diff = tf.reduce_sum(tf.lgamma(beta_c), axis=1) - tf.reduce_sum(tf.lgamma(alpha_c), axis=1)

    kl_loss = tf.reduce_mean(conc_diff + mean_diff + geometric_mean)
    idx = 0

    top_p_mle = tf.contrib.framework.sort(p_mle[idx])[n_clusters - 2:n_clusters]
    top_likelihood = tf.contrib.framework.sort(likelihood[idx])[n_clusters - 2:n_clusters]

    return p_mle, likelihood, kl_loss, top_p_mle, top_likelihood
Example 2
def _kl_beta_beta(d1, d2, name=None):
    """Calculate the batchwise KL divergence KL(d1 || d2) with d1 and d2 Beta.

  Args:
    d1: instance of a Beta distribution object.
    d2: instance of a Beta distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_beta_beta".

  Returns:
    Batchwise KL(d1 || d2)
  """
    def delta(fn, is_property=True):
        fn1 = getattr(d1, fn)
        fn2 = getattr(d2, fn)
        return (fn2 - fn1) if is_property else (fn2() - fn1())

    with tf.name_scope(name,
                       "kl_beta_beta",
                       values=[
                           d1.concentration1,
                           d1.concentration0,
                           d1.total_concentration,
                           d2.concentration1,
                           d2.concentration0,
                           d2.total_concentration,
                       ]):
        return (delta("_log_normalization", is_property=False) -
                tf.digamma(d1.concentration1) * delta("concentration1") -
                tf.digamma(d1.concentration0) * delta("concentration0") +
                (tf.digamma(d1.total_concentration) *
                 delta("total_concentration")))
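This closed form (it appears again in Example 13) can be sanity-checked against a Monte Carlo estimate. A minimal sketch using SciPy, with arbitrary values a1, b1, a2, b2 standing in for the two distributions' concentration1/concentration0 parameters:

import numpy as np
from scipy.stats import beta as beta_dist
from scipy.special import betaln, digamma

a1, b1, a2, b2 = 2.0, 3.0, 4.0, 1.5  # hypothetical concentrations

# Closed form of KL(Beta(a1, b1) || Beta(a2, b2)), matching the expression above
kl_closed = (betaln(a2, b2) - betaln(a1, b1)
             + (a1 - a2) * digamma(a1)
             + (b1 - b2) * digamma(b1)
             + (a2 - a1 + b2 - b1) * digamma(a1 + b1))

# Monte Carlo estimate of E_p[log p(x) - log q(x)]
x = beta_dist(a1, b1).rvs(size=200_000, random_state=0)
kl_mc = np.mean(beta_dist(a1, b1).logpdf(x) - beta_dist(a2, b2).logpdf(x))

print(kl_closed, kl_mc)  # the two values should agree to roughly two decimals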
Example 3
def expected_log_pi(dir_standard_param):
    with tf.name_scope('dirichlet_expectation'):
        return tf.subtract(
            tf.digamma(dir_standard_param),
            tf.digamma(
                tf.reduce_sum(dir_standard_param, axis=-1, keep_dims=True)),
            name='expected_mixing_coeffs')
Example 4
 def _entropy(self):
   return (
       self._log_normalization()
       - (self.concentration1 - 1.) * tf.digamma(self.concentration1)
       - (self.concentration0 - 1.) * tf.digamma(self.concentration0)
       + ((self.total_concentration - 2.) *
          tf.digamma(self.total_concentration)))
Example 5
def simple_graph_edge_update(q_theta, q_beta, q_gam, q_omega, es_ind):
    """
    For an occupied pair (i, j) with index m the update is:
    edge_param[i,j,:] <- E[log(theta[i,:])] + E[log(beta[j,:])] + E[log(gam[i])] + E[log(omega[j])]
    where edge_param is log(lam), the natural parameters of the truncated Poisson distribution.

    Peak memory cost for this operation is approximately edges * K * (1 + 1/num_splits).
    Compute time scales linearly with num_splits.
    """

    # E[log(X)]
    ltheta = tf.digamma(q_theta.concentration) - tf.log(q_theta.rate)
    lbeta = tf.digamma(q_beta.concentration) - tf.log(q_beta.rate)
    lgam = tf.digamma(q_gam.concentration) - tf.log(q_gam.rate)
    lomega = tf.digamma(q_omega.concentration) - tf.log(q_omega.rate)

    user_params = lgam + ltheta
    item_params = lomega + lbeta

    # for an occupied pair (i, j) with index m we have oc_user_params[m] = user_params[i, :]
    oc_user_params = tf.gather(user_params, es_ind[:, 0])
    oc_item_params = tf.gather(item_params, es_ind[:, 1])
    edge_params = oc_user_params + oc_item_params

    return edge_params
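The four `tf.digamma(...) - tf.log(...)` lines rely on the identity E[log X] = psi(a) - log(b) for X ~ Gamma(concentration a, rate b). A small numerical check of that identity with SciPy, using arbitrarily chosen a and b:

import numpy as np
from scipy.stats import gamma
from scipy.special import digamma

a, b = 2.5, 4.0  # shape (concentration) and rate, chosen arbitrarily

analytic = digamma(a) - np.log(b)

# SciPy parameterises the Gamma distribution by shape and scale = 1 / rate
samples = gamma(a, scale=1.0 / b).rvs(size=200_000, random_state=0)
empirical = np.log(samples).mean()

print(analytic, empirical)  # should agree to a couple of decimal places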
Example 6
    def get_loss(self):
        # First term
        dirichlet_expectation = tf.digamma(self.alpha_t) - tf.tile(tf.expand_dims(tf.digamma(self.alpha0), -1), [1, 1, self.n_classes])
        y = tf.one_hot(self.y, self.n_classes)
        self.loss_ent_unc = -self.aggregate(tf.reduce_sum(dirichlet_expectation * y, -1), self.s)

        # Second term
        self.loss_pp = self.aggregate(-tf.log(self.alpha0 + 1e-8), self.s)

        x = tf.tile(self.x, [self.n_samples, 1])
        tx = tf.tile(tf.expand_dims(self.tx, -1), [self.n_samples, 1, 1])
        ty = tf.tile(self.ty, [self.n_samples, 1])
        s = tf.tile(self.s, [self.n_samples])

        t_sample = ty * tf.random_uniform(tf.shape(ty), 0, 1)

        rnn_input = tf.concat([self.mark_embedding(x), tx], -1)
        h = self.rnn(rnn_input, s, reuse=True)

        log_alpha_t_sample, _, _, _ = self.Dirichlet(h, t_sample)
        prior = 0
        alpha_t_sample = tf.exp(log_alpha_t_sample - prior)

        alpha_0_sample = ty * tf.reduce_sum(alpha_t_sample, axis=-1)
        loss_pp = tf.reshape(alpha_0_sample, [self.n_samples] + get_shape(self.x))  # [10, B, S]
        loss_pp = tf.reduce_mean(loss_pp, 0)  # [10, B, S] -> [B, S]
        self.loss_pp += tf.reduce_mean(loss_pp)

        loss = self.loss_ent_unc + self.loss_pp

        return loss
Example 7
def tf_dirichlet_expectation(alpha):
    if len(alpha.get_shape()) == 1:
        return tf.subtract(tf.digamma(tf.add(alpha,
                                             np.finfo(np.float32).eps)),
                           tf.digamma(tf.reduce_sum(alpha)))
    return tf.subtract(tf.digamma(alpha),
                       tf.digamma(tf.reduce_sum(alpha, 1))[:, tf.newaxis])
Example 8
 def _statistic(self, statistic, name):
     if statistic == '_alpha_total':
         return tf.reduce_sum(self._alpha, -1, True, name=name)
     elif statistic == '_alpha_totalm1':
         return tf.reduce_sum(self._alpha - 1.0, -1, True, name=name)
     elif statistic == 1:
         return tf.divide(self._alpha, self.statistic('_alpha_total'), name)
     elif statistic == 2:
         return tf.add(tf.square(self.statistic(1)), self.statistic('var'), name)
     elif statistic == 'var':
         _alpha_total = self.statistic('_alpha_total')
         return tf.divide(self._alpha * (_alpha_total - self._alpha),
                          tf.square(_alpha_total) * (_alpha_total + 1.0), name)
     elif statistic == 'log':
         return tf.subtract(tf.digamma(self._alpha), tf.digamma(self.statistic('_alpha_total')),
                            name)
     elif statistic == '_log_normalization':
         return tf.subtract(tf.reduce_sum(tf.lgamma(self._alpha), -1),
                            tf.lgamma(self.statistic('_alpha_total')[..., 0]), name)
     elif statistic == 'entropy':
         return tf.add(self.statistic('_log_normalization'),
                       self.statistic('_alpha_totalm1')[..., 0] *
                       tf.digamma(self.statistic('_alpha_total')[..., 0]) -
                       tf.reduce_sum((self._alpha - 1.0) * tf.digamma(self._alpha), -1), name)
     else:
         return super(DirichletDistribution, self)._statistic(statistic, name)
Example 9
    def entropy(self, alpha):
        """Entropy of probability distribution.

        Parameters
        ----------
        alpha : tf.Tensor
            An n-D tensor with each :math:`\\alpha` constrained to
            :math:`\\alpha_i > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of one dimension less than the input.
        """
        alpha = tf.cast(alpha, dtype=tf.float32)
        multivariate_idx = len(get_dims(alpha)) - 1
        K = get_dims(alpha)[multivariate_idx]
        if multivariate_idx == 0:
            a = tf.reduce_sum(alpha)
            return tf.lbeta(alpha) + \
                   (a - K) * tf.digamma(a) - \
                   tf.reduce_sum((alpha-1.0) * tf.digamma(alpha))
        else:
            a = tf.reduce_sum(alpha, multivariate_idx)
            return tf.lbeta(alpha) + \
                   (a - K) * tf.digamma(a) - \
                   tf.reduce_sum((alpha-1.0) * tf.digamma(alpha), multivariate_idx)
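The value returned is the standard Dirichlet entropy ln B(alpha) + (alpha_0 - K) psi(alpha_0) - sum_i (alpha_i - 1) psi(alpha_i). A quick cross-check against SciPy, assuming an arbitrary alpha:

import numpy as np
from scipy.stats import dirichlet
from scipy.special import gammaln, digamma

alpha = np.array([1.5, 2.0, 4.0])
a0, K = alpha.sum(), alpha.size

lbeta = gammaln(alpha).sum() - gammaln(a0)  # log of the multivariate Beta function
closed_form = lbeta + (a0 - K) * digamma(a0) - ((alpha - 1.0) * digamma(alpha)).sum()

print(closed_form, dirichlet(alpha).entropy())  # the two values should match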
Example 10
    def _weight_hessian_ab(
        self,
        X,
        loc,
        scale,
    ):
        one_minus_loc = 1 - loc
        loc_times_scale = loc * scale
        one_minus_loc_times_scale = one_minus_loc * scale
        scalar_one = tf.constant(1, shape=(), dtype=self.dtype)

        if isinstance(X, tf.SparseTensor):
            # Use the dense matrix of the location model to provide the correct shapes for the sparse X.
            const1 = tf.sparse_add(
                tf.zeros_like(loc),
                X).__div__(-tf.sparse.add(X, -tf.ones_like(loc)))
            # Adding tf.zeros_like(loc) is a hack to avoid the error thrown by log on a sparse matrix below;
            # to_dense does not work.
        else:
            const1 = tf.log(X / (1 - X))

        const2 = -tf.digamma(loc_times_scale) + tf.digamma(
            one_minus_loc_times_scale) + const1
        const3 = scale * (-tf.polygamma(scalar_one, loc_times_scale) * loc +
                          one_minus_loc *
                          tf.polygamma(scalar_one, one_minus_loc_times_scale))

        const = loc * one_minus_loc_times_scale * (const2 + const3)

        return const
Example 11
    def kl_divergence(self, other):
        assert isinstance(other, Beta)

        return other.log_norm - self.log_norm - tf.digamma(
            self.beta) * (other.beta - self.beta) - tf.digamma(
                self.alpha) * (other.alpha - self.alpha) + tf.digamma(
                    self.sum) * (other.sum - self.sum)
Example 12
def compute_log_pi(alpha_k):
    # Bishop eq 10.66
    with tf.name_scope('compute_log_pi'):
        alpha_hat = tf.reduce_sum(alpha_k)
        return tf.subtract(tf.digamma(alpha_k),
                           tf.digamma(alpha_hat),
                           name='log_pi')
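For reference, Bishop (PRML) eq. 10.66, which this helper implements, reads

\mathbb{E}[\ln \pi_k] \;=\; \psi(\alpha_k) - \psi(\hat{\alpha}), \qquad \hat{\alpha} = \sum_{k=1}^{K} \alpha_k .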
Example 13
def _kl_beta_beta(d1, d2, name=None):
  """Calculate the batchwise KL divergence KL(d1 || d2) with d1 and d2 Beta.

  Args:
    d1: instance of a Beta distribution object.
    d2: instance of a Beta distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_beta_beta".

  Returns:
    Batchwise KL(d1 || d2)
  """
  def delta(fn, is_property=True):
    fn1 = getattr(d1, fn)
    fn2 = getattr(d2, fn)
    return (fn2 - fn1) if is_property else (fn2() - fn1())
  with tf.name_scope(name, "kl_beta_beta", values=[
      d1.concentration1,
      d1.concentration0,
      d1.total_concentration,
      d2.concentration1,
      d2.concentration0,
      d2.total_concentration,
  ]):
    return (delta("_log_normalization", is_property=False)
            - tf.digamma(d1.concentration1) * delta("concentration1")
            - tf.digamma(d1.concentration0) * delta("concentration0")
            + (tf.digamma(d1.total_concentration)
               * delta("total_concentration")))
Example 14
    def entropy(self, alpha):
        """Entropy of probability distribution.

        Parameters
        ----------
        alpha : tf.Tensor
            An n-D tensor with each :math:`\\alpha` constrained to
            :math:`\\alpha_i > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of one dimension less than the input.
        """
        alpha = tf.cast(alpha, dtype=tf.float32)
        multivariate_idx = len(get_dims(alpha)) - 1
        K = get_dims(alpha)[multivariate_idx]
        if multivariate_idx == 0:
            a = tf.reduce_sum(alpha)
            return tf.lbeta(alpha) + \
                   (a - K) * tf.digamma(a) - \
                   tf.reduce_sum((alpha-1.0) * tf.digamma(alpha))
        else:
            a = tf.reduce_sum(alpha, multivariate_idx)
            return tf.lbeta(alpha) + \
                   (a - K) * tf.digamma(a) - \
                   tf.reduce_sum((alpha-1.0) * tf.digamma(alpha), multivariate_idx)
Example 15
 def _entropy(self):
     k = tf.cast(self.event_shape_tensor()[0], self.dtype)
     return (self._log_normalization() +
             ((self.total_concentration - k) *
              tf.digamma(self.total_concentration)) -
             tf.reduce_sum(
                 (self.concentration - 1.) * tf.digamma(self.concentration),
                 axis=-1))
Example 16
def dirichlet_expectation(alpha):
    """
    Dirichlet expectation computation
    \Psi(\alpha_{k}) - \Psi(\sum_{i=1}^{K}(\alpha_{i}))
    """
    return tf.subtract(tf.digamma(tf.add(alpha,
                                         np.finfo(np.float32).eps)),
                       tf.digamma(tf.reduce_sum(alpha)))
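A minimal numerical check of the identity in the docstring, E[log x_k] = psi(alpha_k) - psi(sum_i alpha_i) for x ~ Dirichlet(alpha), using SciPy with an arbitrary alpha:

import numpy as np
from scipy.stats import dirichlet
from scipy.special import digamma

alpha = np.array([1.5, 2.0, 3.5])  # arbitrary positive concentrations

analytic = digamma(alpha) - digamma(alpha.sum())

samples = dirichlet(alpha).rvs(size=200_000, random_state=0)
empirical = np.log(samples).mean(axis=0)

print(analytic)
print(empirical)  # elementwise agreement to roughly two decimals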
Example 17
def KL(alpha, K):
    beta = tf.constant(np.ones((1, K)), dtype=tf.float32)
    S_alpha = tf.reduce_sum(alpha, axis=1, keepdims=True)

    KL = tf.reduce_sum((alpha - beta) * (tf.digamma(alpha) - tf.digamma(S_alpha)), axis=1, keepdims=True) + \
         tf.lgamma(S_alpha) - tf.reduce_sum(tf.lgamma(alpha), axis=1, keepdims=True) + \
         tf.reduce_sum(tf.lgamma(beta), axis=1, keepdims=True) - tf.lgamma(tf.reduce_sum(beta, axis=1, keepdims=True))
    return KL
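This is KL(Dir(alpha) || Dir(1, ..., 1)), which is non-negative and vanishes exactly when alpha is all ones. A hypothetical TF1-style smoke test, assuming the `KL` function above is in scope:

import numpy as np
import tensorflow as tf

K_classes = 4
alpha_uniform = tf.constant(np.ones((1, K_classes)), dtype=tf.float32)
alpha_peaked = tf.constant([[10.0, 1.0, 1.0, 1.0]], dtype=tf.float32)

with tf.Session() as sess:
    print(sess.run(KL(alpha_uniform, K_classes)))  # approximately [[0.]]
    print(sess.run(KL(alpha_peaked, K_classes)))   # a positive value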
Example 18
 def _entropy(self):
     v = tf.ones(self.batch_shape_tensor(), dtype=self.dtype)[...,
                                                              tf.newaxis]
     u = v * self.df[..., tf.newaxis]
     beta_arg = tf.concat([u, v], -1) / 2.
     return (tf.log(tf.abs(self.scale)) + 0.5 * tf.log(self.df) +
             tf.lbeta(beta_arg) + 0.5 * (self.df + 1.) *
             (tf.digamma(0.5 * (self.df + 1.)) - tf.digamma(0.5 * self.df)))
Example 19
 def _entropy(self):
   k = tf.cast(self.event_shape_tensor()[0], self.dtype)
   return (
       self._log_normalization()
       + ((self.total_concentration - k)
          * tf.digamma(self.total_concentration))
       - tf.reduce_sum(
           (self.concentration - 1.) * tf.digamma(self.concentration),
           axis=-1))
Example 20
 def call(self, inputs, **kwargs):
     # Tensorflow needs to be imported here so that the saved model can be loaded again
     import tensorflow as tf
     alpha, beta, alpha_beta = inputs
     log_norm = tf.lgamma(x=alpha) + tf.lgamma(x=beta) - tf.lgamma(
         x=alpha_beta)
     entropy = log_norm - (beta - 1.0) * tf.digamma(x=beta) - (alpha - 1.0) * tf.digamma(x=alpha) \
         + (alpha_beta - 2.0) * tf.digamma(x=alpha_beta)
     return tf.reduce_mean(entropy) * self.entropy_loss_bonus
Example 21
 def to_mu(self, alpha):
     """
     :param alpha: (Tensor)
     :return: (Tensor) mu
     """
     d1 = self.latent_dim - 1
     digamma_d = T.digamma(alpha[:, -1:])
     mu = T.digamma(alpha[:, :d1]) - digamma_d
     return mu
Example 22
 def get_layer_KL(number):
     a = self.layers[number].a
     b = self.layers[number].b
     term_1 = tf.divide(-b + 1, b + 1e-20)
     term_2 = tf.log(
         tf.divide(tf.multiply(a, b), alpha + 1e-20) + 1e-20)
     term_bracket = (tf.digamma(1.) - tf.digamma(b) -
                     tf.divide(1., b + 1e-20))
     term_3 = tf.multiply(tf.divide(a - alpha, a + 1e-20), term_bracket)
     return tf.reduce_sum(term_1 + term_2 + term_3)
Example 23
 def build_annot_KL(self):
     alpha_diff = self.alpha_tilde - self.alpha
     KL_annot = (tf.reduce_sum(
         tf.multiply(alpha_diff, tf.digamma(self.alpha_tilde))) -
                 tf.reduce_sum(
                     tf.digamma(tf.reduce_sum(self.alpha_tilde, 1)) *
                     tf.reduce_sum(alpha_diff, 1)) + tf.reduce_sum(
                         tf.lbeta(tf.matrix_transpose(self.alpha)) -
                         tf.lbeta(tf.matrix_transpose(self.alpha_tilde))))
     return KL_annot
Example 24
        def vae_loss(x, x_star):
            reconstruction_error = (self.original_dim *
                                    metrics.binary_crossentropy(x, x_star))

            a0 = K.sum(alpha, axis=-1, keepdims=True)
            kl = (T.lgamma(a0) - K.sum(T.lgamma(alpha), axis=-1) +
                  K.sum(alpha * T.digamma(alpha), axis=-1) -
                  K.sum(alpha * T.digamma(a0), axis=-1) -
                  K.mean(T.digamma(alpha) - T.digamma(a0), axis=-1))
            return reconstruction_error + kl
Example 25
def weight_loss(neighb_count: np.array, labels: np.array,
                weights: list) -> float:
    """Calculates loss for given neighbors and weights.

    Parameters
    ----------
    neighb_count : numpy array
        describes for each point the number of neighbors with each label.
        Shape: (number of data points, number of labels)

    labels : numpy array
        label for each point. Shape: (number of data points, )

    weights : list
        weight of each label. Length: number of labels

    Returns
    -------
    float
        calculated loss
    """
    # reset graph before each run
    tf.reset_default_graph()

    num_data, num_labels = neighb_count.shape
    label_counts = np.zeros([num_labels])
    for label, count in Counter(labels).most_common():
        label_counts[label] = count

    # neighbors matrix
    neigh_matx = tf.constant(neighb_count, dtype=tf.float32)

    # label count vector
    label_cnts = tf.constant(label_counts, dtype=tf.float32)

    # weights
    w = tf.constant(weights, dtype=tf.float32)

    # weight lookup list
    w_list = tf.reduce_sum(tf.one_hot(labels, num_labels) * w, axis=1)

    # label counts lookup list
    label_cnts_list = tf.reduce_sum(tf.one_hot(labels, num_labels) *
                                    label_cnts,
                                    axis=1)
    nx = w * num_data

    ny = label_cnts_list / w_list * \
         tf.reduce_sum(neigh_matx * (w/label_cnts), axis=1)

    loss = (tf.reduce_sum(tf.digamma(nx) * w) \
          + tf.reduce_sum(tf.digamma(ny) * w_list / label_cnts_list))

    with tf.Session() as sess:
        return sess.run(loss)
Example 26
def loss_eq4(p, alpha, K, global_step, annealing_step):
    loglikelihood = tf.reduce_mean(
        tf.reduce_sum(
            p * (tf.digamma(tf.reduce_sum(alpha, axis=1, keepdims=True)) -
                 tf.digamma(alpha)),
            1,
            keepdims=True))
    KL_reg = tf.minimum(1.0, tf.cast(global_step / annealing_step,
                                     tf.float32)) * KL(
                                         (alpha - 1) * (1 - p) + 1, K)
    return loglikelihood + KL_reg
Example 27
def elbo_mf(a1, be1, a2, be2, a, b, c, d, x, crt, N):
    Elogr = tf.digamma(a1) - tf.log(be1)
    Elogp = tf.digamma(a2) - tf.digamma(a2 + be2)
    Elog1_p = tf.digamma(be2) - tf.digamma(a2 + be2)
    log_B = tf.lgamma(a2) + tf.lgamma(be2) - tf.lgamma(a2 + be2)
    term1 = a1 * tf.log(be1) - tf.lgamma(a1) - log_B
    term2 = (a2 - c - tf.reduce_sum(x)) * Elogp + (a1 - a -
                                                   tf.reduce_sum(crt)) * Elogr
    term3 = -(be1 - b) * (a1 / be1) + (be2 - d) * Elog1_p - N * (a1 /
                                                                 be1) * Elog1_p
    return term1 + term2 + term3
Example 28
 def _entropy(self):
   v = tf.ones(self.batch_shape_tensor(),
               dtype=self.dtype)[..., tf.newaxis]
   u = v * self.df[..., tf.newaxis]
   beta_arg = tf.concat([u, v], -1) / 2.
   return (tf.log(tf.abs(self.scale)) +
           0.5 * tf.log(self.df) +
           tf.lbeta(beta_arg) +
           0.5 * (self.df + 1.) *
           (tf.digamma(0.5 * (self.df + 1.)) -
            tf.digamma(0.5 * self.df)))
Example 29
def KL(alpha,outputSize):
    beta=tf.constant(np.ones((1,outputSize)),dtype=tf.float32)
    S_alpha = tf.reduce_sum(alpha,axis=1,keep_dims=True)
    S_beta = tf.reduce_sum(beta,axis=1,keep_dims=True)
    lnB = tf.lgamma(S_alpha) - tf.reduce_sum(tf.lgamma(alpha),axis=1,keep_dims=True)
    lnB_uni = tf.reduce_sum(tf.lgamma(beta),axis=1,keep_dims=True) - tf.lgamma(S_beta)
    
    dg0 = tf.digamma(S_alpha)
    dg1 = tf.digamma(alpha)
    
    kl = tf.reduce_sum((alpha - beta)*(dg1-dg0),axis=1,keep_dims=True) + lnB + lnB_uni
    return kl
Example 30
def loss_EDL(p, alpha, global_step, annealing_step, outputSize):
    S = tf.reduce_sum(alpha, axis=1, keep_dims=True) 
    E = alpha - 1

    A = tf.reduce_mean(tf.reduce_sum(p * (tf.digamma(S) - tf.digamma(alpha)),1, keepdims=True))

    annealing_coef = tf.minimum(1.00,tf.cast(global_step/annealing_step,tf.float32))

    alp = E*(1-p) + 1 
    B =  annealing_coef * KL(alp,outputSize)

    return (A + B)
Example 31
def entropy(alpha, beta):
    """
    This function calculates the entropy of the beta distribution parameterised by the shape parameters alpha and beta.
    :param alpha: The shape parameter alpha, which must be a positive scalar.
    :param beta: The shape parameter beta, which must be a positive scalar.
    :return: The entropy value of the beta distribution parameterised by alpha and beta.
    """

    total_concentration = alpha + beta

    return tf.lgamma(alpha) + tf.lgamma(beta) - tf.lgamma(total_concentration) - (alpha - 1.0) * tf.digamma(alpha) - \
           (beta - 1.0) * tf.digamma(beta) + (total_concentration - 2.0) * tf.digamma(total_concentration)
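A quick cross-check of this closed form against SciPy, with arbitrary shape parameters:

from scipy.stats import beta as beta_dist
from scipy.special import gammaln, digamma

a, b = 2.0, 5.0  # arbitrary positive shape parameters

closed_form = (gammaln(a) + gammaln(b) - gammaln(a + b)
               - (a - 1.0) * digamma(a)
               - (b - 1.0) * digamma(b)
               + (a + b - 2.0) * digamma(a + b))

print(closed_form, beta_dist(a, b).entropy())  # the two values should match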
Example 32
 def body(gtm1, gt, t, phi_dtm1):
     exp_E_log_theta_d = tf.exp(
         tf.digamma(gt) - tf.digamma(tf.reduce_sum(gt)))
     phi_dt = tf.ones(phi_d_shape) * exp_E_log_theta_d
     phi_dt *= exp_E_log_beta_d
     phinorm = tf.matmul(
         exp_E_log_beta_d,
         tf.expand_dims(exp_E_log_theta_d, axis=-1)) + 1e-6
     phi_dt /= phinorm
     gtp1 = a + tf.reduce_sum(phi_dt, axis=0)
     gtp1.set_shape([self.K])
     phi_dt.set_shape([None, self.K])
     return gt, gtp1, t + 1, phi_dt
Example 33
    def _compute_elbo(self, gammas, phi, data):
        self._log_lik = 0.0
        kl_thetas = []
        kl_zs = []
        for i, (p, d) in enumerate(zip(phi, data)):
            g = gammas[i]

            # Data log-likelihood:
            lambdas = tf.nn.softmax(self.lambdas)
            lambdas = tf.clip_by_value(lambdas, 1e-2, 1-1e-2)
            word_proportions = tf.gather(
                tf.transpose(lambdas, [1, 0]), d)
            word_proportions = tf.expand_dims(word_proportions, -1)
            p = tf.expand_dims(p, 1)
            log_lik = tf.matmul(p, tf.log(word_proportions))[:, 0, 0]
            log_lik = tf.reduce_sum(log_lik, axis=0)
            self._log_lik += log_lik

            # KL[q(z|phi) || p(z|theta)]
            E_log_theta = tf.digamma(g) - tf.digamma(tf.reduce_sum(g))
            p = tf.clip_by_value(p, 1e-3, 1 - 1e-3)
            kl_z = tf.reduce_sum((tf.log(p) - E_log_theta) * p)
            kl_zs.append(kl_z)

            # KL[q(theta|gamma) || q(theta|alpha)]
            a = self.alpha[i]
            kl_theta_d = tf.lgamma(tf.reduce_sum(g))
            kl_theta_d -= tf.reduce_sum(tf.lgamma(g))
            kl_theta_d -= tf.lgamma(tf.reduce_sum(a))
            kl_theta_d += tf.reduce_sum(tf.lgamma(a))
            kl_theta_d += tf.reduce_sum((g - a) * E_log_theta)
            kl_thetas.append(kl_theta_d)

        # KL[q(beta|lambda) || p(beta|eta)]
        E_log_beta = tf.digamma(self.lambdas)
        E_log_beta -= tf.digamma(tf.reduce_sum(
            self.lambdas, axis=1, keep_dims=True))
        kl_beta = tf.lgamma(tf.reduce_sum(self.lambdas, axis=1))
        kl_beta -= tf.reduce_sum(tf.lgamma(self.lambdas), axis=1)
        kl_beta -= tf.lgamma(tf.reduce_sum(self.eta, axis=1))
        kl_beta += tf.reduce_sum(tf.lgamma(self.eta), axis=1)
        kl_beta += tf.reduce_sum((self.lambdas-self.eta)*E_log_beta, axis=1)
        kl_beta = tf.reduce_sum(kl_beta, axis=0)

        self._kl_terms = OrderedDict(
            kl_z=tf.reduce_sum(kl_zs, axis=0),
            kl_beta=kl_beta,
            kl_theta=tf.reduce_sum(kl_thetas, axis=0),
        )
        kl_list = list(six.itervalues(self._kl_terms))
        self._elbo = self._log_lik - tf.reduce_sum(kl_list)
Example 34
def beta_kl_divergence(sample, prior_alpha, prior_beta):
    mu = tf.math.reduce_mean(sample)
    var = tf.reduce_mean(tf.squared_difference(sample, mu)) + EPSILON
    observed_alpha = ((1. - mu) / var - (1. / (mu + EPSILON))) * tf.square(mu)
    observed_beta = observed_alpha * (1. / (mu + EPSILON) - 1)
    return (tf.lgamma(prior_alpha + prior_beta) -
            (tf.lgamma(prior_alpha) + tf.lgamma(prior_beta)) -
            (tf.lgamma(observed_alpha + observed_beta + EPSILON)) +
            (tf.lgamma(observed_alpha + EPSILON) +
             tf.lgamma(observed_beta + EPSILON)) +
            (prior_alpha - observed_alpha) *
            (tf.digamma(prior_alpha) - tf.digamma(prior_alpha + prior_beta)) +
            (prior_beta - observed_beta) *
            (tf.digamma(prior_beta) - tf.digamma(prior_alpha + prior_beta)))
Example 35
def masked_cross_entropy_dirichlet(preds, labels, mask):
    """Softmax cross-entropy loss with masking."""
    preds = preds + tf.constant(1.0)
    S = tf.reduce_sum(preds, axis=1)
    S = tf.reshape(S, [-1, 1])
    # prob = tf.div(preds, S)
    s_digmma = tf.digamma(S)
    loss = labels * (s_digmma - tf.digamma(preds))
    loss = tf.reduce_sum(loss, axis=1)
    # loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    loss *= mask
    return tf.reduce_mean(loss)
Example 36
def _harmonic_number(x):
  """Compute the harmonic number from its analytic continuation.

  Derivation from [here](
  https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers)
  and [Euler's constant](
  https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant).

  Args:
    x: input float.

  Returns:
    z: The analytic continuation of the harmonic number for the input.
  """
  one = tf.ones([], dtype=x.dtype)
  return tf.digamma(x + one) - tf.digamma(one)
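At positive integers the analytic continuation reduces to the ordinary harmonic number, H_n = 1 + 1/2 + ... + 1/n = psi(n + 1) - psi(1). A small check with SciPy:

from scipy.special import digamma

def harmonic_number(x):
    return digamma(x + 1.0) - digamma(1.0)

n = 6
print(harmonic_number(float(n)))               # 2.45
print(sum(1.0 / k for k in range(1, n + 1)))   # 1 + 1/2 + ... + 1/6 = 2.45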
Example 37
    def entropy(self, a, b):
        """Entropy of probability distribution.

        Parameters
        ----------
        a : tf.Tensor
            An n-D tensor with all elements constrained to :math:`a >
            0`.
        b : tf.Tensor
            An n-D tensor with all elements constrained to :math:`b >
            0`.

        Returns
        -------
        tf.Tensor
            A tensor of same shape as input.
        """
        a = tf.cast(tf.squeeze(a), dtype=tf.float32)
        b = tf.cast(tf.squeeze(b), dtype=tf.float32)
        if len(a.get_shape()) == 0:
            return tf.lbeta(tf.pack([a, b])) - \
                   (a - 1.0) * tf.digamma(a) - \
                   (b - 1.0) * tf.digamma(b) + \
                   (a + b - 2.0) * tf.digamma(a+b)
        else:
            return tf.lbeta(tf.concat(1,
                         [tf.expand_dims(a, 1), tf.expand_dims(b, 1)])) - \
                   (a - 1.0) * tf.digamma(a) - \
                   (b - 1.0) * tf.digamma(b) + \
                   (a + b - 2.0) * tf.digamma(a+b)
Example 38
  def _chain_gets_correct_expectations(self, x, independent_chain_ndims):
    counter = collections.Counter()
    def log_gamma_log_prob(x):
      counter['target_calls'] += 1
      event_dims = tf.range(independent_chain_ndims, tf.rank(x))
      return self._log_gamma_log_prob(x, event_dims)

    samples, kernel_results = tfp.mcmc.sample_chain(
        num_results=150,
        current_state=x,
        kernel=tfp.mcmc.HamiltonianMonteCarlo(
            target_log_prob_fn=log_gamma_log_prob,
            step_size=0.05,
            num_leapfrog_steps=2,
            seed=_set_seed(42)),
        num_burnin_steps=150,
        parallel_iterations=1)

    if tf.executing_eagerly():
      # TODO(b/79991421): Figure out why this is approx twice as many as it
      # should be. I.e., `expected_calls = (150 + 150) * 2 + 1`.
      expected_calls = 1202
    else:
      expected_calls = 2
    self.assertAllEqual(dict(target_calls=expected_calls), counter)

    expected_x = (tf.digamma(self._shape_param)
                  - np.log(self._rate_param))

    expected_exp_x = self._shape_param / self._rate_param

    log_accept_ratio_, samples_, expected_x_ = self.evaluate(
        [kernel_results.log_accept_ratio, samples, expected_x])

    actual_x = samples_.mean()
    actual_exp_x = np.exp(samples_).mean()
    acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))

    tf.logging.vlog(1, 'True      E[x, exp(x)]: {}\t{}'.format(
        expected_x_, expected_exp_x))
    tf.logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format(
        actual_x, actual_exp_x))
    self.assertNear(actual_x, expected_x_, 2e-2)
    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
    self.assertAllEqual(np.ones_like(acceptance_probs, np.bool),
                        acceptance_probs > 0.5)
    self.assertAllEqual(np.ones_like(acceptance_probs, np.bool),
                        acceptance_probs <= 1.)
Example 39
    def entropy(self, a, scale=1):
        """Entropy of probability distribution.

        Parameters
        ----------
        a : tf.Tensor
            **Shape** parameter. An n-D tensor with all elements
            constrained to :math:`a > 0`.
        scale : tf.Tensor
            **Scale** parameter. An n-D tensor with all elements
            constrained to :math:`scale > 0`.

        Returns
        -------
        tf.Tensor
            A tensor of same shape as input.
        """
        a = tf.cast(a, dtype=tf.float32)
        scale = tf.cast(scale, dtype=tf.float32)
        return a + tf.log(scale*tf.exp(tf.lgamma(a))) - \
               (1.0 + a) * tf.digamma(a)
Example 40
def _kl_dirichlet_dirichlet(d1, d2, name=None):
  """Batchwise KL divergence KL(d1 || d2) with d1 and d2 Dirichlet.

  Args:
    d1: instance of a Dirichlet distribution object.
    d2: instance of a Dirichlet distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_dirichlet_dirichlet".

  Returns:
    Batchwise KL(d1 || d2)
  """
  with tf.name_scope(name, "kl_dirichlet_dirichlet", values=[
      d1.concentration, d2.concentration]):
    # The KL between Dirichlet distributions can be derived as follows. We have
    #
    #   Dir(x; a) = 1 / B(a) * prod_i[x[i]^(a[i] - 1)]
    #
    # where B(a) is the multivariate Beta function:
    #
    #   B(a) = Gamma(a[1]) * ... * Gamma(a[n]) / Gamma(a[1] + ... + a[n])
    #
    # The KL is
    #
    #   KL(Dir(x; a), Dir(x; b)) = E_Dir(x; a){log(Dir(x; a) / Dir(x; b))}
    #
    # so we'll need to know the log density of the Dirichlet. This is
    #
    #   log(Dir(x; a)) = sum_i[(a[i] - 1) log(x[i])] - log B(a)
    #
    # The only term that matters for the expectations is the log(x[i]). To
    # compute the expectation of this term over the Dirichlet density, we can
    # use the following facts about the Dirichlet in exponential family form:
    #   1. log(x[i]) is a sufficient statistic
    #   2. expected sufficient statistics (of any exp family distribution) are
    #      equal to derivatives of the log normalizer with respect to
    #      corresponding natural parameters: E{T[i](x)} = dA/d(eta[i])
    #
    # To proceed, we can rewrite the Dirichlet density in exponential family
    # form as follows:
    #
    #   Dir(x; a) = exp{eta(a) . T(x) - A(a)}
    #
    # where '.' is the dot product of vectors eta and T, and A is a scalar:
    #
    #   eta[i](a) = a[i] - 1
    #     T[i](x) = log(x[i])
    #        A(a) = log B(a)
    #
    # Now, we can use fact (2) above to write
    #
    #   E_Dir(x; a)[log(x[i])]
    #       = dA(a) / da[i]
    #       = d/da[i] log B(a)
    #       = d/da[i] (sum_j lgamma(a[j])) - lgamma(sum_j a[j])
    #       = digamma(a[i])) - digamma(sum_j a[j])
    #
    # Putting it all together, we have
    #
    # KL[Dir(x; a) || Dir(x; b)]
    #     = E_Dir(x; a){log(Dir(x; a) / Dir(x; b)}
    #     = E_Dir(x; a){sum_i[(a[i] - b[i]) log(x[i])} - (lbeta(a) - lbeta(b))
    #     = sum_i[(a[i] - b[i]) * E_Dir(x; a){log(x[i])}] - lbeta(a) + lbeta(b)
    #     = sum_i[(a[i] - b[i]) * (digamma(a[i]) - digamma(sum_j a[j]))]
    #          - lbeta(a) + lbeta(b))

    digamma_sum_d1 = tf.digamma(
        tf.reduce_sum(d1.concentration, axis=-1, keepdims=True))
    digamma_diff = tf.digamma(d1.concentration) - digamma_sum_d1
    concentration_diff = d1.concentration - d2.concentration

    return (tf.reduce_sum(concentration_diff * digamma_diff, axis=-1) -
            tf.lbeta(d1.concentration) +
            tf.lbeta(d2.concentration))
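The closed form derived in the comments can be verified against a Monte Carlo estimate. A minimal sketch with SciPy, using arbitrarily chosen concentrations a and b:

import numpy as np
from scipy.stats import dirichlet
from scipy.special import digamma, gammaln

a = np.array([2.0, 3.0, 4.0])
b = np.array([1.0, 1.5, 0.5])

def lbeta(v):
    # log of the multivariate Beta function B(v)
    return gammaln(v).sum() - gammaln(v.sum())

# Closed form: sum_i (a_i - b_i) * (digamma(a_i) - digamma(a_0)) - lbeta(a) + lbeta(b)
kl_closed = (a - b) @ (digamma(a) - digamma(a.sum())) - lbeta(a) + lbeta(b)

# Monte Carlo estimate of E_{Dir(a)}[log Dir(x; a) - log Dir(x; b)]
x = dirichlet(a).rvs(size=200_000, random_state=0)
kl_mc = np.mean(dirichlet(a).logpdf(x.T) - dirichlet(b).logpdf(x.T))

print(kl_closed, kl_mc)  # should agree to roughly two decimals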
Example 41
 def _multi_digamma(self, a, p, name="multi_digamma"):
   """Computes the multivariate digamma function; Psi_p(a)."""
   with self._name_scope(name, values=[a, p]):
     seq = self._multi_gamma_sequence(a, p)
     return tf.reduce_sum(tf.digamma(seq), axis=[-1])
Example 42
 def _entropy(self):
   return (self.concentration + tf.log(self.rate) + tf.lgamma(
       self.concentration) - (
           (1. + self.concentration) * tf.digamma(self.concentration)))
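This matches the differential entropy of an inverse-gamma distribution with shape `concentration` and scale `rate`: alpha + ln(beta) + ln Gamma(alpha) - (1 + alpha) psi(alpha). A quick check against SciPy with arbitrary parameters:

import numpy as np
from scipy.stats import invgamma
from scipy.special import gammaln, digamma

conc, rate = 3.0, 2.0  # arbitrary positive parameters

closed_form = conc + np.log(rate) + gammaln(conc) - (1.0 + conc) * digamma(conc)

# SciPy's inverse-gamma takes shape `a` and `scale`
print(closed_form, invgamma(conc, scale=rate).entropy())  # the two values should match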