def compute_prob(dist, n_clusters, batch_size, gamma_0, curr_pop_nk):
    """Computes cluster responsibilities and the Dirichlet KL loss."""
    likelihood = 1.0 / dist
    likelihood = likelihood / tf.reduce_sum(likelihood, axis=1, keepdims=True)
    prior_weights = compute_pi(curr_pop_pi_k=curr_pop_nk, n_clusters=n_clusters,
                               batch_size=batch_size, gamma_0=gamma_0)
    p_mle = likelihood * curr_pop_nk
    p_mle = p_mle / tf.reduce_sum(p_mle, axis=1, keepdims=True)

    alpha_c = likelihood * curr_pop_nk
    alpha_c = tf.contrib.framework.sort(alpha_c, direction='DESCENDING')  # sort in descending order
    alpha_0 = tf.reduce_sum(alpha_c, axis=1, keepdims=True)
    beta_c = likelihood * prior_weights
    beta_c = tf.contrib.framework.sort(beta_c, direction='DESCENDING')  # sort in descending order
    beta_0 = tf.reduce_sum(beta_c, axis=1, keepdims=True)

    # KL(Dir(alpha) || Dir(beta)) term by term.
    digamma_diff = tf.digamma(alpha_c) - tf.digamma(alpha_0)
    geometric_mean = tf.reduce_sum((alpha_c - beta_c) * digamma_diff, axis=1)
    # lgamma is already log Gamma, so no extra tf.log is applied; squeeze keeps
    # every term at per-example shape [batch] and avoids unintended broadcasting.
    conc_diff = tf.squeeze(tf.lgamma(alpha_0) - tf.lgamma(beta_0), axis=1)
    mean_diff = tf.reduce_sum(tf.lgamma(beta_c), axis=1) - tf.reduce_sum(tf.lgamma(alpha_c), axis=1)
    kl_loss = tf.reduce_mean(conc_diff + mean_diff + geometric_mean)

    idx = 0
    return (p_mle, likelihood, kl_loss,
            tf.contrib.framework.sort(p_mle[idx])[n_clusters - 2:n_clusters],
            tf.contrib.framework.sort(likelihood[idx])[n_clusters - 2:n_clusters])

def _kl_beta_beta(d1, d2, name=None):
    """Calculate the batchwise KL divergence KL(d1 || d2) with d1 and d2 Beta.

    Args:
      d1: instance of a Beta distribution object.
      d2: instance of a Beta distribution object.
      name: (optional) Name to use for created operations.
        Default is "kl_beta_beta".

    Returns:
      Batchwise KL(d1 || d2)
    """
    def delta(fn, is_property=True):
        fn1 = getattr(d1, fn)
        fn2 = getattr(d2, fn)
        return (fn2 - fn1) if is_property else (fn2() - fn1())

    with tf.name_scope(name, "kl_beta_beta", values=[
        d1.concentration1, d1.concentration0, d1.total_concentration,
        d2.concentration1, d2.concentration0, d2.total_concentration,
    ]):
        return (delta("_log_normalization", is_property=False)
                - tf.digamma(d1.concentration1) * delta("concentration1")
                - tf.digamma(d1.concentration0) * delta("concentration0")
                + (tf.digamma(d1.total_concentration)
                   * delta("total_concentration")))

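# Minimal numpy/scipy sketch (separate from the snippet above) that checks the
# same closed-form Beta KL against a Monte Carlo estimate; the parameter values
# a1, b1, a2, b2 are arbitrary illustrative choices.
import numpy as np
from scipy.special import betaln, digamma
from scipy.stats import beta as beta_dist

a1, b1, a2, b2 = 2.0, 3.0, 4.0, 1.5
closed_form = (betaln(a2, b2) - betaln(a1, b1)
               - digamma(a1) * (a2 - a1)
               - digamma(b1) * (b2 - b1)
               + digamma(a1 + b1) * ((a2 + b2) - (a1 + b1)))

samples = beta_dist.rvs(a1, b1, size=200_000, random_state=0)
monte_carlo = np.mean(beta_dist.logpdf(samples, a1, b1)
                      - beta_dist.logpdf(samples, a2, b2))
# closed_form and monte_carlo agree to roughly two decimal places.
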
def expected_log_pi(dir_standard_param):
    with tf.name_scope('dirichlet_expectation'):
        return tf.subtract(
            tf.digamma(dir_standard_param),
            tf.digamma(tf.reduce_sum(dir_standard_param, axis=-1, keep_dims=True)),
            name='expected_mixing_coeffs')

def _entropy(self):
    return (
        self._log_normalization()
        - (self.concentration1 - 1.) * tf.digamma(self.concentration1)
        - (self.concentration0 - 1.) * tf.digamma(self.concentration0)
        + ((self.total_concentration - 2.) *
           tf.digamma(self.total_concentration)))

def simple_graph_edge_update(q_theta, q_beta, q_gam, q_omega, es_ind):
    """
    For occupied pair (i, j) with index m the update is:

        edge_param[i, j, :] <- E[log(theta[i, :])] + E[log(beta[j, :])]
                               + E[log(gam[i])] + E[log(omega[i])]

    where edge_param is log(lam), the natural parameters of the truncated
    Poisson distribution.

    Peak memory cost for this operation is approximately
    edges * K * (1 + 1/num_splits).
    Compute time scales linearly with num_splits.
    """
    # E[log(X)] for a Gamma(concentration, rate) variable is
    # digamma(concentration) - log(rate).
    ltheta = tf.digamma(q_theta.concentration) - tf.log(q_theta.rate)
    lbeta = tf.digamma(q_beta.concentration) - tf.log(q_beta.rate)
    lgam = tf.digamma(q_gam.concentration) - tf.log(q_gam.rate)
    lomega = tf.digamma(q_omega.concentration) - tf.log(q_omega.rate)

    user_params = lgam + ltheta
    item_params = lomega + lbeta

    # For occupied pair (i, j) with index m we have oc_theta[m] = ltheta[i, :].
    oc_user_params = tf.gather(user_params, es_ind[:, 0])
    oc_item_params = tf.gather(item_params, es_ind[:, 1])

    edge_params = oc_user_params + oc_item_params
    return edge_params

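# Quick sanity check (separate from the snippet above) of the Gamma identity
# E[log X] = digamma(concentration) - log(rate) used for ltheta/lbeta/lgam/lomega.
# The numbers below are arbitrary, and scipy's Gamma uses scale = 1 / rate.
import numpy as np
from scipy.special import digamma
from scipy.stats import gamma

conc, rate = 3.0, 2.0
closed_form = digamma(conc) - np.log(rate)
samples = gamma.rvs(conc, scale=1.0 / rate, size=200_000, random_state=0)
# np.log(samples).mean() matches closed_form to about two decimal places.
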
def get_loss(self):
    # First term
    dirichlet_expectation = tf.digamma(self.alpha_t) - tf.tile(
        tf.expand_dims(tf.digamma(self.alpha0), -1), [1, 1, self.n_classes])
    y = tf.one_hot(self.y, self.n_classes)
    self.loss_ent_unc = -self.aggregate(
        tf.reduce_sum(dirichlet_expectation * y, -1), self.s)

    # Second term
    self.loss_pp = self.aggregate(-tf.log(self.alpha0 + 1e-8), self.s)

    x = tf.tile(self.x, [self.n_samples, 1])
    tx = tf.tile(tf.expand_dims(self.tx, -1), [self.n_samples, 1, 1])
    ty = tf.tile(self.ty, [self.n_samples, 1])
    s = tf.tile(self.s, [self.n_samples])
    t_sample = ty * tf.random_uniform(tf.shape(ty), 0, 1)

    rnn_input = tf.concat([self.mark_embedding(x), tx], -1)
    h = self.rnn(rnn_input, s, reuse=True)
    log_alpha_t_sample, _, _, _ = self.Dirichlet(h, t_sample)
    prior = 0
    alpha_t_sample = tf.exp(log_alpha_t_sample - prior)
    alpha_0_sample = ty * tf.reduce_sum(alpha_t_sample, axis=-1)

    loss_pp = tf.reshape(alpha_0_sample,
                         [self.n_samples] + get_shape(self.x))  # [10, B, S]
    loss_pp = tf.reduce_mean(loss_pp, 0)  # [10, B, S] -> [B, S]
    self.loss_pp += tf.reduce_mean(loss_pp)

    loss = self.loss_ent_unc + self.loss_pp
    return loss

def tf_dirichlet_expectation(alpha):
    if len(alpha.get_shape()) == 1:
        return tf.subtract(
            tf.digamma(tf.add(alpha, np.finfo(np.float32).eps)),
            tf.digamma(tf.reduce_sum(alpha)))
    return tf.subtract(
        tf.digamma(alpha),
        tf.digamma(tf.reduce_sum(alpha, 1))[:, tf.newaxis])

def _statistic(self, statistic, name):
    if statistic == '_alpha_total':
        return tf.reduce_sum(self._alpha, -1, True, name=name)
    elif statistic == '_alpha_totalm1':
        return tf.reduce_sum(self._alpha - 1.0, -1, True, name=name)
    elif statistic == 1:
        return tf.divide(self._alpha, self.statistic('_alpha_total'), name)
    elif statistic == 2:
        return tf.add(tf.square(self.statistic(1)), self.statistic('var'), name)
    elif statistic == 'var':
        _alpha_total = self.statistic('_alpha_total')
        return tf.divide(self._alpha * (_alpha_total - self._alpha),
                         tf.square(_alpha_total) * (_alpha_total + 1.0), name)
    elif statistic == 'log':
        return tf.subtract(tf.digamma(self._alpha),
                           tf.digamma(self.statistic('_alpha_total')), name)
    elif statistic == '_log_normalization':
        return tf.subtract(tf.reduce_sum(tf.lgamma(self._alpha), -1),
                           tf.lgamma(self.statistic('_alpha_total')[..., 0]),
                           name)
    elif statistic == 'entropy':
        return tf.add(
            self.statistic('_log_normalization'),
            self.statistic('_alpha_totalm1')[..., 0] *
            tf.digamma(self.statistic('_alpha_total')[..., 0]) -
            tf.reduce_sum((self._alpha - 1.0) * tf.digamma(self._alpha), -1),
            name)
    else:
        return super(DirichletDistribution, self)._statistic(statistic, name)

def entropy(self, alpha):
    """Entropy of probability distribution.

    Parameters
    ----------
    alpha : tf.Tensor
        A n-D tensor with each :math:`\\alpha` constrained to
        :math:`\\alpha_i > 0`.

    Returns
    -------
    tf.Tensor
        A tensor of one dimension less than the input.
    """
    alpha = tf.cast(alpha, dtype=tf.float32)
    multivariate_idx = len(get_dims(alpha)) - 1
    K = get_dims(alpha)[multivariate_idx]
    if multivariate_idx == 0:
        a = tf.reduce_sum(alpha)
        return tf.lbeta(alpha) + \
               (a - K) * tf.digamma(a) - \
               tf.reduce_sum((alpha - 1.0) * tf.digamma(alpha))
    else:
        a = tf.reduce_sum(alpha, multivariate_idx)
        return tf.lbeta(alpha) + \
               (a - K) * tf.digamma(a) - \
               tf.reduce_sum((alpha - 1.0) * tf.digamma(alpha), multivariate_idx)

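# A small scipy cross-check (alpha values arbitrary) of the Dirichlet entropy
# formula used above: H = ln B(alpha) + (a - K) psi(a) - sum_i (alpha_i - 1) psi(alpha_i).
import numpy as np
from scipy.special import digamma, gammaln
from scipy.stats import dirichlet

alpha = np.array([2.0, 3.0, 4.0])
K = alpha.size
a = alpha.sum()
lbeta = gammaln(alpha).sum() - gammaln(a)
closed_form = lbeta + (a - K) * digamma(a) - ((alpha - 1.0) * digamma(alpha)).sum()
assert abs(closed_form - dirichlet.entropy(alpha)) < 1e-10
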
def _weight_hessian_ab(
        self,
        X,
        loc,
        scale,
):
    one_minus_loc = 1 - loc
    loc_times_scale = loc * scale
    one_minus_loc_times_scale = one_minus_loc * scale
    scalar_one = tf.constant(1, shape=(), dtype=self.dtype)

    if isinstance(X, tf.SparseTensor):
        # Using the dense matrix of the location model to serve the correct
        # shapes for the sparse X. Adding tf.zeros_like(loc) is a hack to avoid
        # a bug thrown by log on a sparse matrix below; to_dense does not work.
        const1 = tf.sparse_add(tf.zeros_like(loc), X).__div__(
            -tf.sparse.add(X, -tf.ones_like(loc)))
    else:
        const1 = tf.log(X / (1 - X))

    const2 = -tf.digamma(loc_times_scale) + tf.digamma(
        one_minus_loc_times_scale) + const1
    const3 = scale * (-tf.polygamma(scalar_one, loc_times_scale) * loc
                      + one_minus_loc * tf.polygamma(scalar_one,
                                                     one_minus_loc_times_scale))
    const = loc * one_minus_loc_times_scale * (const2 + const3)
    return const

def kl_divergence(self, other):
    assert isinstance(other, Beta)
    return other.log_norm - self.log_norm \
        - tf.digamma(self.beta) * (other.beta - self.beta) \
        - tf.digamma(self.alpha) * (other.alpha - self.alpha) \
        + tf.digamma(self.sum) * (other.sum - self.sum)

def compute_log_pi(alpha_k):
    # Bishop, PRML, Eq. (10.66): E[ln pi_k] = psi(alpha_k) - psi(alpha_hat).
    with tf.name_scope('compute_log_pi'):
        alpha_hat = tf.reduce_sum(alpha_k)
        return tf.subtract(tf.digamma(alpha_k), tf.digamma(alpha_hat),
                           name='log_pi')

def _entropy(self):
    k = tf.cast(self.event_shape_tensor()[0], self.dtype)
    return (self._log_normalization()
            + ((self.total_concentration - k) *
               tf.digamma(self.total_concentration))
            - tf.reduce_sum(
                (self.concentration - 1.) * tf.digamma(self.concentration),
                axis=-1))

def dirichlet_expectation(alpha):
    """Dirichlet expectation computation.

    .. math:: \\Psi(\\alpha_{k}) - \\Psi\\left(\\sum_{i=1}^{K} \\alpha_{i}\\right)
    """
    return tf.subtract(tf.digamma(tf.add(alpha, np.finfo(np.float32).eps)),
                       tf.digamma(tf.reduce_sum(alpha)))

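# Numeric illustration (not from the original source) of the expectation
# E_q[log theta_k] = psi(alpha_k) - psi(sum_i alpha_i), for an arbitrary alpha.
import numpy as np
from scipy.special import digamma

alpha = np.array([1.0, 2.0, 3.0], dtype=np.float32)
expected_log_theta = digamma(alpha + np.finfo(np.float32).eps) - digamma(alpha.sum())
# expected_log_theta is approximately [-2.283, -1.283, -0.783].
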
def KL(alpha, K):
    beta = tf.constant(np.ones((1, K)), dtype=tf.float32)
    S_alpha = tf.reduce_sum(alpha, axis=1, keepdims=True)
    KL = tf.reduce_sum((alpha - beta) * (tf.digamma(alpha) - tf.digamma(S_alpha)),
                       axis=1, keepdims=True) + \
        tf.lgamma(S_alpha) - tf.reduce_sum(tf.lgamma(alpha), axis=1, keepdims=True) + \
        tf.reduce_sum(tf.lgamma(beta), axis=1, keepdims=True) - \
        tf.lgamma(tf.reduce_sum(beta, axis=1, keepdims=True))
    return KL

def _entropy(self):
    v = tf.ones(self.batch_shape_tensor(), dtype=self.dtype)[..., tf.newaxis]
    u = v * self.df[..., tf.newaxis]
    beta_arg = tf.concat([u, v], -1) / 2.
    return (tf.log(tf.abs(self.scale)) + 0.5 * tf.log(self.df) +
            tf.lbeta(beta_arg) +
            0.5 * (self.df + 1.) *
            (tf.digamma(0.5 * (self.df + 1.)) - tf.digamma(0.5 * self.df)))

def _entropy(self):
    k = tf.cast(self.event_shape_tensor()[0], self.dtype)
    return (
        self._log_normalization()
        + ((self.total_concentration - k) *
           tf.digamma(self.total_concentration))
        - tf.reduce_sum(
            (self.concentration - 1.) * tf.digamma(self.concentration),
            axis=-1))

def call(self, inputs, **kwargs):
    # Tensorflow needs to be imported here so that the saved model can be
    # loaded again.
    import tensorflow as tf

    alpha, beta, alpha_beta = inputs
    log_norm = tf.lgamma(x=alpha) + tf.lgamma(x=beta) - tf.lgamma(x=alpha_beta)
    entropy = log_norm \
        - (beta - 1.0) * tf.digamma(x=beta) \
        - (alpha - 1.0) * tf.digamma(x=alpha) \
        + (alpha_beta - 2.0) * tf.digamma(x=alpha_beta)
    return tf.reduce_mean(entropy) * self.entropy_loss_bonus

def to_mu(self, alpha):
    """
    :param alpha: (Tensor)
    :return: (Tensor) mu
    """
    d1 = self.latent_dim - 1
    digamma_d = T.digamma(alpha[:, -1:])
    mu = T.digamma(alpha[:, :d1]) - digamma_d
    return mu

def get_layer_KL(number):
    a = self.layers[number].a
    b = self.layers[number].b
    term_1 = tf.divide(-b + 1, b + 1e-20)
    term_2 = tf.log(tf.divide(tf.multiply(a, b), alpha + 1e-20) + 1e-20)
    term_bracket = (tf.digamma(1.) - tf.digamma(b) - tf.divide(1., b + 1e-20))
    term_3 = tf.multiply(tf.divide(a - alpha, a + 1e-20), term_bracket)
    return tf.reduce_sum(term_1 + term_2 + term_3)

def build_annot_KL(self):
    alpha_diff = self.alpha_tilde - self.alpha
    KL_annot = (tf.reduce_sum(
                    tf.multiply(alpha_diff, tf.digamma(self.alpha_tilde)))
                - tf.reduce_sum(
                    tf.digamma(tf.reduce_sum(self.alpha_tilde, 1)) *
                    tf.reduce_sum(alpha_diff, 1))
                + tf.reduce_sum(
                    tf.lbeta(tf.matrix_transpose(self.alpha)) -
                    tf.lbeta(tf.matrix_transpose(self.alpha_tilde))))
    return KL_annot

def vae_loss(x, x_star):
    reconstruction_error = (self.original_dim *
                            metrics.binary_crossentropy(x, x_star))
    a0 = K.sum(alpha, axis=-1, keepdims=True)
    kl = (T.lgamma(a0) - K.sum(T.lgamma(alpha), axis=-1)
          + K.sum(alpha * T.digamma(alpha), axis=-1)
          - K.sum(alpha * T.digamma(a0), axis=-1)
          - K.mean(T.digamma(alpha) - T.digamma(a0), axis=-1))
    return reconstruction_error + kl

def weight_loss(neighb_count: np.ndarray, labels: np.ndarray,
                weights: list) -> float:
    """Calculates loss for given neighbors and weights.

    Parameters
    ----------
    neighb_count : numpy array
        Describes for each point the number of neighbors with each label.
        Shape: (number of data points, number of labels)
    labels : numpy array
        Label for each point. Shape: (number of data points,)
    weights : list
        Weight of each label. Length: number of labels

    Returns
    -------
    float
        Calculated loss
    """
    # reset graph before each run
    tf.reset_default_graph()
    num_data, num_labels = neighb_count.shape
    label_counts = np.zeros([num_labels])
    for label, count in Counter(labels).most_common():
        label_counts[label] = count

    # neighbors matrix
    neigh_matx = tf.constant(neighb_count, dtype=tf.float32)
    # label count vector
    label_cnts = tf.constant(label_counts, dtype=tf.float32)
    # weights
    w = tf.constant(weights, dtype=tf.float32)
    # weight lookup list
    w_list = tf.reduce_sum(tf.one_hot(labels, num_labels) * w, axis=1)
    # label counts lookup list
    label_cnts_list = tf.reduce_sum(tf.one_hot(labels, num_labels) * label_cnts,
                                    axis=1)

    nx = w * num_data
    ny = label_cnts_list / w_list * \
        tf.reduce_sum(neigh_matx * (w / label_cnts), axis=1)
    loss = (tf.reduce_sum(tf.digamma(nx) * w)
            + tf.reduce_sum(tf.digamma(ny) * w_list / label_cnts_list))

    with tf.Session() as sess:
        return sess.run(loss)

def loss_eq4(p, alpha, K, global_step, annealing_step):
    loglikelihood = tf.reduce_mean(
        tf.reduce_sum(
            p * (tf.digamma(tf.reduce_sum(alpha, axis=1, keepdims=True))
                 - tf.digamma(alpha)),
            1, keepdims=True))
    KL_reg = tf.minimum(
        1.0, tf.cast(global_step / annealing_step, tf.float32)) * KL(
            (alpha - 1) * (1 - p) + 1, K)
    return loglikelihood + KL_reg

def elbo_mf(a1, be1, a2, be2, a, b, c, d, x, crt, N):
    Elogr = tf.digamma(a1) - tf.log(be1)
    Elogp = tf.digamma(a2) - tf.digamma(a2 + be2)
    Elog1_p = tf.digamma(be2) - tf.digamma(a2 + be2)
    log_B = tf.lgamma(a2) + tf.lgamma(be2) - tf.lgamma(a2 + be2)
    term1 = a1 * tf.log(be1) - tf.lgamma(a1) - log_B
    term2 = (a2 - c - tf.reduce_sum(x)) * Elogp + \
            (a1 - a - tf.reduce_sum(crt)) * Elogr
    term3 = -(be1 - b) * (a1 / be1) + (be2 - d) * Elog1_p - \
            N * (a1 / be1) * Elog1_p
    return term1 + term2 + term3

def KL(alpha, outputSize):
    beta = tf.constant(np.ones((1, outputSize)), dtype=tf.float32)
    S_alpha = tf.reduce_sum(alpha, axis=1, keep_dims=True)
    S_beta = tf.reduce_sum(beta, axis=1, keep_dims=True)
    lnB = tf.lgamma(S_alpha) - tf.reduce_sum(tf.lgamma(alpha), axis=1,
                                             keep_dims=True)
    lnB_uni = tf.reduce_sum(tf.lgamma(beta), axis=1, keep_dims=True) - \
        tf.lgamma(S_beta)
    dg0 = tf.digamma(S_alpha)
    dg1 = tf.digamma(alpha)
    kl = tf.reduce_sum((alpha - beta) * (dg1 - dg0), axis=1,
                       keep_dims=True) + lnB + lnB_uni
    return kl

def loss_EDL(p, alpha, global_step, annealing_step, outputSize):
    S = tf.reduce_sum(alpha, axis=1, keep_dims=True)
    E = alpha - 1
    A = tf.reduce_mean(tf.reduce_sum(p * (tf.digamma(S) - tf.digamma(alpha)),
                                     1, keepdims=True))
    annealing_coef = tf.minimum(
        1.00, tf.cast(global_step / annealing_step, tf.float32))
    alp = E * (1 - p) + 1
    B = annealing_coef * KL(alp, outputSize)
    return (A + B)

def entropy(alpha, beta):
    """
    Calculates the entropy of the beta distribution parameterised by the shape
    parameters alpha and beta.

    :param alpha: The shape parameter alpha, which must be a positive scalar.
    :param beta: The shape parameter beta, which must be a positive scalar.
    :return: The entropy of the beta distribution parameterised by alpha and beta.
    """
    total_concentration = alpha + beta
    return tf.lgamma(alpha) + tf.lgamma(beta) - tf.lgamma(total_concentration) \
        - (alpha - 1.0) * tf.digamma(alpha) \
        - (beta - 1.0) * tf.digamma(beta) \
        + (total_concentration - 2.0) * tf.digamma(total_concentration)

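# Sketch (alpha/beta values arbitrary) confirming that the closed form above
# equals the Beta differential entropy reported by scipy.stats.beta.
from scipy.special import digamma, gammaln
from scipy.stats import beta as beta_dist

a, b = 2.0, 5.0
closed_form = (gammaln(a) + gammaln(b) - gammaln(a + b)
               - (a - 1.0) * digamma(a)
               - (b - 1.0) * digamma(b)
               + (a + b - 2.0) * digamma(a + b))
assert abs(closed_form - beta_dist(a, b).entropy()) < 1e-8
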
def body(gtm1, gt, t, phi_dtm1):
    exp_E_log_theta_d = tf.exp(tf.digamma(gt) - tf.digamma(tf.reduce_sum(gt)))
    phi_dt = tf.ones(phi_d_shape) * exp_E_log_theta_d
    phi_dt *= exp_E_log_beta_d
    phinorm = tf.matmul(exp_E_log_beta_d,
                        tf.expand_dims(exp_E_log_theta_d, axis=-1)) + 1e-6
    phi_dt /= phinorm
    gtp1 = a + tf.reduce_sum(phi_dt, axis=0)
    gtp1.set_shape([self.K])
    phi_dt.set_shape([None, self.K])
    return gt, gtp1, t + 1, phi_dt

def _compute_elbo(self, gammas, phi, data):
    self._log_lik = 0.0
    kl_thetas = []
    kl_zs = []
    for i, (p, d) in enumerate(zip(phi, data)):
        g = gammas[i]

        # Data log-likelihood:
        lambdas = tf.nn.softmax(self.lambdas)
        lambdas = tf.clip_by_value(lambdas, 1e-2, 1 - 1e-2)
        word_proportions = tf.gather(tf.transpose(lambdas, [1, 0]), d)
        word_proportions = tf.expand_dims(word_proportions, -1)
        p = tf.expand_dims(p, 1)
        log_lik = tf.matmul(p, tf.log(word_proportions))[:, 0, 0]
        log_lik = tf.reduce_sum(log_lik, axis=0)
        self._log_lik += log_lik

        # KL[q(z|phi) || p(z|theta)]
        E_log_theta = tf.digamma(g) - tf.digamma(tf.reduce_sum(g))
        p = tf.clip_by_value(p, 1e-3, 1 - 1e-3)
        kl_z = tf.reduce_sum((tf.log(p) - E_log_theta) * p)
        kl_zs.append(kl_z)

        # KL[q(theta|gamma) || q(theta|alpha)]
        a = self.alpha[i]
        kl_theta_d = tf.lgamma(tf.reduce_sum(g))
        kl_theta_d -= tf.reduce_sum(tf.lgamma(g))
        kl_theta_d -= tf.lgamma(tf.reduce_sum(a))
        kl_theta_d += tf.reduce_sum(tf.lgamma(a))
        kl_theta_d += tf.reduce_sum((g - a) * E_log_theta)
        kl_thetas.append(kl_theta_d)

    # KL[q(beta|lambda) || p(beta|eta)]
    E_log_beta = tf.digamma(self.lambdas)
    E_log_beta -= tf.digamma(tf.reduce_sum(self.lambdas, axis=1, keep_dims=True))
    kl_beta = tf.lgamma(tf.reduce_sum(self.lambdas, axis=1))
    kl_beta -= tf.reduce_sum(tf.lgamma(self.lambdas), axis=1)
    kl_beta -= tf.lgamma(tf.reduce_sum(self.eta, axis=1))
    kl_beta += tf.reduce_sum(tf.lgamma(self.eta), axis=1)
    kl_beta += tf.reduce_sum((self.lambdas - self.eta) * E_log_beta, axis=1)
    kl_beta = tf.reduce_sum(kl_beta, axis=0)

    self._kl_terms = OrderedDict(
        kl_z=tf.reduce_sum(kl_zs, axis=0),
        kl_beta=kl_beta,
        kl_theta=tf.reduce_sum(kl_thetas, axis=0),
    )
    kl_list = list(six.itervalues(self._kl_terms))
    self._elbo = self._log_lik - tf.reduce_sum(kl_list)

def beta_kl_divergence(sample, prior_alpha, prior_beta):
    # Method-of-moments estimates of the Beta parameters implied by the sample.
    mu = tf.math.reduce_mean(sample)
    var = tf.reduce_mean(tf.squared_difference(sample, mu)) + EPSILON
    observed_alpha = ((1. - mu) / var - (1. / (mu + EPSILON))) * tf.square(mu)
    observed_beta = observed_alpha * (1. / (mu + EPSILON) - 1)
    return (tf.lgamma(prior_alpha + prior_beta)
            - (tf.lgamma(prior_alpha) + tf.lgamma(prior_beta))
            - (tf.lgamma(observed_alpha + observed_beta + EPSILON))
            + (tf.lgamma(observed_alpha + EPSILON)
               + tf.lgamma(observed_beta + EPSILON))
            + (prior_alpha - observed_alpha) *
            (tf.digamma(prior_alpha) - tf.digamma(prior_alpha + prior_beta))
            + (prior_beta - observed_beta) *
            (tf.digamma(prior_beta) - tf.digamma(prior_alpha + prior_beta)))

def masked_cross_entropy_dirichlet(preds, labels, mask):
    """Expected cross-entropy under a Dirichlet parameterised by preds + 1, with masking."""
    preds = preds + tf.constant(1.0)
    S = tf.reduce_sum(preds, axis=1)
    S = tf.reshape(S, [-1, 1])
    # prob = tf.div(preds, S)
    s_digamma = tf.digamma(S)
    loss = labels * (s_digamma - tf.digamma(preds))
    loss = tf.reduce_sum(loss, axis=1)
    # loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    loss *= mask
    return tf.reduce_mean(loss)

def _harmonic_number(x):
    """Compute the harmonic number from its analytic continuation.

    Derivation from [here](
    https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers)
    and [Euler's constant](
    https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant).

    Args:
      x: input float.

    Returns:
      z: The analytic continuation of the harmonic number for the input.
    """
    one = tf.ones([], dtype=x.dtype)
    return tf.digamma(x + one) - tf.digamma(one)

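# Tiny check (separate from the snippet above) that the analytic continuation
# reduces to the ordinary harmonic number at integers: psi(n + 1) - psi(1) = H_n.
from scipy.special import digamma

h4 = digamma(4 + 1) - digamma(1)
assert abs(h4 - (1 + 1 / 2 + 1 / 3 + 1 / 4)) < 1e-10
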
def entropy(self, a, b):
    """Entropy of probability distribution.

    Parameters
    ----------
    a : tf.Tensor
        A n-D tensor with all elements constrained to :math:`a > 0`.
    b : tf.Tensor
        A n-D tensor with all elements constrained to :math:`b > 0`.

    Returns
    -------
    tf.Tensor
        A tensor of same shape as input.
    """
    a = tf.cast(tf.squeeze(a), dtype=tf.float32)
    b = tf.cast(tf.squeeze(b), dtype=tf.float32)
    if len(a.get_shape()) == 0:
        return tf.lbeta(tf.pack([a, b])) - \
               (a - 1.0) * tf.digamma(a) - \
               (b - 1.0) * tf.digamma(b) + \
               (a + b - 2.0) * tf.digamma(a + b)
    else:
        return tf.lbeta(tf.concat(1, [tf.expand_dims(a, 1),
                                      tf.expand_dims(b, 1)])) - \
               (a - 1.0) * tf.digamma(a) - \
               (b - 1.0) * tf.digamma(b) + \
               (a + b - 2.0) * tf.digamma(a + b)

def _chain_gets_correct_expectations(self, x, independent_chain_ndims):
    counter = collections.Counter()

    def log_gamma_log_prob(x):
        counter['target_calls'] += 1
        event_dims = tf.range(independent_chain_ndims, tf.rank(x))
        return self._log_gamma_log_prob(x, event_dims)

    samples, kernel_results = tfp.mcmc.sample_chain(
        num_results=150,
        current_state=x,
        kernel=tfp.mcmc.HamiltonianMonteCarlo(
            target_log_prob_fn=log_gamma_log_prob,
            step_size=0.05,
            num_leapfrog_steps=2,
            seed=_set_seed(42)),
        num_burnin_steps=150,
        parallel_iterations=1)

    if tf.executing_eagerly():
        # TODO(b/79991421): Figure out why this is approx twice as many as it
        # should be. I.e., `expected_calls = (150 + 150) * 2 + 1`.
        expected_calls = 1202
    else:
        expected_calls = 2
    self.assertAllEqual(dict(target_calls=expected_calls), counter)

    expected_x = (tf.digamma(self._shape_param) - np.log(self._rate_param))
    expected_exp_x = self._shape_param / self._rate_param

    log_accept_ratio_, samples_, expected_x_ = self.evaluate(
        [kernel_results.log_accept_ratio, samples, expected_x])

    actual_x = samples_.mean()
    actual_exp_x = np.exp(samples_).mean()
    acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))

    tf.logging.vlog(1, 'True E[x, exp(x)]: {}\t{}'.format(
        expected_x_, expected_exp_x))
    tf.logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format(
        actual_x, actual_exp_x))
    self.assertNear(actual_x, expected_x_, 2e-2)
    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
    self.assertAllEqual(np.ones_like(acceptance_probs, np.bool),
                        acceptance_probs > 0.5)
    self.assertAllEqual(np.ones_like(acceptance_probs, np.bool),
                        acceptance_probs <= 1.)

def entropy(self, a, scale=1):
    """Entropy of probability distribution.

    Parameters
    ----------
    a : tf.Tensor
        **Shape** parameter. A n-D tensor with all elements constrained to
        :math:`a > 0`.
    scale : tf.Tensor
        **Scale** parameter. A n-D tensor with all elements constrained to
        :math:`scale > 0`.

    Returns
    -------
    tf.Tensor
        A tensor of same shape as input.
    """
    a = tf.cast(a, dtype=tf.float32)
    scale = tf.cast(scale, dtype=tf.float32)
    return a + tf.log(scale * tf.exp(tf.lgamma(a))) - \
        (1.0 + a) * tf.digamma(a)

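# The closed form above coincides with the differential entropy of an
# inverse-gamma distribution with shape `a` and scale `scale`; a scipy sketch
# under that reading (parameter values arbitrary):
import numpy as np
from scipy.special import digamma, gammaln
from scipy.stats import invgamma

a, scale = 3.0, 2.0
closed_form = a + np.log(scale * np.exp(gammaln(a))) - (1.0 + a) * digamma(a)
assert abs(closed_form - invgamma(a, scale=scale).entropy()) < 1e-8
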
def _kl_dirichlet_dirichlet(d1, d2, name=None):
    """Batchwise KL divergence KL(d1 || d2) with d1 and d2 Dirichlet.

    Args:
      d1: instance of a Dirichlet distribution object.
      d2: instance of a Dirichlet distribution object.
      name: (optional) Name to use for created operations.
        Default is "kl_dirichlet_dirichlet".

    Returns:
      Batchwise KL(d1 || d2)
    """
    with tf.name_scope(name, "kl_dirichlet_dirichlet", values=[
            d1.concentration, d2.concentration]):
        # The KL between Dirichlet distributions can be derived as follows. We have
        #
        #   Dir(x; a) = 1 / B(a) * prod_i[x[i]^(a[i] - 1)]
        #
        # where B(a) is the multivariate Beta function:
        #
        #   B(a) = Gamma(a[1]) * ... * Gamma(a[n]) / Gamma(a[1] + ... + a[n])
        #
        # The KL is
        #
        #   KL(Dir(x; a), Dir(x; b)) = E_Dir(x; a){log(Dir(x; a) / Dir(x; b))}
        #
        # so we'll need to know the log density of the Dirichlet. This is
        #
        #   log(Dir(x; a)) = sum_i[(a[i] - 1) log(x[i])] - log B(a)
        #
        # The only term that matters for the expectations is the log(x[i]). To
        # compute the expectation of this term over the Dirichlet density, we can
        # use the following facts about the Dirichlet in exponential family form:
        #   1. log(x[i]) is a sufficient statistic
        #   2. expected sufficient statistics (of any exp family distribution) are
        #      equal to derivatives of the log normalizer with respect to
        #      corresponding natural parameters: E{T[i](x)} = dA/d(eta[i])
        #
        # To proceed, we can rewrite the Dirichlet density in exponential family
        # form as follows:
        #
        #   Dir(x; a) = exp{eta(a) . T(x) - A(a)}
        #
        # where '.' is the dot product of vectors eta and T, and A is a scalar:
        #
        #   eta[i](a) = a[i] - 1
        #     T[i](x) = log(x[i])
        #        A(a) = log B(a)
        #
        # Now, we can use fact (2) above to write
        #
        #   E_Dir(x; a)[log(x[i])]
        #       = dA(a) / da[i]
        #       = d/da[i] log B(a)
        #       = d/da[i] (sum_j lgamma(a[j]) - lgamma(sum_j a[j]))
        #       = digamma(a[i]) - digamma(sum_j a[j])
        #
        # Putting it all together, we have
        #
        #   KL[Dir(x; a) || Dir(x; b)]
        #       = E_Dir(x; a){log(Dir(x; a) / Dir(x; b))}
        #       = E_Dir(x; a){sum_i[(a[i] - b[i]) log(x[i])]} - (lbeta(a) - lbeta(b))
        #       = sum_i[(a[i] - b[i]) * E_Dir(x; a){log(x[i])}] - lbeta(a) + lbeta(b)
        #       = sum_i[(a[i] - b[i]) * (digamma(a[i]) - digamma(sum_j a[j]))]
        #         - lbeta(a) + lbeta(b)
        digamma_sum_d1 = tf.digamma(
            tf.reduce_sum(d1.concentration, axis=-1, keepdims=True))
        digamma_diff = tf.digamma(d1.concentration) - digamma_sum_d1
        concentration_diff = d1.concentration - d2.concentration
        return (tf.reduce_sum(concentration_diff * digamma_diff, axis=-1)
                - tf.lbeta(d1.concentration) + tf.lbeta(d2.concentration))

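# Monte Carlo cross-check (separate from the library code above; alpha/beta
# values arbitrary) of the closed form
#   KL = sum_i (a_i - b_i) (digamma(a_i) - digamma(sum_j a_j)) - lbeta(a) + lbeta(b).
import numpy as np
from scipy.special import digamma, gammaln
from scipy.stats import dirichlet

a = np.array([2.0, 3.0, 4.0])
b = np.array([1.0, 1.0, 1.0])

def lbeta(v):
    # Log multivariate Beta function, matching tf.lbeta.
    return gammaln(v).sum() - gammaln(v.sum())

closed_form = (((a - b) * (digamma(a) - digamma(a.sum()))).sum()
               - lbeta(a) + lbeta(b))

samples = dirichlet.rvs(a, size=200_000, random_state=0)
monte_carlo = np.mean(dirichlet.logpdf(samples.T, a) - dirichlet.logpdf(samples.T, b))
# closed_form and monte_carlo agree to roughly two decimal places.
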
def _multi_digamma(self, a, p, name="multi_digamma"):
    """Computes the multivariate digamma function; Psi_p(a)."""
    with self._name_scope(name, values=[a, p]):
        seq = self._multi_gamma_sequence(a, p)
        return tf.reduce_sum(tf.digamma(seq), axis=[-1])

def _entropy(self):
    return (self.concentration
            + tf.log(self.rate)
            + tf.lgamma(self.concentration)
            - ((1. + self.concentration) * tf.digamma(self.concentration)))