def elbo_components(self, inputs, training=None, mask=None, **kwargs):
  llk, kl = super().elbo_components(inputs=inputs, mask=mask, training=training)
  P, Q = self.last_outputs
  n_latents = len(self.ladder_latents) // 2
  # add one KL term per ladder layer, matching prior and posterior by name
  for i in range(n_latents):
    pz = [p for p in P if f'ladder_p{i}' in p.name][0]
    qz = [q for q in Q if f'ladder_q{i}' in q.name][0]
    kl[f'kl_ladder{i}'] = self.beta * kl_divergence(q=qz,
                                                    p=pz,
                                                    analytic=self.analytic,
                                                    free_bits=self.free_bits,
                                                    reverse=self.reverse)
  return llk, kl
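# A minimal sketch (not from the source) of how the two dictionaries returned
# above are typically reduced to a scalar training loss; the tensor shapes and
# the summation over dictionary values are illustrative assumptions only.
import tensorflow as tf

llk = {'llk_x': tf.constant([-120.5, -98.3])}            # log-likelihood per sample
kl = {'kl_latents': tf.constant([4.2, 3.7]),
      'kl_ladder0': tf.constant([1.1, 0.9])}             # one KL term per ladder layer
elbo = tf.add_n(list(llk.values())) - tf.add_n(list(kl.values()))
loss = tf.reduce_mean(-elbo)                             # maximize ELBO = minimize -ELBO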
def kl_divergence(self, prior=None, analytic=True, sample_shape=1, reverse=True):
  """KL(q||p) where `q` is the posterior distribution returned from the last call.

  Parameters
  ----------
  prior : instance of `tensorflow_probability.Distribution`
      prior distribution of the latent
  analytic : `bool` (default=`True`)
      use the closed-form solution for calculating the divergence,
      otherwise estimate it by MCMC sampling
  reverse : `bool`
      if `True`, calculate `KL(q||p)`, else `KL(p||q)`
  sample_shape : `int` (default=`1`)
      number of MCMC samples if `analytic=False`

  Returns
  -------
  kullback_divergence : Tensor [sample_shape, batch_size, ...]
  """
  if prior is None:
    prior = self._prior
  assert isinstance(prior, Distribution), "prior is not given!"
  if self.posterior is None:
    raise RuntimeError(
        "DistributionDense must be called to create the distribution before "
        "calculating the kl-divergence.")
  kullback_div = kl_divergence(q=self.posterior,
                               p=prior,
                               analytic=bool(analytic),
                               reverse=reverse,
                               q_sample=sample_shape)
  if analytic:
    kullback_div = tf.expand_dims(kullback_div, axis=0)
    if isinstance(sample_shape, Number) and sample_shape > 1:
      ndims = kullback_div.shape.ndims
      kullback_div = tf.tile(kullback_div, [sample_shape] + [1] * (ndims - 1))
  return kullback_div
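# A minimal, self-contained sketch (assuming only tensorflow_probability)
# contrasting the closed-form KL with a Monte-Carlo estimate, which is what the
# `analytic` / `sample_shape` arguments above switch between.
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

q = tfd.Normal(loc=[0.5, -0.2], scale=[1.2, 0.8])    # posterior
p = tfd.Normal(loc=[0.0, 0.0], scale=[1.0, 1.0])     # prior
kl_analytic = tfd.kl_divergence(q, p)                # closed form, shape [2]
z = q.sample(100)                                    # 100 MCMC samples, shape [100, 2]
kl_mcmc = tf.reduce_mean(q.log_prob(z) - p.log_prob(z), axis=0)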
def call(self, inputs):
  docs_topics_posterior = self.encoder(inputs)
  docs_topics_samples = docs_topics_posterior.sample(self.n_mcmc_samples)
  # [n_topics, n_words]
  topics_words_probs = tf.nn.softmax(self.topics_words_logits, axis=1)
  # [n_docs, n_words]
  docs_words_probs = tf.matmul(docs_topics_samples, topics_words_probs)
  output_dist = self.decoder(
      tf.clip_by_value(docs_words_probs, 1e-4, 1 - 1e-4))
  # initiate the prior; the Dirichlet concentration is clipped to a stable range
  concentration = tf.clip_by_value(tf.nn.softplus(self.prior_logit), 1e-3, 1e3)
  topics_prior = Dirichlet(concentration=concentration, name="topics_prior")
  # ELBO
  kl = kl_divergence(q=docs_topics_posterior,
                     p=topics_prior,
                     analytic=self.analytic,
                     q_sample=self.n_mcmc_samples,
                     auto_remove_independent=True)
  if self.analytic:
    kl = tf.expand_dims(kl, axis=0)
  llk = output_dist.log_prob(inputs)
  ELBO = llk - kl
  # maximizing the ELBO, hence minimizing the following loss
  self.add_loss(tf.reduce_mean(-ELBO))
  self.add_metric(tf.reduce_mean(kl), aggregation='mean', name="MeanKL")
  self.add_metric(tf.reduce_mean(-llk), aggregation='mean', name="MeanNLLK")
  return output_dist
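# A minimal sketch (shapes are illustrative assumptions) of the factorization
# used above: per-document topic proportions multiplied by per-topic word
# distributions give per-document word probabilities.
import tensorflow as tf

n_docs, n_topics, n_words = 4, 3, 10
docs_topics = tf.nn.softmax(tf.random.normal([n_docs, n_topics]), axis=-1)
topics_words = tf.nn.softmax(tf.random.normal([n_topics, n_words]), axis=-1)
docs_words = tf.matmul(docs_topics, topics_words)   # [n_docs, n_words], rows sum to 1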
def kl_divergence(self,
                  analytic: bool = False,
                  reverse: bool = False,
                  free_bits: Optional[float] = None,
                  raise_not_init: bool = True) -> tf.Tensor:
  if self._disable:
    return tf.zeros((), dtype=self.dtype)
  if raise_not_init:
    if self._posterior is None:
      raise ValueError('No posterior for the hierarchical latent variable.')
    if self._prior is None:
      raise ValueError("This HierarchicalLatents hasn't been called.")
  elif self._posterior is None or self._prior is None:
    return tf.zeros((), dtype=self.dtype)
  qz = self.posterior
  pz = self.prior
  kld = kl_divergence(q=qz,
                      p=pz,
                      analytic=analytic,
                      reverse=reverse,
                      free_bits=free_bits)
  return self.beta * kld
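# A minimal sketch (not the library's implementation) of the free-bits idea the
# `free_bits` argument refers to: the KL is clamped from below so the optimizer
# stops pushing a latent dimension's KL once it falls under the threshold.
import tensorflow as tf

kld = tf.constant([0.01, 0.3, 2.5])        # per-dimension KL values (assumed)
free_bits = 0.5
kld_clamped = tf.maximum(kld, free_bits)   # -> [0.5, 0.5, 2.5]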
def _elbo(self, inputs, pX_Z, qZ_X, mask, training):
  org_inputs = inputs
  inputs = inputs[:len(self.output_layers)]
  if mask is None:
    if len(org_inputs) == len(self.output_layers):
      # no labelled data
      X_unlabelled = inputs
    else:
      # all data is labelled
      X_unlabelled = [tf.zeros(shape=(0,) + i.shape[1:]) for i in inputs]
  else:
    m = tf.logical_not(tf.reshape(mask, (-1,)))
    X_unlabelled = [tf.boolean_mask(i, m, axis=0) for i in inputs]
  ## prepare the inputs as usual
  org_inputs, y, mask = self.prepare_inputs(org_inputs, mask)
  X_labelled = [tf.boolean_mask(i, mask, axis=0) for i in org_inputs]
  ## normal ELBO
  llk, div = super()._elbo(org_inputs, pX_Z, qZ_X, mask=mask, training=training)
  mask = tf.reshape(mask, (-1,))
  ### for unlabelled data
  mask_unlabelled = tf.logical_not(mask)
  pY_X = self.classify(X_unlabelled)
  probs = pY_X.probs_parameter()
  # log-likelihood
  llk_unlabelled = {}
  for name, lk in llk.items():
    lk = tf.transpose(lk)
    lk = tf.boolean_mask(lk, mask_unlabelled, axis=0)
    lk = tf.transpose(tf.reshape(lk, (self.n_labels, tf.shape(probs)[0], -1)))
    lk = tf.reduce_sum(lk * probs, axis=-1)
    llk_unlabelled[name + '_unlabelled'] = lk
  # kl-divergence
  div_unlabelled = {}
  for name, dv in div.items():
    dv = tf.transpose(dv)
    dv = tf.boolean_mask(dv, mask_unlabelled, axis=0)
    dv = tf.transpose(tf.reshape(dv, (self.n_labels, tf.shape(probs)[0], -1)))
    dv = tf.reduce_sum(dv * probs, axis=-1)
    div_unlabelled[name + '_unlabelled'] = dv
  div_unlabelled['kl_classifier'] = kl_divergence(pY_X, self.labels.prior, analytic=True)
  ### for labelled data, add the discriminative objective
  # log-likelihood
  llk_labelled = {
      name + '_labelled':
          tf.transpose(tf.boolean_mask(tf.transpose(lk), mask, axis=0))
      for name, lk in llk.items()
  }
  # add the classification (discrimination) loss
  y_labelled = tf.boolean_mask(y, mask, axis=0)
  pY_X = self.classify(X_labelled)
  llk_labelled['llk_classifier'] = self.alpha * pY_X.log_prob(y_labelled)
  # kl-divergence
  div_labelled = {
      name + '_labelled':
          tf.transpose(tf.boolean_mask(tf.transpose(dv), mask, axis=0))
      for name, dv in div.items()
  }
  ### merge everything
  llk = {
      k: tf.reduce_mean(v)
      for k, v in dict(**llk_unlabelled, **llk_labelled).items()
  }
  div = {
      k: tf.reduce_mean(v)
      for k, v in dict(**div_unlabelled, **div_labelled).items()
  }
  return llk, div
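# A minimal sketch (shapes are assumptions) of the marginalization used for the
# unlabelled terms above: each per-class quantity is weighted by the classifier
# probabilities q(y|x) and summed over the classes.
import tensorflow as tf

n_unlabelled, n_labels = 5, 3
per_class_llk = tf.random.normal([n_unlabelled, n_labels])    # llk under each label
probs = tf.nn.softmax(tf.random.normal([n_unlabelled, n_labels]), axis=-1)  # q(y|x)
expected_llk = tf.reduce_sum(per_class_llk * probs, axis=-1)   # shape [n_unlabelled]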