def test_kl_multivariate_normal_0d(self):
    """Check kl_multivariate_normal on scalar (rank-0) location/scale inputs."""
    # Each case is (loc_one, scale_one, second, expected), where ``second``
    # is either None (call without loc_two/scale_two) or a
    # (loc_two, scale_two) pair.
    cases = [
        (0.0, 1.0, None, 0.0),
        (10.0, 2.0, None, 50.806854),
        (0.0, 1.0, (0.0, 1.0), 0.0),
        (10.0, 2.0, (10.0, 5.0), 0.496290802),
    ]
    with self.test_session():
        for loc_a, scale_a, second, expected in cases:
            loc_one = tf.constant(loc_a)
            scale_one = tf.constant(scale_a)
            if second is None:
                result = kl_multivariate_normal(loc_one, scale_one)
            else:
                loc_b, scale_b = second
                result = kl_multivariate_normal(
                    loc_one, scale_one,
                    loc_two=tf.constant(loc_b),
                    scale_two=tf.constant(scale_b))
            self.assertAllClose(result.eval(), expected)
def test_kl_multivariate_normal_2d(self):
    """Check kl_multivariate_normal on 2x2 location/scale inputs."""
    # Each case is (loc_one, scale_one, second, expected), where ``second``
    # is either None (call without loc_two/scale_two) or a
    # (loc_two, scale_two) pair.
    cases = [
        ([[0.0, 0.0], [0.0, 0.0]], [[1.0, 1.0], [1.0, 1.0]],
         None,
         np.array([0.0, 0.0])),
        ([[10.0, 10.0], [10.0, 10.0]], [[2.0, 2.0], [2.0, 2.0]],
         None,
         np.array([101.61370849, 101.61370849])),
        ([[10.0, 10.0], [10.0, 10.0]], [[2.0, 2.0], [2.0, 2.0]],
         ([[10.0, 10.0], [10.0, 10.0]], [[2.0, 2.0], [2.0, 2.0]]),
         np.array([0.0, 0.0])),
        ([[10.0, 10.0], [0.0, 0.0]], [[2.0, 2.0], [1.0, 1.0]],
         ([[9.0, 9.0], [0.0, 0.0]], [[1.0, 1.0], [1.0, 1.0]]),
         np.array([2.6137056350, 0.0])),
    ]
    with self.test_session():
        for loc_a, scale_a, second, expected in cases:
            loc_one = tf.constant(loc_a)
            scale_one = tf.constant(scale_a)
            if second is None:
                result = kl_multivariate_normal(loc_one, scale_one)
            else:
                loc_b, scale_b = second
                result = kl_multivariate_normal(
                    loc_one, scale_one,
                    loc_two=tf.constant(loc_b),
                    scale_two=tf.constant(scale_b))
            self.assertAllClose(result.eval(), expected)
def test_kl_multivariate_normal_0d(self):
    """Check kl_multivariate_normal on scalar (rank-0) location/scale inputs."""

    def kl_of(loc_one, scale_one, loc_two=None, scale_two=None):
        # Wrap the raw Python values in constants and evaluate the KL tensor;
        # the second distribution is only passed when it is given.
        if loc_two is None:
            return kl_multivariate_normal(
                tf.constant(loc_one), tf.constant(scale_one)).eval()
        return kl_multivariate_normal(
            tf.constant(loc_one), tf.constant(scale_one),
            loc_two=tf.constant(loc_two),
            scale_two=tf.constant(scale_two)).eval()

    with self.test_session():
        self.assertAllClose(kl_of(0.0, 1.0), 0.0)
        self.assertAllClose(kl_of(10.0, 2.0), 50.806854)
        self.assertAllClose(kl_of(0.0, 1.0, 0.0, 1.0), 0.0)
        self.assertAllClose(kl_of(10.0, 2.0, 10.0, 5.0), 0.496290802)
def build_reparam_kl_loss(inference):
    r"""Build the negative-ELBO loss with an analytic KL term.

    The returned tensor is

    .. math::

      -ELBO = - ( E_{q(z; \lambda)} [ \log p(x | z) ]
                  - KL(q(z; \lambda) || p(z)) )

    and its automatic differentiation gives a stochastic gradient based on
    the reparameterization trick (Kingma and Welling, 2014).

    The expected log-likelihood is a Monte Carlo average over
    ``inference.n_samples`` draws from :math:`q(z;\lambda)`. The KL term is
    computed with ``kl_multivariate_normal``. For model wrappers only the
    variational ``mu``/``sigma`` are passed, i.e. the prior is assumed to be
    :math:`p(z) = \mathcal{N}(z; 0, 1)`.

    Returns the scalar loss tensor to minimize.
    """
    uses_wrapper = inference.model_wrapper is not None
    p_log_lik = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        sample_scope = 'inference_' + str(s)

        # Copy each q(z) so every Monte Carlo draw has its own sample ops.
        z_sample = {}
        for z, qz in six.iteritems(inference.latent_vars):
            z_sample[z] = copy(qz, scope=sample_scope).value()

        if uses_wrapper:
            p_log_lik[s] = inference.model_wrapper.log_lik(
                inference.data, z_sample)
            continue

        # Swap dictionary: priors are replaced by posterior samples and
        # observed random variables by their data. It aliases z_sample on
        # purpose; z_sample is not needed separately after this point.
        dict_swap = z_sample
        for x, obs in six.iteritems(inference.data):
            if isinstance(x, RandomVariable):
                dict_swap[x] = obs

        for x, obs in six.iteritems(inference.data):
            if not isinstance(x, RandomVariable):
                continue
            x_copy = copy(x, dict_swap, scope=sample_scope)
            p_log_lik[s] += tf.reduce_sum(x_copy.log_prob(obs))

    p_log_lik = tf.pack(p_log_lik)

    # Each variable's KL is reduced to a scalar before the terms are summed.
    if uses_wrapper:
        kl_terms = [
            tf.reduce_sum(kl_multivariate_normal(qz.mu, qz.sigma))
            for qz in six.itervalues(inference.latent_vars)
        ]
    else:
        kl_terms = [
            tf.reduce_sum(
                kl_multivariate_normal(qz.mu, qz.sigma, z.mu, z.sigma))
            for z, qz in six.iteritems(inference.latent_vars)
        ]
    kl = tf.reduce_sum(kl_terms)

    return -(tf.reduce_mean(p_log_lik) - kl)
def test_kl_multivariate_normal_2d(self):
    """Check kl_multivariate_normal on 2x2 location/scale inputs."""

    def kl_of(loc_one, scale_one, loc_two=None, scale_two=None):
        # Wrap the nested lists in constants and evaluate the KL tensor;
        # the second distribution is only passed when it is given.
        if loc_two is None:
            return kl_multivariate_normal(
                tf.constant(loc_one), tf.constant(scale_one)).eval()
        return kl_multivariate_normal(
            tf.constant(loc_one), tf.constant(scale_one),
            loc_two=tf.constant(loc_two),
            scale_two=tf.constant(scale_two)).eval()

    zeros = [[0.0, 0.0], [0.0, 0.0]]
    ones = [[1.0, 1.0], [1.0, 1.0]]
    twos = [[2.0, 2.0], [2.0, 2.0]]
    tens = [[10.0, 10.0], [10.0, 10.0]]

    with self.test_session():
        self.assertAllClose(kl_of(zeros, ones), np.array([0.0, 0.0]))
        self.assertAllClose(
            kl_of(tens, twos), np.array([101.61370849, 101.61370849]))
        self.assertAllClose(
            kl_of(tens, twos, tens, twos), np.array([0.0, 0.0]))
        self.assertAllClose(
            kl_of([[10.0, 10.0], [0.0, 0.0]], [[2.0, 2.0], [1.0, 1.0]],
                  [[9.0, 9.0], [0.0, 0.0]], ones),
            np.array([2.6137056350, 0.0]))
def build_score_loss_kl(self):
    r"""Build the score-function surrogate loss with an analytic KL term.

    The objective being minimized is

    .. math::

      -ELBO = - ( E_{q(z; \lambda)} [ \log p(x | z) ]
                  - KL(q(z; \lambda) || p(z)) )

    using the score function estimator (Paisley et al., 2012).

    The expectation is a Monte Carlo average over ``self.n_samples`` draws
    from :math:`q(z;\lambda)`. The KL term is computed with
    ``kl_multivariate_normal``. For model wrappers only the variational
    ``mu``/``sigma`` are passed, i.e. the prior is assumed to be
    :math:`p(z) = \mathcal{N}(z; 0, 1)`.

    Side effect: stores the ELBO estimate in ``self.loss``.

    Returns the surrogate tensor whose gradient is the score-function
    gradient of the negative ELBO.
    """
    p_log_lik = [0.0] * self.n_samples
    q_log_prob = [0.0] * self.n_samples
    for s in range(self.n_samples):
        scope = 'inference_' + str(s)
        z_sample = {}
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain a new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            z_sample[z] = qz_copy.value()
            # The sample is held fixed so gradients only flow through
            # log q, as the score function estimator requires.
            q_log_prob[s] += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        if self.model_wrapper is None:
            for x, obs in six.iteritems(self.data):
                if isinstance(x, RandomVariable):
                    # Copy p(x | z), replacing any conditioning on the prior
                    # with conditioning on the posterior sample.
                    x_copy = copy(x, dict_swap=z_sample, scope=scope)
                    p_log_lik[s] += tf.reduce_sum(x_copy.log_prob(obs))
        else:
            x = self.data
            p_log_lik[s] = self.model_wrapper.log_lik(x, z_sample)

    p_log_lik = tf.pack(p_log_lik)
    q_log_prob = tf.pack(q_log_prob)

    # Reduce each variable's KL to a scalar before summing across variables.
    # Without the inner reduction the per-variable results are stacked
    # directly, which requires them to share a shape.
    if self.model_wrapper is None:
        kl = tf.reduce_sum([
            tf.reduce_sum(
                kl_multivariate_normal(qz.mu, qz.sigma, z.mu, z.sigma))
            for z, qz in six.iteritems(self.latent_vars)
        ])
    else:
        kl = tf.reduce_sum([
            tf.reduce_sum(kl_multivariate_normal(qz.mu, qz.sigma))
            for qz in six.itervalues(self.latent_vars)
        ])

    self.loss = tf.reduce_mean(p_log_lik) - kl
    return -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_lik)) - kl)
def build_reparam_loss_kl(self):
    r"""Build the negative-ELBO loss with an analytic KL term.

    The objective is

    .. math::

      -ELBO = - ( E_{q(z; \lambda)} [ \log p(x | z) ]
                  - KL(q(z; \lambda) || p(z)) )

    and its automatic differentiation gives a stochastic gradient based on
    the reparameterization trick (Kingma and Welling, 2014).

    The expectation is a Monte Carlo average over ``self.n_samples`` draws
    from :math:`q(z;\lambda)`. The KL is computed from the stacked ``loc``
    and ``scale`` of the variational layers only, i.e. the prior is assumed
    to be :math:`p(z) = \mathcal{N}(z; 0, 1)`.

    Side effect: stores the ELBO estimate in ``self.loss``.

    Returns the negated ELBO tensor.
    """
    latent_draws = self.variational.sample(self.n_samples)

    layers = self.variational.layers
    stacked_loc = tf.pack([layer.loc for layer in layers])
    stacked_scale = tf.pack([layer.scale for layer in layers])

    expected_log_lik = tf.reduce_mean(
        self.model.log_lik(self.data, latent_draws))
    kl = kl_multivariate_normal(stacked_loc, stacked_scale)

    self.loss = expected_log_lik - kl
    return -self.loss
def build_score_loss_kl(self):
    r"""Build the score-function surrogate loss with an analytic KL term.

    The objective being minimized is

    .. math::

      -ELBO = - ( E_{q(z; \lambda)} [ \log p(x | z) ]
                  - KL(q(z; \lambda) || p(z)) )

    using the score function estimator (Paisley et al., 2012).

    The expectation is a Monte Carlo average over ``self.n_samples`` draws
    from :math:`q(z;\lambda)`. The KL is computed from the stacked ``loc``
    and ``scale`` of the variational layers only, i.e. the prior is assumed
    to be :math:`p(z) = \mathcal{N}(z; 0, 1)`.

    Side effect: stores the ELBO estimate in ``self.loss``.

    Returns the surrogate tensor whose gradient is the score-function
    gradient of the negative ELBO.
    """
    observed = self.data
    latent_draws = self.variational.sample(self.n_samples)

    # Samples are held fixed so gradients only flow through log q.
    log_q = self.variational.log_prob(stop_gradient(latent_draws))
    log_lik = self.model.log_lik(observed, latent_draws)

    layers = self.variational.layers
    stacked_loc = tf.pack([layer.loc for layer in layers])
    stacked_scale = tf.pack([layer.scale for layer in layers])
    kl = kl_multivariate_normal(stacked_loc, stacked_scale)

    self.loss = tf.reduce_mean(log_lik) - kl

    surrogate = tf.reduce_mean(log_q * stop_gradient(log_lik)) - kl
    return -surrogate
def build_loss(self):
    """Build the negative ELBO under the "model" variable scope.

    ELBO = E_{q(z | x)} [ log p(x | z) ] - KL(q(z | x) || p(z))

    Side effect: stores the ELBO tensor in ``self.loss``.

    Returns the negated ELBO tensor.
    """
    with tf.variable_scope("model"):
        inputs = self.x
        # TODO samples 1 set of latent variables for each data point
        latent, _unused = self.variational.sample(inputs, self.n_data)

        layers = self.variational.layers
        stacked_mean = tf.pack([layer.m for layer in layers])
        stacked_std = tf.pack([layer.s for layer in layers])

        # The log-likelihood is summed (not averaged) over its entries.
        total_log_lik = tf.reduce_sum(self.model.log_lik(inputs, latent))
        kl = kl_multivariate_normal(stacked_mean, stacked_std)
        self.loss = total_log_lik - kl

    return -self.loss
def build_loss(self):
    """Build the negative ELBO under the "model" variable scope.

    ELBO = E_{q(z | x)} [ log p(x | z) ] - KL(q(z | x) || p(z))

    Side effect: stores the ELBO tensor in ``self.losses``.

    Returns the negated ELBO tensor.
    """
    # In general, there should be a scale factor due to data subsampling,
    # so that
    #   ELBO = N / M * ( ELBO using x_b )
    # where x_b is a mini-batch of x, with sizes M and N respectively.
    # This is absorbed into the learning rate.
    with tf.variable_scope("model"):
        # Set the variational parameters from the mapping of the inputs.
        mapped_params = self.variational.mapping(self.x)
        self.variational.set_params(mapped_params)

        latent = self.variational.sample(self.n_data)

        total_log_lik = tf.reduce_sum(
            self.model.log_likelihood(self.x, latent))
        kl = kl_multivariate_normal(self.variational.m, self.variational.s)
        self.losses = total_log_lik - kl

    return -self.losses
def build_reparam_loss_kl(self):
    """Build the negative-ELBO loss for the reparameterization gradient.

    Loss function to minimize, whose gradient is a stochastic gradient
    based on the reparameterization trick.

    ELBO = E_{q(z; lambda)} [ log p(x | z) ] - KL(q(z; lambda) || p(z))

    where the KL is analytic. Only the variational parameters are passed to
    the KL, i.e. the model prior is assumed to be p(z) = N(z; 0, 1).

    Side effects: stores the second value returned by the variational
    sampler in ``self.samples`` and the ELBO estimate in ``self.loss``.

    Returns the negated ELBO tensor.
    """
    batch = self.data.sample(self.n_data)
    latent, self.samples = self.variational.sample(batch, self.n_minibatch)

    layers = self.variational.layers
    stacked_mean = tf.pack([layer.m for layer in layers])
    stacked_std = tf.pack([layer.s for layer in layers])

    expected_log_lik = tf.reduce_mean(self.model.log_lik(batch, latent))
    kl = kl_multivariate_normal(stacked_mean, stacked_std)

    self.loss = expected_log_lik - kl
    return -self.loss
def build_score_loss_kl(self):
    """Build the surrogate loss for the score-function gradient.

    Loss function to minimize, whose gradient is a stochastic gradient
    based on the score function estimator.

    ELBO = E_{q(z; lambda)} [ log p(x | z) ] - KL(q(z; lambda) || p(z))

    where the KL is analytic. Only the variational parameters are passed to
    the KL, i.e. the model prior is assumed to be p(z) = N(z; 0, 1).

    Side effects: stores the second value returned by the variational
    sampler in ``self.samples`` and the ELBO estimate in ``self.loss``.

    Returns the surrogate tensor whose gradient is the score-function
    gradient of the negative ELBO.
    """
    batch = self.data.sample(self.n_data)
    latent, self.samples = self.variational.sample(batch, self.n_minibatch)

    # Accumulate log q(z) factor by factor; the samples are held fixed so
    # gradients only flow through the variational parameters.
    log_q = tf.zeros([self.n_minibatch], dtype=tf.float32)
    for factor in range(self.variational.num_factors):
        log_q = log_q + self.variational.log_prob_zi(
            factor, tf.stop_gradient(latent))

    log_lik = self.model.log_lik(batch, latent)

    layers = self.variational.layers
    stacked_mean = tf.pack([layer.m for layer in layers])
    stacked_std = tf.pack([layer.s for layer in layers])
    kl = kl_multivariate_normal(stacked_mean, stacked_std)

    self.loss = tf.reduce_mean(log_lik) - kl

    surrogate = tf.reduce_mean(log_q * tf.stop_gradient(log_lik)) - kl
    return -surrogate
def test_kl_multivariate_normal_1d(self):
    """Check kl_multivariate_normal on rank-1 location/scale inputs."""
    # Each case is (loc_one, scale_one, second, expected), where ``second``
    # is either None (call without loc_two/scale_two) or a
    # (loc_two, scale_two) pair.
    cases = [
        ([0.0], [1.0], None, 0.0),
        ([10.0], [2.0], None, 50.806854),
        ([10.0], [2.0], ([10.0], [2.0]), 0.0),
        ([0.0, 0.0], [1.0, 1.0], None, 0.0),
        ([10.0, 10.0], [2.0, 2.0], None, 101.61370849),
        ([10.0, 10.0], [2.0, 2.0], ([9.0, 9.0], [1.0, 1.0]), 2.6137056350),
    ]
    with self.test_session():
        for loc_a, scale_a, second, expected in cases:
            loc_one = tf.constant(loc_a)
            scale_one = tf.constant(scale_a)
            if second is None:
                result = kl_multivariate_normal(loc_one, scale_one)
            else:
                loc_b, scale_b = second
                result = kl_multivariate_normal(
                    loc_one, scale_one,
                    loc_two=tf.constant(loc_b),
                    scale_two=tf.constant(scale_b))
            self.assertAllClose(result.eval(), expected)
def test_kl_multivariate_normal_1d(self):
    """Check kl_multivariate_normal on rank-1 location/scale inputs."""

    def kl_of(loc_one, scale_one, loc_two=None, scale_two=None):
        # Wrap the lists in constants and evaluate the KL tensor; the
        # second distribution is only passed when it is given.
        if loc_two is None:
            return kl_multivariate_normal(
                tf.constant(loc_one), tf.constant(scale_one)).eval()
        return kl_multivariate_normal(
            tf.constant(loc_one), tf.constant(scale_one),
            loc_two=tf.constant(loc_two),
            scale_two=tf.constant(scale_two)).eval()

    with self.test_session():
        # Single-element vectors.
        self.assertAllClose(kl_of([0.0], [1.0]), 0.0)
        self.assertAllClose(kl_of([10.0], [2.0]), 50.806854)
        self.assertAllClose(kl_of([10.0], [2.0], [10.0], [2.0]), 0.0)

        # Two-element vectors.
        self.assertAllClose(kl_of([0.0, 0.0], [1.0, 1.0]), 0.0)
        self.assertAllClose(kl_of([10.0, 10.0], [2.0, 2.0]), 101.61370849)
        self.assertAllClose(
            kl_of([10.0, 10.0], [2.0, 2.0], [9.0, 9.0], [1.0, 1.0]),
            2.6137056350)
def test_contraint_raises(self):
    """Check that invalid inputs to kl_multivariate_normal raise op errors.

    Covers a non-positive scale ('Condition'), an infinite location ('Inf')
    and a NaN location ('NaN').

    Each call that is expected to fail has its own ``assertRaisesOpError``
    context. When two calls share one context, the first error leaves the
    block and the second call is never executed.
    """
    with self.test_session():
        # Negative scales for both distributions.
        loc_one = tf.constant(10.0)
        scale_one = tf.constant(-1.0)
        loc_two = tf.constant(10.0)
        scale_two = tf.constant(-1.0)
        with self.assertRaisesOpError('Condition'):
            kl_multivariate_normal(loc_one, scale_one).eval()
        with self.assertRaisesOpError('Condition'):
            kl_multivariate_normal(loc_one, scale_one,
                                   loc_two=loc_two,
                                   scale_two=scale_two).eval()

        # Infinite location for the first distribution.
        loc_one = np.inf * tf.constant(10.0)
        scale_one = tf.constant(1.0)
        loc_two = tf.constant(10.0)
        scale_two = tf.constant(1.0)
        with self.assertRaisesOpError('Inf'):
            kl_multivariate_normal(loc_one, scale_one).eval()
        with self.assertRaisesOpError('Inf'):
            kl_multivariate_normal(loc_one, scale_one,
                                   loc_two=loc_two,
                                   scale_two=scale_two).eval()

        # NaN location for the second distribution only.
        loc_one = tf.constant(10.0)
        scale_one = tf.constant(1.0)
        loc_two = np.nan * tf.constant(10.0)
        scale_two = tf.constant(1.0)
        # The first distribution has no NaN, so the single-distribution
        # call is evaluated outside the raising context.
        kl_multivariate_normal(loc_one, scale_one).eval()
        with self.assertRaisesOpError('NaN'):
            kl_multivariate_normal(loc_one, scale_one,
                                   loc_two=loc_two,
                                   scale_two=scale_two).eval()
def build_score_kl_loss_and_gradients(inference, var_list):
    r"""Build the loss and score-function gradients with an analytic KL.

    Uses the score function estimator (Paisley et al., 2012). The
    expectation is a Monte Carlo average over ``inference.n_samples`` draws
    from :math:`q(z;\lambda)`. The KL term is computed with
    ``kl_multivariate_normal``. For model wrappers only the variational
    ``mu``/``sigma`` are passed, i.e. the prior is assumed to be
    :math:`p(z) = \mathcal{N}(z; 0, 1)`.

    Args:
      inference: object providing ``n_samples``, ``latent_vars``, ``data``,
        ``scale`` and ``model_wrapper``.
      var_list: variables to differentiate with respect to; if None,
        ``tf.trainable_variables()`` is used.

    Returns:
      A pair ``(loss, grads_and_vars)``: the negative ELBO estimate, and a
      list of (gradient, variable) tuples taken from the score-function
      surrogate.
    """
    p_log_lik = [0.0] * inference.n_samples
    q_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        scope = 'inference_' + str(id(inference)) + '/' + str(s)
        z_sample = {}
        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain a new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            z_sample[z] = qz_copy.value()
            # The sample is held fixed so gradients only flow through log q.
            z_log_prob = tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))
            if z in inference.scale:
                z_log_prob *= inference.scale[z]

            q_log_prob[s] += z_log_prob

        if inference.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            dict_swap = z_sample
            for x, qx in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    if isinstance(qx, RandomVariable):
                        qx_copy = copy(qx, scope=scope)
                        dict_swap[x] = qx_copy.value()
                    else:
                        dict_swap[x] = qx

            for x in six.iterkeys(inference.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope=scope)
                    x_log_lik = tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
                    if x in inference.scale:
                        x_log_lik *= inference.scale[x]

                    p_log_lik[s] += x_log_lik
        else:
            x = inference.data
            p_log_lik[s] = inference.model_wrapper.log_lik(x, z_sample)

    p_log_lik = tf.pack(p_log_lik)
    q_log_prob = tf.pack(q_log_prob)

    if inference.model_wrapper is None:
        # Weight each KL term by the scale factor registered for its latent
        # variable, as is done for the log-probability terms above. The
        # weight must come from inference.scale; inference.data holds
        # observations, not scale factors.
        kl = tf.reduce_sum([
            (inference.scale[z] if z in inference.scale else 1.0) *
            tf.reduce_sum(
                kl_multivariate_normal(qz.mu, qz.sigma, z.mu, z.sigma))
            for z, qz in six.iteritems(inference.latent_vars)
        ])
    else:
        kl = tf.reduce_sum([
            tf.reduce_sum(kl_multivariate_normal(qz.mu, qz.sigma))
            for qz in six.itervalues(inference.latent_vars)
        ])

    if var_list is None:
        var_list = tf.trainable_variables()

    loss = -(tf.reduce_mean(p_log_lik) - kl)
    # Differentiate the surrogate rather than the loss itself: the
    # likelihood is treated as a constant weight on log q.
    grads = tf.gradients(
        -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_lik)) - kl),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
dict_swap[x] = qx_copy.value() else: dict_swap[x] = qx for x in six.iterkeys(data): if isinstance(x, RandomVariable): x_copy = copy(x, dict_swap) x_log_lik = tf.reduce_sum(x_copy.log_prob(dict_swap[x])) p_log_lik[0] += x_log_lik p_log_lik = tf.pack(p_log_lik) kl = tf.reduce_sum([ data.get(z, 1.0) * tf.reduce_sum(kl_multivariate_normal(qz.mu, qz.sigma, z.mu, z.sigma)) for z, qz in six.iteritems(latent_vars) ]) loss = -(tf.reduce_mean(p_log_lik) - kl) # benchmark the gradient time grads = tf.gradients(loss, [v.ref() for v in var_list])[0] init = tf.initialize_all_variables() feed_dict = {} for key, value in six.iteritems(data): if isinstance(key, tf.Tensor): feed_dict[key] = value init.run(feed_dict)
def build_reparam_kl_loss_and_gradients(inference, var_list):
    r"""Build the negative-ELBO loss and its gradients with an analytic KL.

    The loss is

    .. math::

      -\text{ELBO} = - ( \mathbb{E}_{q(z; \lambda)} [ \log p(x \mid z) ]
                         - \text{KL}(q(z; \lambda) \| p(z)) )

    and its automatic differentiation gives a stochastic gradient based on
    the reparameterization trick (Kingma and Welling, 2014).

    The expectation is a Monte Carlo average over ``inference.n_samples``
    draws from :math:`q(z;\lambda)`. The KL term is computed with
    ``kl_multivariate_normal``. For model wrappers only the variational
    ``mu``/``sigma`` are passed, i.e. the prior is assumed to be
    :math:`p(z) = \mathcal{N}(z; 0, 1)`.

    Args:
      inference: object providing ``n_samples``, ``latent_vars``, ``data``,
        ``scale`` and ``model_wrapper``.
      var_list: variables to differentiate with respect to; if None,
        ``tf.trainable_variables()`` is used.

    Returns:
      A pair ``(loss, grads_and_vars)``: the negative ELBO estimate, and a
      list of (gradient, variable) tuples for that loss.
    """
    p_log_lik = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        scope = 'inference_' + str(id(inference)) + '/' + str(s)
        z_sample = {}
        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain a new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            z_sample[z] = qz_copy.value()

        if inference.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            dict_swap = z_sample
            for x, qx in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    if isinstance(qx, RandomVariable):
                        qx_copy = copy(qx, scope=scope)
                        dict_swap[x] = qx_copy.value()
                    else:
                        dict_swap[x] = qx

            for x in six.iterkeys(inference.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope=scope)
                    x_log_lik = tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
                    if x in inference.scale:
                        x_log_lik *= inference.scale[x]

                    p_log_lik[s] += x_log_lik
        else:
            x = inference.data
            p_log_lik[s] = inference.model_wrapper.log_lik(x, z_sample)

    p_log_lik = tf.pack(p_log_lik)

    if inference.model_wrapper is None:
        # Weight each KL term by the scale factor registered for its latent
        # variable, as is done for the likelihood terms above. The weight
        # must come from inference.scale; inference.data holds observations,
        # not scale factors.
        kl = tf.reduce_sum([
            (inference.scale[z] if z in inference.scale else 1.0) *
            tf.reduce_sum(
                kl_multivariate_normal(qz.mu, qz.sigma, z.mu, z.sigma))
            for z, qz in six.iteritems(inference.latent_vars)
        ])
    else:
        kl = tf.reduce_sum([
            tf.reduce_sum(kl_multivariate_normal(qz.mu, qz.sigma))
            for qz in six.itervalues(inference.latent_vars)
        ])

    loss = -(tf.reduce_mean(p_log_lik) - kl)

    if var_list is None:
        var_list = tf.trainable_variables()

    grads = tf.gradients(loss, [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars