def testKLRaises(self):
  ind1 = independent_lib.Independent(
      distribution=normal_lib.Normal(
          loc=np.float32([-1., 1]),
          scale=np.float32([0.1, 0.5])),
      reinterpreted_batch_ndims=1)
  ind2 = independent_lib.Independent(
      distribution=normal_lib.Normal(
          loc=np.float32(-1),
          scale=np.float32(0.5)),
      reinterpreted_batch_ndims=0)

  with self.assertRaisesRegexp(
      ValueError, "Event shapes do not match"):
    kullback_leibler.kl_divergence(ind1, ind2)

  ind1 = independent_lib.Independent(
      distribution=normal_lib.Normal(
          loc=np.float32([-1., 1]),
          scale=np.float32([0.1, 0.5])),
      reinterpreted_batch_ndims=1)
  ind2 = independent_lib.Independent(
      distribution=mvn_diag_lib.MultivariateNormalDiag(
          loc=np.float32([-1., 1]),
          scale_diag=np.float32([0.1, 0.5])),
      reinterpreted_batch_ndims=0)

  with self.assertRaisesRegexp(
      NotImplementedError, "different event shapes"):
    kullback_leibler.kl_divergence(ind1, ind2)
def testBetaBetaKL(self):
  with self.test_session() as sess:
    for shape in [(10,), (4, 5)]:
      a1 = 6.0 * np.random.random(size=shape) + 1e-4
      b1 = 6.0 * np.random.random(size=shape) + 1e-4
      a2 = 6.0 * np.random.random(size=shape) + 1e-4
      b2 = 6.0 * np.random.random(size=shape) + 1e-4
      # Take inverse softplus of values to test BetaWithSoftplusConcentration
      a1_sp = np.log(np.exp(a1) - 1.0)
      b1_sp = np.log(np.exp(b1) - 1.0)
      a2_sp = np.log(np.exp(a2) - 1.0)
      b2_sp = np.log(np.exp(b2) - 1.0)

      d1 = beta_lib.Beta(concentration1=a1, concentration0=b1)
      d2 = beta_lib.Beta(concentration1=a2, concentration0=b2)
      d1_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a1_sp,
                                                     concentration0=b1_sp)
      d2_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a2_sp,
                                                     concentration0=b2_sp)

      kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) +
                     (a1 - a2) * special.digamma(a1) +
                     (b1 - b2) * special.digamma(b1) +
                     (a2 - a1 + b2 - b1) * special.digamma(a1 + b1))

      for dist1 in [d1, d1_sp]:
        for dist2 in [d2, d2_sp]:
          kl = kullback_leibler.kl_divergence(dist1, dist2)
          kl_val = sess.run(kl)
          self.assertEqual(kl.get_shape(), shape)
          self.assertAllClose(kl_val, kl_expected)

      # Make sure KL(d1||d1) is 0
      kl_same = sess.run(kullback_leibler.kl_divergence(d1, d1))
      self.assertAllClose(kl_same, np.zeros_like(kl_expected))
def testDirichletDirichletKL(self):
  conc1 = np.array([[1., 2., 3., 1.5, 2.5, 3.5],
                    [1.5, 2.5, 3.5, 4.5, 5.5, 6.5]])
  conc2 = np.array([[0.5, 1., 1.5, 2., 2.5, 3.]])

  d1 = dirichlet_lib.Dirichlet(conc1)
  d2 = dirichlet_lib.Dirichlet(conc2)
  x = d1.sample(int(1e4), seed=0)
  kl_sample = math_ops.reduce_mean(d1.log_prob(x) - d2.log_prob(x), 0)
  kl_actual = kullback_leibler.kl_divergence(d1, d2)

  kl_sample_val = self.evaluate(kl_sample)
  kl_actual_val = self.evaluate(kl_actual)

  self.assertEqual(conc1.shape[:-1], kl_actual.get_shape())

  if not special:
    return

  kl_expected = (
      special.gammaln(np.sum(conc1, -1))
      - special.gammaln(np.sum(conc2, -1))
      - np.sum(special.gammaln(conc1) - special.gammaln(conc2), -1)
      + np.sum((conc1 - conc2) * (special.digamma(conc1) - special.digamma(
          np.sum(conc1, -1, keepdims=True))), -1))

  self.assertAllClose(kl_expected, kl_actual_val, atol=0., rtol=1e-6)
  self.assertAllClose(kl_sample_val, kl_actual_val, atol=0., rtol=1e-1)

  # Make sure KL(d1||d1) is 0
  kl_same = self.evaluate(kullback_leibler.kl_divergence(d1, d1))
  self.assertAllClose(kl_same, np.zeros_like(kl_expected))
def testDomainErrorExceptions(self):

  class MyDistException(normal.Normal):
    pass

  # Register KL to a function that returns NaN, so the NaN check triggers.
  @kullback_leibler.RegisterKL(MyDistException, MyDistException)
  # pylint: disable=unused-argument,unused-variable
  def _kl(a, b, name=None):
    return array_ops.identity([float("nan")])

  # pylint: disable=unused-argument,unused-variable

  with self.cached_session():
    a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=False)
    kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
    with self.assertRaisesOpError(
        "KL calculation between .* and .* returned NaN values"):
      self.evaluate(kl)
    with self.assertRaisesOpError(
        "KL calculation between .* and .* returned NaN values"):
      a.kl_divergence(a).eval()
    a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=True)
    kl_ok = kullback_leibler.kl_divergence(a, a)
    self.assertAllEqual([float("nan")], self.evaluate(kl_ok))
    self_kl_ok = a.kl_divergence(a)
    self.assertAllEqual([float("nan")], self.evaluate(self_kl_ok))
    cross_ok = a.cross_entropy(a)
    self.assertAllEqual([float("nan")], self.evaluate(cross_ok))
def testCategoricalCategoricalKL(self):

  def np_softmax(logits):
    exp_logits = np.exp(logits)
    return exp_logits / exp_logits.sum(axis=-1, keepdims=True)

  with self.cached_session() as sess:
    for categories in [2, 10]:
      for batch_size in [1, 2]:
        p_logits = self._rng.random_sample((batch_size, categories))
        q_logits = self._rng.random_sample((batch_size, categories))
        p = onehot_categorical.OneHotCategorical(logits=p_logits)
        q = onehot_categorical.OneHotCategorical(logits=q_logits)
        prob_p = np_softmax(p_logits)
        prob_q = np_softmax(q_logits)
        kl_expected = np.sum(
            prob_p * (np.log(prob_p) - np.log(prob_q)), axis=-1)

        kl_actual = kullback_leibler.kl_divergence(p, q)
        kl_same = kullback_leibler.kl_divergence(p, p)
        x = p.sample(int(2e4), seed=0)
        x = math_ops.cast(x, dtype=dtypes.float32)
        # Compute empirical KL(p||q).
        kl_sample = math_ops.reduce_mean(p.log_prob(x) - q.log_prob(x), 0)

        [kl_sample_, kl_actual_,
         kl_same_] = sess.run([kl_sample, kl_actual, kl_same])

        self.assertEqual(kl_actual.get_shape(), (batch_size,))
        self.assertAllClose(kl_same_, np.zeros_like(kl_expected))
        self.assertAllClose(kl_actual_, kl_expected, atol=0., rtol=1e-6)
        self.assertAllClose(kl_sample_, kl_expected, atol=1e-2, rtol=0.)
def testCategoricalCategoricalKL(self):

  def np_softmax(logits):
    exp_logits = np.exp(logits)
    return exp_logits / exp_logits.sum(axis=-1, keepdims=True)

  with self.cached_session() as sess:
    for categories in [2, 4]:
      for batch_size in [1, 10]:
        a_logits = np.random.randn(batch_size, categories)
        b_logits = np.random.randn(batch_size, categories)

        a = categorical.Categorical(logits=a_logits)
        b = categorical.Categorical(logits=b_logits)

        kl = kullback_leibler.kl_divergence(a, b)
        kl_val = sess.run(kl)
        # Make sure KL(a||a) is 0
        kl_same = sess.run(kullback_leibler.kl_divergence(a, a))

        prob_a = np_softmax(a_logits)
        prob_b = np_softmax(b_logits)
        kl_expected = np.sum(prob_a * (np.log(prob_a) - np.log(prob_b)),
                             axis=-1)

        self.assertEqual(kl.get_shape(), (batch_size,))
        self.assertAllClose(kl_val, kl_expected)
        self.assertAllClose(kl_same, np.zeros_like(kl_expected))
def _kl_independent(a, b, name="kl_independent"):
  """Batched KL divergence `KL(a || b)` for Independent distributions.

  We can leverage the fact that
  ```
  KL(Independent(a) || Independent(b)) = sum(KL(a || b))
  ```
  where the sum is over the `reinterpreted_batch_ndims`.

  Args:
    a: Instance of `Independent`.
    b: Instance of `Independent`.
    name: (optional) name to use for created ops. Default "kl_independent".

  Returns:
    Batchwise `KL(a || b)`.

  Raises:
    ValueError: If the event space for `a` and `b`, or their underlying
      distributions don't match.
  """
  p = a.distribution
  q = b.distribution

  # The KL between any two (non)-batched distributions is a scalar.
  # Given that the KL between two factored distributions is the sum, i.e.
  # KL(p1(x)p2(y) || q1(x)q2(y)) = KL(p1 || q1) + KL(p2 || q2), we compute
  # KL(p || q) and do a `reduce_sum` on the reinterpreted batch dimensions.
  if a.event_shape.is_fully_defined() and b.event_shape.is_fully_defined():
    if a.event_shape == b.event_shape:
      if p.event_shape == q.event_shape:
        num_reduce_dims = a.event_shape.ndims - p.event_shape.ndims
        reduce_dims = [-i - 1 for i in range(0, num_reduce_dims)]
        return math_ops.reduce_sum(
            kullback_leibler.kl_divergence(p, q, name=name), axis=reduce_dims)
      else:
        raise NotImplementedError("KL between Independents with different "
                                  "event shapes not supported.")
    else:
      raise ValueError("Event shapes do not match.")
  else:
    with ops.control_dependencies([
        check_ops.assert_equal(a.event_shape_tensor(), b.event_shape_tensor()),
        check_ops.assert_equal(p.event_shape_tensor(), q.event_shape_tensor())
    ]):
      # Number of reinterpreted dimensions is the difference in event-shape
      # ranks, computed dynamically when static shapes are unknown.
      num_reduce_dims = (
          array_ops.shape(a.event_shape_tensor())[0] -
          array_ops.shape(p.event_shape_tensor())[0])
      reduce_dims = math_ops.range(-num_reduce_dims, 0, 1)
      return math_ops.reduce_sum(
          kullback_leibler.kl_divergence(p, q, name=name), axis=reduce_dims)
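# Editorial sketch (not part of the library or its tests): a plain-NumPy check
# of the identity used by `_kl_independent` above -- reinterpreting batch
# dimensions as event dimensions just sums the component KLs. Only the
# standard closed-form Normal-Normal KL is assumed; the names are
# illustrative.
def _sketch_independent_normal_kl():
  import numpy as np

  def normal_kl(mu_a, sig_a, mu_b, sig_b):
    # KL(N(mu_a, sig_a) || N(mu_b, sig_b)) in closed form.
    return ((mu_a - mu_b) ** 2 / (2 * sig_b ** 2) +
            0.5 * ((sig_a ** 2 / sig_b ** 2) - 1 -
                   2 * np.log(sig_a / sig_b)))

  mu1, sig1 = np.array([-1., 1.]), np.array([0.1, 0.5])
  mu2, sig2 = np.array([-3., 3.]), np.array([0.3, 0.3])
  componentwise = normal_kl(mu1, sig1, mu2, sig2)  # shape (2,): batch of KLs
  # Treating the batch dimension as an event dimension (Independent with
  # reinterpreted_batch_ndims=1) sums the component KLs.
  multivariate = componentwise.sum(axis=-1)
  return componentwise, multivariate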
def test_kl_reverse(self):
  with self.test_session() as sess:

    q = normal_lib.Normal(
        loc=np.ones(6),
        scale=np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0]))

    p = normal_lib.Normal(loc=q.loc + 0.1, scale=q.scale - 0.2)

    approx_kl = cd.monte_carlo_csiszar_f_divergence(
        f=cd.kl_reverse,
        p_log_prob=p.log_prob,
        q=q,
        num_draws=int(1e5),
        seed=1)

    approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
        f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
        p_log_prob=p.log_prob,
        q=q,
        num_draws=int(1e5),
        seed=1)

    exact_kl = kullback_leibler.kl_divergence(q, p)

    [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
        approx_kl, approx_kl_self_normalized, exact_kl])

    self.assertAllClose(approx_kl_, exact_kl_, rtol=0.07, atol=0.)
    self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                        rtol=0.02, atol=0.)
def test_kl_reverse(self):
  with self.test_session() as sess:

    q = normal_lib.Normal(
        loc=np.ones(6),
        scale=np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0]))

    p = normal_lib.Normal(loc=q.loc + 0.1, scale=q.scale - 0.2)

    approx_kl = cd.monte_carlo_csiszar_f_divergence(
        f=cd.kl_reverse,
        p=p,
        q=q,
        num_draws=int(1e5),
        seed=1)

    approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
        f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
        p=p,
        q=q,
        num_draws=int(1e5),
        seed=1)

    exact_kl = kullback_leibler.kl_divergence(q, p)

    [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
        approx_kl, approx_kl_self_normalized, exact_kl])

    self.assertAllClose(approx_kl_, exact_kl_, rtol=0.07, atol=0.)
    self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                        rtol=0.02, atol=0.)
def test_convergence_to_kl_using_sample_form_on_3dim_normal(self):
  # Test that the sample mean KL is the same as analytic when we use samples
  # to estimate every part of the KL divergence ratio.
  vector_shape = (2, 3)
  n_samples = 5000

  with self.test_session():
    q = mvn_diag_lib.MultivariateNormalDiag(
        loc=self._rng.rand(*vector_shape),
        scale_diag=self._rng.rand(*vector_shape))
    p = mvn_diag_lib.MultivariateNormalDiag(
        loc=self._rng.rand(*vector_shape),
        scale_diag=self._rng.rand(*vector_shape))

    # In this case, the log_ratio is the KL.
    sample_kl = -1 * entropy.elbo_ratio(
        log_p=p.log_prob,
        q=q,
        n=n_samples,
        form=entropy.ELBOForms.sample,
        seed=42)
    actual_kl = kullback_leibler_lib.kl_divergence(q, p)

    # Relative tolerance (rtol) chosen 2 times as large as minimum needed to
    # pass.
    self.assertEqual((2,), sample_kl.get_shape())
    self.assertAllClose(actual_kl.eval(), sample_kl.eval(), rtol=0.05)
def testGammaGammaKL(self):
  alpha0 = np.array([3.])
  beta0 = np.array([1., 2., 3., 1.5, 2.5, 3.5])
  alpha1 = np.array([0.4])
  beta1 = np.array([0.5, 1., 1.5, 2., 2.5, 3.])

  # Build graph.
  with self.test_session() as sess:
    g0 = gamma_lib.Gamma(concentration=alpha0, rate=beta0)
    g1 = gamma_lib.Gamma(concentration=alpha1, rate=beta1)
    x = g0.sample(int(1e4), seed=0)
    kl_sample = math_ops.reduce_mean(g0.log_prob(x) - g1.log_prob(x), 0)
    kl_actual = kullback_leibler.kl_divergence(g0, g1)

    # Execute graph.
    [kl_sample_, kl_actual_] = sess.run([kl_sample, kl_actual])

  kl_expected = ((alpha0 - alpha1) * special.digamma(alpha0)
                 + special.gammaln(alpha1)
                 - special.gammaln(alpha0)
                 + alpha1 * np.log(beta0)
                 - alpha1 * np.log(beta1)
                 + alpha0 * (beta1 / beta0 - 1.))

  self.assertEqual(beta0.shape, kl_actual.get_shape())
  self.assertAllClose(kl_expected, kl_actual_, atol=0., rtol=1e-6)
  self.assertAllClose(kl_sample_, kl_actual_, atol=0., rtol=1e-2)
def test_kl_reverse_multidim(self):
  with self.test_session() as sess:

    d = 5  # Dimension

    p = mvn_full_lib.MultivariateNormalFullCovariance(
        covariance_matrix=self._tridiag(d, diag_value=1, offdiag_value=0.5))

    q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[0.5] * d)

    approx_kl = cd.monte_carlo_csiszar_f_divergence(
        f=cd.kl_reverse,
        p=p,
        q=q,
        num_draws=int(1e5),
        seed=1)

    approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
        f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
        p=p,
        q=q,
        num_draws=int(1e5),
        seed=1)

    exact_kl = kullback_leibler.kl_divergence(q, p)

    [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
        approx_kl, approx_kl_self_normalized, exact_kl])

    self.assertAllClose(approx_kl_, exact_kl_, rtol=0.02, atol=0.)
    self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                        rtol=0.08, atol=0.)
def testDefaultVariationalAndPrior(self):
  _, prior, variational, _, log_likelihood = mini_vae()
  elbo = vi.elbo(log_likelihood)
  expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
      variational.distribution, prior)
  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    self.assertAllEqual(*sess.run([expected_elbo, elbo]))
def testKLIdentity(self):
  normal1 = normal_lib.Normal(
      loc=np.float32([-1., 1]),
      scale=np.float32([0.1, 0.5]))
  # This is functionally just a wrapper around normal1,
  # and doesn't change any outputs.
  ind1 = independent_lib.Independent(
      distribution=normal1, reinterpreted_batch_ndims=0)

  normal2 = normal_lib.Normal(
      loc=np.float32([-3., 3]),
      scale=np.float32([0.3, 0.3]))
  # This is functionally just a wrapper around normal2,
  # and doesn't change any outputs.
  ind2 = independent_lib.Independent(
      distribution=normal2, reinterpreted_batch_ndims=0)

  normal_kl = kullback_leibler.kl_divergence(normal1, normal2)
  ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
  self.assertAllClose(self.evaluate(normal_kl), self.evaluate(ind_kl))
def testKLScalarToMultivariate(self):
  normal1 = normal_lib.Normal(
      loc=np.float32([-1., 1]),
      scale=np.float32([0.1, 0.5]))
  ind1 = independent_lib.Independent(
      distribution=normal1, reinterpreted_batch_ndims=1)

  normal2 = normal_lib.Normal(
      loc=np.float32([-3., 3]),
      scale=np.float32([0.3, 0.3]))
  ind2 = independent_lib.Independent(
      distribution=normal2, reinterpreted_batch_ndims=1)

  normal_kl = kullback_leibler.kl_divergence(normal1, normal2)
  ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
  self.assertAllClose(
      self.evaluate(math_ops.reduce_sum(normal_kl, axis=-1)),
      self.evaluate(ind_kl))
def __init__(
    self,
    units,
    activation=None,
    activity_regularizer=None,
    trainable=True,
    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
    kernel_posterior_tensor_fn=lambda d: d.sample(),
    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
        is_singular=True),
    bias_posterior_tensor_fn=lambda d: d.sample(),
    bias_prior_fn=None,
    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    seed=None,
    name=None,
    **kwargs):
  # pylint: disable=g-doc-args
  """Construct layer.

  Args:
    @{args}
  """
  # pylint: enable=g-doc-args
  super(DenseFlipout, self).__init__(
      units=units,
      activation=activation,
      activity_regularizer=activity_regularizer,
      trainable=trainable,
      kernel_posterior_fn=kernel_posterior_fn,
      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
      kernel_prior_fn=kernel_prior_fn,
      kernel_divergence_fn=kernel_divergence_fn,
      bias_posterior_fn=bias_posterior_fn,
      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
      bias_prior_fn=bias_prior_fn,
      bias_divergence_fn=bias_divergence_fn,
      name=name,
      **kwargs)
  self.seed = seed
def testExplicitVariationalAndPrior(self):
  with self.test_session() as sess:
    _, _, variational, _, log_likelihood = mini_vae()
    prior = normal.Normal(loc=3., scale=2.)
    elbo = vi.elbo(
        log_likelihood, variational_with_prior={variational: prior})
    expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
        variational.distribution, prior)
    sess.run(variables.global_variables_initializer())
    self.assertAllEqual(*sess.run([expected_elbo, elbo]))
def __init__(
    self,
    units,
    activation=None,
    activity_regularizer=None,
    trainable=True,
    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
    kernel_posterior_tensor_fn=lambda d: d.sample(),
    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
        is_singular=True),  # pylint: disable=line-too-long
    bias_posterior_tensor_fn=lambda d: d.sample(),
    bias_prior_fn=None,
    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    name=None,
    **kwargs):
  # pylint: disable=g-doc-args
  """Construct layer.

  Args:
    @{args}
  """
  # pylint: enable=g-doc-args
  super(_DenseVariational, self).__init__(
      trainable=trainable,
      name=name,
      activity_regularizer=activity_regularizer,
      **kwargs)
  self.units = units
  self.activation = activation
  self.input_spec = layers_lib.InputSpec(min_ndim=2)
  self.kernel_posterior_fn = kernel_posterior_fn
  self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn
  self.kernel_prior_fn = kernel_prior_fn
  self.kernel_divergence_fn = kernel_divergence_fn
  self.bias_posterior_fn = bias_posterior_fn
  self.bias_posterior_tensor_fn = bias_posterior_tensor_fn
  self.bias_prior_fn = bias_prior_fn
  self.bias_divergence_fn = bias_divergence_fn
def testDomainErrorExceptions(self):

  class MyDistException(normal.Normal):
    pass

  # Register KL to a function that returns NaN, so the NaN check triggers.
  @kullback_leibler.RegisterKL(MyDistException, MyDistException)
  # pylint: disable=unused-argument,unused-variable
  def _kl(a, b, name=None):
    return array_ops.identity([float("nan")])

  # pylint: disable=unused-argument,unused-variable

  with self.test_session():
    a = MyDistException(loc=0.0, scale=1.0)
    kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
    with self.assertRaisesOpError(
        "KL calculation between .* and .* returned NaN values"):
      kl.eval()
    kl_ok = kullback_leibler.kl_divergence(a, a)
    self.assertAllEqual([float("nan")], kl_ok.eval())
def testRegistration(self):

  class MyDist(normal.Normal):
    pass

  # Register KL to a lambda that spits out the name parameter
  @kullback_leibler.RegisterKL(MyDist, MyDist)
  def _kl(a, b, name=None):  # pylint: disable=unused-argument,unused-variable
    return name

  a = MyDist(loc=0.0, scale=1.0)
  self.assertEqual("OK", kullback_leibler.kl_divergence(a, a, name="OK"))
def testKLMultivariateToMultivariate(self):
  # (1, 1, 2) batch of MVNDiag
  mvn1 = mvn_diag_lib.MultivariateNormalDiag(
      loc=np.float32([[[[-1., 1, 3.], [2., 4., 3.]]]]),
      scale_diag=np.float32([[[0.2, 0.1, 5.], [2., 3., 4.]]]))
  ind1 = independent_lib.Independent(
      distribution=mvn1, reinterpreted_batch_ndims=2)

  # (1, 1, 2) batch of MVNDiag
  mvn2 = mvn_diag_lib.MultivariateNormalDiag(
      loc=np.float32([[[[-2., 3, 2.], [1., 3., 2.]]]]),
      scale_diag=np.float32([[[0.1, 0.5, 3.], [1., 2., 1.]]]))
  ind2 = independent_lib.Independent(
      distribution=mvn2, reinterpreted_batch_ndims=2)

  mvn_kl = kullback_leibler.kl_divergence(mvn1, mvn2)
  ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
  self.assertAllClose(
      self.evaluate(math_ops.reduce_sum(mvn_kl, axis=[-1, -2])),
      self.evaluate(ind_kl))
def testCategoricalCategoricalKL(self):

  def np_softmax(logits):
    exp_logits = np.exp(logits)
    return exp_logits / exp_logits.sum(axis=-1, keepdims=True)

  with self.test_session() as sess:
    for categories in [2, 10]:
      for batch_size in [1, 2]:
        p_logits = self._rng.random_sample((batch_size, categories))
        q_logits = self._rng.random_sample((batch_size, categories))
        p = onehot_categorical.OneHotCategorical(logits=p_logits)
        q = onehot_categorical.OneHotCategorical(logits=q_logits)
        prob_p = np_softmax(p_logits)
        prob_q = np_softmax(q_logits)
        kl_expected = np.sum(prob_p * (np.log(prob_p) - np.log(prob_q)),
                             axis=-1)

        kl_actual = kullback_leibler.kl_divergence(p, q)
        kl_same = kullback_leibler.kl_divergence(p, p)
        x = p.sample(int(2e4), seed=0)
        x = math_ops.cast(x, dtype=dtypes.float32)
        # Compute empirical KL(p||q).
        kl_sample = math_ops.reduce_mean(p.log_prob(x) - q.log_prob(x), 0)

        [kl_sample_, kl_actual_,
         kl_same_] = sess.run([kl_sample, kl_actual, kl_same])

        self.assertEqual(kl_actual.get_shape(), (batch_size,))
        self.assertAllClose(kl_same_, np.zeros_like(kl_expected))
        self.assertAllClose(kl_actual_, kl_expected, atol=0., rtol=1e-6)
        self.assertAllClose(kl_sample_, kl_expected, atol=1e-2, rtol=0.)
def testIndirectRegistration(self):

  class Sub1(normal.Normal):
    pass

  class Sub2(normal.Normal):
    pass

  class Sub11(Sub1):
    pass

  # pylint: disable=unused-argument,unused-variable
  @kullback_leibler.RegisterKL(Sub1, Sub1)
  def _kl11(a, b, name=None):
    return "sub1-1"

  @kullback_leibler.RegisterKL(Sub1, Sub2)
  def _kl12(a, b, name=None):
    return "sub1-2"

  @kullback_leibler.RegisterKL(Sub2, Sub1)
  def _kl21(a, b, name=None):
    return "sub2-1"

  # pylint: enable=unused-argument,unused_variable

  sub1 = Sub1(loc=0.0, scale=1.0)
  sub2 = Sub2(loc=0.0, scale=1.0)
  sub11 = Sub11(loc=0.0, scale=1.0)

  self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub1))
  self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub1, sub2))
  self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub1))
  self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub11))
  self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
  self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
  self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
  self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
  self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub11))
  self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub11))
def __init__(
    self,
    units,
    activation=None,
    activity_regularizer=None,
    trainable=True,
    kernel_use_local_reparameterization=True,
    kernel_posterior_fn=default_mean_field_normal_fn(),
    kernel_posterior_tensor_fn=lambda d: d.sample(),
    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
    bias_posterior_tensor_fn=lambda d: d.sample(),
    bias_prior_fn=None,
    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    name=None,
    **kwargs):
  super(DenseVariational, self).__init__(
      trainable=trainable,
      name=name,
      activity_regularizer=activity_regularizer,
      **kwargs)
  self._units = units
  self._activation = activation
  self._input_spec = layers_lib.InputSpec(min_ndim=2)
  self._kernel_use_local_reparameterization = (
      kernel_use_local_reparameterization)
  self._kernel = VariationalKernelParameter(
      kernel_posterior_fn,
      kernel_posterior_tensor_fn,
      kernel_prior_fn,
      kernel_divergence_fn)
  self._bias = VariationalParameter(
      bias_posterior_fn,
      bias_posterior_tensor_fn,
      bias_prior_fn,
      bias_divergence_fn)
def testBetaBetaKL(self):
  with self.test_session() as sess:
    for shape in [(10,), (4, 5)]:
      a1 = 6.0 * np.random.random(size=shape) + 1e-4
      b1 = 6.0 * np.random.random(size=shape) + 1e-4
      a2 = 6.0 * np.random.random(size=shape) + 1e-4
      b2 = 6.0 * np.random.random(size=shape) + 1e-4
      # Take inverse softplus of values to test BetaWithSoftplusConcentration
      a1_sp = np.log(np.exp(a1) - 1.0)
      b1_sp = np.log(np.exp(b1) - 1.0)
      a2_sp = np.log(np.exp(a2) - 1.0)
      b2_sp = np.log(np.exp(b2) - 1.0)

      d1 = beta_lib.Beta(concentration1=a1, concentration0=b1)
      d2 = beta_lib.Beta(concentration1=a2, concentration0=b2)
      d1_sp = beta_lib.BetaWithSoftplusConcentration(
          concentration1=a1_sp, concentration0=b1_sp)
      d2_sp = beta_lib.BetaWithSoftplusConcentration(
          concentration1=a2_sp, concentration0=b2_sp)

      if not special:
        return
      kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) +
                     (a1 - a2) * special.digamma(a1) +
                     (b1 - b2) * special.digamma(b1) +
                     (a2 - a1 + b2 - b1) * special.digamma(a1 + b1))

      for dist1 in [d1, d1_sp]:
        for dist2 in [d2, d2_sp]:
          kl = kullback_leibler.kl_divergence(dist1, dist2)
          kl_val = sess.run(kl)
          self.assertEqual(kl.get_shape(), shape)
          self.assertAllClose(kl_val, kl_expected)

      # Make sure KL(d1||d1) is 0
      kl_same = sess.run(kullback_leibler.kl_divergence(d1, d1))
      self.assertAllClose(kl_same, np.zeros_like(kl_expected))
def testBernoulliBernoulliKL(self):
  batch_size = 6
  a_p = np.array([0.5] * batch_size, dtype=np.float32)
  b_p = np.array([0.4] * batch_size, dtype=np.float32)

  a = bernoulli.Bernoulli(probs=a_p)
  b = bernoulli.Bernoulli(probs=b_p)

  kl = kullback_leibler.kl_divergence(a, b)
  kl_val = self.evaluate(kl)

  kl_expected = (a_p * np.log(a_p / b_p) +
                 (1. - a_p) * np.log((1. - a_p) / (1. - b_p)))

  self.assertEqual(kl.get_shape(), (batch_size,))
  self.assertAllClose(kl_val, kl_expected)
def test_docstring_example_normal(self):
  with self.cached_session() as sess:
    num_draws = int(1e5)
    mu_p = constant_op.constant(0.)
    mu_q = constant_op.constant(1.)
    p = normal_lib.Normal(loc=mu_p, scale=1.)
    q = normal_lib.Normal(loc=mu_q, scale=2.)
    exact_kl_normal_normal = kullback_leibler.kl_divergence(p, q)
    approx_kl_normal_normal = monte_carlo_lib.expectation(
        f=lambda x: p.log_prob(x) - q.log_prob(x),
        samples=p.sample(num_draws, seed=42),
        log_prob=p.log_prob,
        use_reparametrization=(p.reparameterization_type ==
                               distribution_lib.FULLY_REPARAMETERIZED))
    [exact_kl_normal_normal_, approx_kl_normal_normal_] = sess.run(
        [exact_kl_normal_normal, approx_kl_normal_normal])
    self.assertEqual(
        True,
        p.reparameterization_type == distribution_lib.FULLY_REPARAMETERIZED)
    self.assertAllClose(exact_kl_normal_normal_, approx_kl_normal_normal_,
                        rtol=0.01, atol=0.)

    # Compare gradients. (Not present in `docstring`.)
    gradp = lambda fp: gradients_impl.gradients(fp, mu_p)[0]
    gradq = lambda fq: gradients_impl.gradients(fq, mu_q)[0]
    [
        gradp_exact_kl_normal_normal_,
        gradq_exact_kl_normal_normal_,
        gradp_approx_kl_normal_normal_,
        gradq_approx_kl_normal_normal_,
    ] = sess.run([
        gradp(exact_kl_normal_normal),
        gradq(exact_kl_normal_normal),
        gradp(approx_kl_normal_normal),
        gradq(approx_kl_normal_normal),
    ])
    self.assertAllClose(gradp_exact_kl_normal_normal_,
                        gradp_approx_kl_normal_normal_,
                        rtol=0.01, atol=0.)
    self.assertAllClose(gradq_exact_kl_normal_normal_,
                        gradq_approx_kl_normal_normal_,
                        rtol=0.01, atol=0.)
def testNormalNormalKL(self):
  batch_size = 6
  mu_a = np.array([3.0] * batch_size)
  sigma_a = np.array([1.0, 2.0, 3.0, 1.5, 2.5, 3.5])
  mu_b = np.array([-3.0] * batch_size)
  sigma_b = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0])

  n_a = normal_lib.Normal(loc=mu_a, scale=sigma_a)
  n_b = normal_lib.Normal(loc=mu_b, scale=sigma_b)

  kl = kullback_leibler.kl_divergence(n_a, n_b)
  kl_val = self.evaluate(kl)

  kl_expected = ((mu_a - mu_b)**2 / (2 * sigma_b**2) + 0.5 * (
      (sigma_a**2 / sigma_b**2) - 1 - 2 * np.log(sigma_a / sigma_b)))

  self.assertEqual(kl.get_shape(), (batch_size,))
  self.assertAllClose(kl_val, kl_expected)
def test_docstring_example_gamma(self):
  with self.test_session() as sess:
    num_draws = int(1e5)
    concentration_p = constant_op.constant(1.)
    concentration_q = constant_op.constant(2.)
    p = gamma_lib.Gamma(concentration=concentration_p, rate=1.)
    q = gamma_lib.Gamma(concentration=concentration_q, rate=3.)
    approx_kl_gamma_gamma = monte_carlo_lib.expectation(
        f=lambda x: p.log_prob(x) - q.log_prob(x),
        samples=p.sample(num_draws, seed=42),
        log_prob=p.log_prob,
        use_reparametrization=(p.reparameterization_type ==
                               distribution_lib.FULLY_REPARAMETERIZED))
    exact_kl_gamma_gamma = kullback_leibler.kl_divergence(p, q)
    [exact_kl_gamma_gamma_, approx_kl_gamma_gamma_] = sess.run([
        exact_kl_gamma_gamma, approx_kl_gamma_gamma])
    self.assertEqual(
        False,
        p.reparameterization_type == distribution_lib.FULLY_REPARAMETERIZED)
    self.assertAllClose(exact_kl_gamma_gamma_, approx_kl_gamma_gamma_,
                        rtol=0.01, atol=0.)

    # Compare gradients. (Not present in `docstring`.)
    gradp = lambda fp: gradients_impl.gradients(fp, concentration_p)[0]
    gradq = lambda fq: gradients_impl.gradients(fq, concentration_q)[0]
    [
        gradp_exact_kl_gamma_gamma_,
        gradq_exact_kl_gamma_gamma_,
        gradp_approx_kl_gamma_gamma_,
        gradq_approx_kl_gamma_gamma_,
    ] = sess.run([
        gradp(exact_kl_gamma_gamma),
        gradq(exact_kl_gamma_gamma),
        gradp(approx_kl_gamma_gamma),
        gradq(approx_kl_gamma_gamma),
    ])
    # Notice that variance (i.e., `rtol`) is higher when using score-trick.
    self.assertAllClose(gradp_exact_kl_gamma_gamma_,
                        gradp_approx_kl_gamma_gamma_,
                        rtol=0.05, atol=0.)
    self.assertAllClose(gradq_exact_kl_gamma_gamma_,
                        gradq_approx_kl_gamma_gamma_,
                        rtol=0.03, atol=0.)
def test_docstring_example_normal(self):
  with self.test_session() as sess:
    num_draws = int(1e5)
    mu_p = constant_op.constant(0.)
    mu_q = constant_op.constant(1.)
    p = normal_lib.Normal(loc=mu_p, scale=1.)
    q = normal_lib.Normal(loc=mu_q, scale=2.)
    exact_kl_normal_normal = kullback_leibler.kl_divergence(p, q)
    approx_kl_normal_normal = monte_carlo_lib.expectation(
        f=lambda x: p.log_prob(x) - q.log_prob(x),
        samples=p.sample(num_draws, seed=42),
        log_prob=p.log_prob,
        use_reparametrization=(p.reparameterization_type ==
                               distribution_lib.FULLY_REPARAMETERIZED))
    [exact_kl_normal_normal_, approx_kl_normal_normal_] = sess.run([
        exact_kl_normal_normal, approx_kl_normal_normal])
    self.assertEqual(
        True,
        p.reparameterization_type == distribution_lib.FULLY_REPARAMETERIZED)
    self.assertAllClose(exact_kl_normal_normal_, approx_kl_normal_normal_,
                        rtol=0.01, atol=0.)

    # Compare gradients. (Not present in `docstring`.)
    gradp = lambda fp: gradients_impl.gradients(fp, mu_p)[0]
    gradq = lambda fq: gradients_impl.gradients(fq, mu_q)[0]
    [
        gradp_exact_kl_normal_normal_,
        gradq_exact_kl_normal_normal_,
        gradp_approx_kl_normal_normal_,
        gradq_approx_kl_normal_normal_,
    ] = sess.run([
        gradp(exact_kl_normal_normal),
        gradq(exact_kl_normal_normal),
        gradp(approx_kl_normal_normal),
        gradq(approx_kl_normal_normal),
    ])
    self.assertAllClose(gradp_exact_kl_normal_normal_,
                        gradp_approx_kl_normal_normal_,
                        rtol=0.01, atol=0.)
    self.assertAllClose(gradq_exact_kl_normal_normal_,
                        gradq_approx_kl_normal_normal_,
                        rtol=0.01, atol=0.)
def test_kl_forward_multidim(self):
  with self.test_session() as sess:

    d = 5  # Dimension

    p = mvn_full_lib.MultivariateNormalFullCovariance(
        covariance_matrix=self._tridiag(d, diag_value=1, offdiag_value=0.5))

    # Variance is very high when approximating Forward KL, so we make
    # scale_diag larger than in test_kl_reverse_multidim. This ensures q
    # "covers" p and thus Var_q[p/q] is smaller.
    q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[1.] * d)

    approx_kl = cd.monte_carlo_csiszar_f_divergence(
        f=cd.kl_forward,
        p=p,
        q=q,
        num_draws=int(1e5),
        seed=1)

    approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
        f=lambda logu: cd.kl_forward(logu, self_normalized=True),
        p=p,
        q=q,
        num_draws=int(1e5),
        seed=1)

    exact_kl = kullback_leibler.kl_divergence(p, q)

    [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
        approx_kl, approx_kl_self_normalized, exact_kl])

    self.assertAllClose(approx_kl_, exact_kl_, rtol=0.06, atol=0.)
    self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                        rtol=0.05, atol=0.)
def _elbo(form, log_likelihood, log_joint, variational_with_prior,
          keep_batch_dim):
  """Internal implementation of ELBO. Users should use `elbo`.

  Args:
    form: ELBOForms constant. Controls how the ELBO is computed.
    log_likelihood: `Tensor` log p(x|Z).
    log_joint: `Tensor` log p(x, Z).
    variational_with_prior: `dict<StochasticTensor, Distribution>`, mapping
      variational distributions to prior distributions.
    keep_batch_dim: bool. Whether to keep the batch dimension when reducing
      the entropy/KL.

  Returns:
    ELBO `Tensor` with same shape and dtype as `log_likelihood`/`log_joint`.
  """
  ELBOForms.check_form(form)

  # Order of preference
  # 1. Analytic KL: log_likelihood - KL(q||p)
  # 2. Analytic entropy: log_likelihood + log p(Z) + H[q], or log_joint + H[q]
  # 3. Sample: log_likelihood - (log q(Z) - log p(Z)) =
  #    log_likelihood + log p(Z) - log q(Z), or log_joint - log q(Z)

  def _reduce(val):
    if keep_batch_dim:
      return val
    else:
      return math_ops.reduce_sum(val)

  kl_terms = []
  entropy_terms = []
  prior_terms = []
  for q, z, p in [(qz.distribution, qz.value(), pz)
                  for qz, pz in variational_with_prior.items()]:
    # Analytic KL
    kl = None
    if log_joint is None and form in {ELBOForms.default,
                                      ELBOForms.analytic_kl}:
      try:
        kl = kullback_leibler.kl_divergence(q, p)
        logging.info("Using analytic KL between q:%s, p:%s", q, p)
      except NotImplementedError as e:
        if form == ELBOForms.analytic_kl:
          raise e
    if kl is not None:
      kl_terms.append(-1. * _reduce(kl))
      continue

    # Analytic entropy
    entropy = None
    if form in {ELBOForms.default, ELBOForms.analytic_entropy}:
      try:
        entropy = q.entropy()
        logging.info("Using analytic entropy for q:%s", q)
      except NotImplementedError as e:
        if form == ELBOForms.analytic_entropy:
          raise e
    if entropy is not None:
      entropy_terms.append(_reduce(entropy))
      if log_likelihood is not None:
        prior = p.log_prob(z)
        prior_terms.append(_reduce(prior))
      continue

    # Sample
    if form in {ELBOForms.default, ELBOForms.sample}:
      entropy = -q.log_prob(z)
      entropy_terms.append(_reduce(entropy))
      if log_likelihood is not None:
        prior = p.log_prob(z)
        prior_terms.append(_reduce(prior))

  first_term = log_joint if log_joint is not None else log_likelihood
  return sum([first_term] + kl_terms + entropy_terms + prior_terms)
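# Editorial sketch (not part of the library): the ELBO forms handled by
# `_elbo` above agree in expectation. For a toy case with scalar Gaussians
# q(z) = N(m, s) and p(z) = N(0, 1), the analytic-KL penalty and the sampled
# `log q(z) - log p(z)` penalty are compared below in plain NumPy; subtracting
# either from the same expected log-likelihood gives the same ELBO. Names and
# constants here are illustrative only.
def _sketch_elbo_forms(m=0.3, s=0.8, num_samples=200000, seed=0):
  import numpy as np

  rng = np.random.RandomState(seed)
  z = rng.normal(loc=m, scale=s, size=num_samples)

  def log_normal_pdf(x, loc, scale):
    return (-0.5 * np.log(2 * np.pi * scale**2)
            - (x - loc)**2 / (2 * scale**2))

  # Analytic KL(q || p) for two Gaussians (the "analytic_kl" form).
  analytic_kl = np.log(1. / s) + (s**2 + m**2) / 2. - 0.5
  # Monte Carlo estimate of E_q[log q(z) - log p(z)] (the "sample" form).
  sampled_kl = np.mean(log_normal_pdf(z, m, s) - log_normal_pdf(z, 0., 1.))
  return analytic_kl, sampled_kl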
def dense_flipout(
    inputs,
    units,
    activation=None,
    activity_regularizer=None,
    trainable=True,
    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
    kernel_posterior_tensor_fn=lambda d: d.sample(),
    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
        is_singular=True),
    bias_posterior_tensor_fn=lambda d: d.sample(),
    bias_prior_fn=None,
    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    seed=None,
    name=None,
    reuse=None):
  # pylint: disable=g-doc-args
  """Densely-connected layer with Flipout estimator.

  This layer implements the Bayesian variational inference analogue to a dense
  layer by assuming the `kernel` and/or the `bias` are drawn from
  distributions. By default, the layer implements a stochastic forward pass
  via sampling from the kernel and bias posteriors,

  ```none
  kernel, bias ~ posterior
  outputs = activation(matmul(inputs, kernel) + bias)
  ```

  It uses the Flipout estimator [1], which performs a Monte Carlo
  approximation of the distribution integrating over the `kernel` and `bias`.
  Flipout uses roughly twice as many floating point operations as the
  reparameterization estimator but has the advantage of significantly lower
  variance.

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  distributions.

  Args:
    inputs: Tensor input.
    @{args}

  Returns:
    output: `Tensor` representing the affine transformed input under a random
      draw from the surrogate posterior distribution.

  #### Examples

  We illustrate a Bayesian neural network with [variational inference](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
  assuming a dataset of `features` and `labels`.

  ```python
  tfp = tf.contrib.bayesflow

  net = tfp.layers.dense_flipout(
      features, 512, activation=tf.nn.relu)
  logits = tfp.layers.dense_flipout(net, 10)
  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
      labels=labels, logits=logits)
  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
  loss = neg_log_likelihood + kl
  train_op = tf.train.AdamOptimizer().minimize(loss)
  ```

  It uses the Flipout gradient estimator to minimize the Kullback-Leibler
  divergence up to a constant, also known as the negative Evidence Lower
  Bound. It consists of the sum of two terms: the expected negative
  log-likelihood, which we approximate via Monte Carlo; and the KL divergence,
  which is added via regularizer terms which are arguments to the layer.

  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
       Mini-Batches." Anonymous. OpenReview, 2017.
       https://openreview.net/forum?id=rJnpifWAb
  """
  # pylint: enable=g-doc-args
  layer = DenseFlipout(
      units,
      activation=activation,
      activity_regularizer=activity_regularizer,
      trainable=trainable,
      kernel_posterior_fn=kernel_posterior_fn,
      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
      kernel_prior_fn=kernel_prior_fn,
      kernel_divergence_fn=kernel_divergence_fn,
      bias_posterior_fn=bias_posterior_fn,
      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
      bias_prior_fn=bias_prior_fn,
      bias_divergence_fn=bias_divergence_fn,
      seed=seed,
      name=name,
      dtype=inputs.dtype.base_dtype,
      _scope=name,
      _reuse=reuse)
  return layer.apply(inputs)
def _kl_divergence(self, other):
  return kullback_leibler.kl_divergence(
      self, other, allow_nan_stats=self.allow_nan_stats)
def _build_loss(self, results, features, labels):
  """Creates the loss operation

  Returns:
    tuple `(losses, loss)`:
      `losses` are the per-batch losses.
      `loss` is a single scalar tensor to minimize.
  """
  action = labels['action']
  discount_reward = labels['discount_reward']
  dist_values = labels['dist_values']
  tangents = labels.get('tangents')
  theta = labels.get('theta')

  old_distribution = self._build_distribution(values=dist_values)

  log_probs = self._graph_results.distribution.log_prob(action)
  old_log_probs = old_distribution.log_prob(action)

  self._losses = tf.multiply(x=tf.exp(log_probs - old_log_probs),
                             y=discount_reward)
  self._surrogate_loss = -tf.reduce_mean(  # pylint: disable=invalid-unary-operand-type
      self._losses, axis=0, name='surrogate_loss')

  entropy = self._graph_results.distribution.entropy()
  self._entropy_loss = tf.reduce_mean(entropy, name='entropy_loss')

  kl_divergence_value = kl_divergence(self._graph_results.distribution,
                                      old_distribution)
  self._kl_loss = tf.reduce_mean(kl_divergence_value, name='kl_loss')

  if self.is_continuous:
    dist_values_fixed = tf.stop_gradient(
        tf.concat(values=[
            self._graph_results.distribution.loc,
            self._graph_results.distribution.scale
        ], axis=0))
  else:
    dist_values_fixed = tf.stop_gradient(
        self._graph_results.distribution.logits)

  distribution_1_fixed = self._build_distribution(values=dist_values_fixed)
  kl_divergence_1_fixed = kl_divergence(distribution_1_fixed,
                                        self._graph_results.distribution)
  self._kl_loss_1_fixed = tf.reduce_mean(kl_divergence_1_fixed,
                                         name='kl_loss_1_fixed')

  variables = list(tf.trainable_variables())
  self._loss = self._surrogate_loss
  self._grads_and_vars, self._policy_gradient = self.get_vars_grads(
      [self._surrogate_loss], variables)

  offset = 0
  list_tangents = []
  list_assigns = []
  for variable in variables:
    shape = get_shape(variable)
    size = np.prod(shape)
    list_tangents.append(
        tf.reshape(tangents[offset:offset + size], shape))
    list_assigns.append(
        tf.assign(variable,
                  tf.reshape(theta[offset:offset + size], shape)))
    offset += size

  gradients = tf.gradients(self._kl_loss_1_fixed, variables)
  gradient_vector_product = [
      tf.reduce_sum(g * t) for (g, t) in zip(gradients, list_tangents)
  ]
  _, self._fisher_vector_product = self.get_vars_grads(
      gradient_vector_product, variables)

  self._set_theta = tf.group(*list_assigns)
  self._get_theta = tf.concat(
      axis=0,
      values=[tf.reshape(variable, (-1,)) for variable in variables])

  return self._losses, self._loss
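# Editorial sketch (not from the source above): `_build_loss` wires up a
# KL-based Fisher-vector product and a policy gradient; TRPO-style solvers
# typically combine them by running conjugate gradient to approximately solve
# F x = g using only matrix-vector products, never materializing F. The
# function below is a generic NumPy conjugate-gradient loop under that
# assumption; `fvp` is any callable computing F @ v, and the names are
# illustrative only.
def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
  """Approximately solves F x = g given only a matrix-vector product fvp."""
  import numpy as np

  x = np.zeros_like(g)
  r = g.copy()  # Residual g - F x (x starts at zero).
  p = g.copy()  # Search direction.
  r_dot_r = r.dot(r)
  for _ in range(iters):
    fvp_p = fvp(p)
    alpha = r_dot_r / p.dot(fvp_p)
    x += alpha * p
    r -= alpha * fvp_p
    new_r_dot_r = r.dot(r)
    if new_r_dot_r < tol:
      break
    p = r + (new_r_dot_r / r_dot_r) * p
    r_dot_r = new_r_dot_r
  return x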
def test_score_trick(self):
  with self.test_session() as sess:
    d = 5  # Dimension
    num_draws = int(1e5)
    seed = 1

    p = mvn_full_lib.MultivariateNormalFullCovariance(
        covariance_matrix=self._tridiag(d, diag_value=1, offdiag_value=0.5))

    # Variance is very high when approximating Forward KL, so we make
    # scale_diag larger than in test_kl_reverse_multidim. This ensures q
    # "covers" p and thus Var_q[p/q] is smaller.
    s = array_ops.constant(1.)
    q = mvn_diag_lib.MultivariateNormalDiag(
        scale_diag=array_ops.tile([s], [d]))

    approx_kl = cd.monte_carlo_csiszar_f_divergence(
        f=cd.kl_reverse,
        p=p,
        q=q,
        num_draws=num_draws,
        seed=seed)

    approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
        f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
        p=p,
        q=q,
        num_draws=num_draws,
        seed=seed)

    approx_kl_score_trick = cd.monte_carlo_csiszar_f_divergence(
        f=cd.kl_reverse,
        p=p,
        q=q,
        num_draws=num_draws,
        use_reparametrization=False,
        seed=seed)

    approx_kl_self_normalized_score_trick = (
        cd.monte_carlo_csiszar_f_divergence(
            f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
            p=p,
            q=q,
            num_draws=num_draws,
            use_reparametrization=False,
            seed=seed))

    exact_kl = kullback_leibler.kl_divergence(q, p)

    grad = lambda fs: gradients_impl.gradients(fs, s)[0]

    [
        approx_kl_,
        approx_kl_self_normalized_,
        approx_kl_score_trick_,
        approx_kl_self_normalized_score_trick_,
        exact_kl_,
    ] = sess.run([
        grad(approx_kl),
        grad(approx_kl_self_normalized),
        grad(approx_kl_score_trick),
        grad(approx_kl_self_normalized_score_trick),
        grad(exact_kl),
    ])

    self.assertAllClose(approx_kl_, exact_kl_, rtol=0.06, atol=0.)
    self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                        rtol=0.05, atol=0.)
    self.assertAllClose(approx_kl_score_trick_, exact_kl_, rtol=0.06, atol=0.)
    self.assertAllClose(approx_kl_self_normalized_score_trick_, exact_kl_,
                        rtol=0.05, atol=0.)
def dense_variational(
    inputs,
    units,
    activation=None,
    activity_regularizer=None,
    trainable=True,
    kernel_use_local_reparameterization=True,
    kernel_posterior_fn=default_mean_field_normal_fn(),
    kernel_posterior_tensor_fn=lambda d: d.sample(),
    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
    bias_posterior_tensor_fn=lambda d: d.sample(),
    bias_prior_fn=None,
    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    name=None,
    reuse=None):
  """Densely-connected variational layer.

  This layer implements the Bayesian variational inference analogue to:
  `outputs = activation(matmul(inputs, kernel) + bias)`
  by assuming the `kernel` and/or the `bias` are random variables.

  The layer implements a stochastic dense calculation by making a Monte Carlo
  approximation of a [variational Bayesian method based on KL divergence](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,

  ```none
  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]      # Jensen's
              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
             ~= m**-1 sum{ -log p(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
                 + KL[q(W|x), p(W)]
  ```

  where `W` denotes the (independent) `kernel` and `bias` random variables,
  `w` is a random variate or outcome of `W`, `y` is the label, `x` is the
  evidence, and `~=` denotes an approximation which becomes exact as `m->inf`.
  The above bound is sometimes referred to as the negative Evidence Lower
  BOund or negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a
  DNN, this layer is appropriate to use when the final loss is a negative
  log-likelihood.

  The Monte-Carlo sum portion is used for the feed-forward calculation of the
  DNN. The KL divergence portion can be added to the final loss via:
  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  random variables (which together comprise `W`).

  Args:
    inputs: Tensor input.
    units: Integer or Long, dimensionality of the output space.
    activation: Activation function (`callable`). Set it to None to maintain a
      linear activation.
    activity_regularizer: Regularizer function for the output.
    trainable: Boolean, if `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    kernel_use_local_reparameterization: Python `bool` indicating whether
      `kernel` calculation should employ the Local Reparameterization Trick.
      When `True`, `kernel_posterior_fn` must create an instance of
      `tf.distributions.Normal`.
    kernel_posterior_fn: Python `callable` which creates
      `tf.distributions.Distribution` instance representing the surrogate
      posterior of the `kernel` parameter. Default value:
      `default_mean_field_normal_fn()`.
    kernel_posterior_tensor_fn: Python `callable` which takes a
      `tf.distributions.Distribution` instance and returns a representative
      value. Default value: `lambda d: d.sample()`.
    kernel_prior_fn: Python `callable` which creates `tf.distributions`
      instance. See `default_mean_field_normal_fn` docstring for required
      parameter signature. Default value:
      `tf.distributions.Normal(loc=0., scale=1.)`.
    kernel_divergence_fn: Python `callable` which takes the surrogate
      posterior distribution, prior distribution and random variate sample(s)
      from the surrogate posterior and computes or approximates the KL
      divergence. The distributions are `tf.distributions.Distribution`-like
      instances and the sample is a `Tensor`.
    bias_posterior_fn: Python `callable` which creates
      `tf.distributions.Distribution` instance representing the surrogate
      posterior of the `bias` parameter. Default value:
      `default_mean_field_normal_fn(is_singular=True)` (which creates an
      instance of `tf.distributions.Deterministic`).
    bias_posterior_tensor_fn: Python `callable` which takes a
      `tf.distributions.Distribution` instance and returns a representative
      value. Default value: `lambda d: d.sample()`.
    bias_prior_fn: Python `callable` which creates `tf.distributions`
      instance. See `default_mean_field_normal_fn` docstring for required
      parameter signature. Default value: `None` (no prior, no variational
      inference)
    bias_divergence_fn: Python `callable` which takes the surrogate posterior
      distribution, prior distribution and random variate sample(s) from the
      surrogate posterior and computes or approximates the KL divergence. The
      distributions are `tf.distributions.Distribution`-like instances and
      the sample is a `Tensor`.
    name: Python `str`, the name of the layer. Layers with the same name will
      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
      such cases.
    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
      layer by the same name.

  Returns:
    output: `Tensor` representing the affine transformed input under a random
      draw from the surrogate posterior distribution.
  """
  layer = DenseVariational(
      units,
      activation=activation,
      activity_regularizer=activity_regularizer,
      trainable=trainable,
      kernel_use_local_reparameterization=(
          kernel_use_local_reparameterization),
      kernel_posterior_fn=kernel_posterior_fn,
      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
      kernel_prior_fn=kernel_prior_fn,
      kernel_divergence_fn=kernel_divergence_fn,
      bias_posterior_fn=bias_posterior_fn,
      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
      bias_prior_fn=bias_prior_fn,
      bias_divergence_fn=bias_divergence_fn,
      name=name,
      dtype=inputs.dtype.base_dtype,
      _scope=name,
      _reuse=reuse)
  return layer.apply(inputs)
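# Editorial usage sketch for `dense_variational`, modeled on the
# `dense_flipout` docstring example above and on the docstring note that the
# KL terms land in `tf.GraphKeys.REGULARIZATION_LOSSES`. The placeholder
# shapes and optimizer choice are assumptions for illustration, not the
# library's canonical example.
def _sketch_dense_variational_usage():
  import tensorflow as tf

  features = tf.placeholder(tf.float32, shape=[None, 784])
  labels = tf.placeholder(tf.float32, shape=[None, 10])

  net = dense_variational(features, 512, activation=tf.nn.relu)
  logits = dense_variational(net, 10)

  # Expected negative log-likelihood, approximated by the stochastic forward
  # pass, plus the KL penalty collected by the layers.
  neg_log_likelihood = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
  loss = neg_log_likelihood + kl

  train_op = tf.train.AdamOptimizer().minimize(loss)
  return train_op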
def _elbo(form, log_likelihood, log_joint, variational_with_prior, keep_batch_dim): """Internal implementation of ELBO. Users should use `elbo`. Args: form: ELBOForms constant. Controls how the ELBO is computed. log_likelihood: `Tensor` log p(x|Z). log_joint: `Tensor` log p(x, Z). variational_with_prior: `dict<StochasticTensor, Distribution>` mapping variational distributions to prior distributions. keep_batch_dim: bool. Whether to keep the batch dimension when reducing the entropy/KL. Returns: ELBO `Tensor` with the same shape and dtype as `log_likelihood`/`log_joint`. """ ELBOForms.check_form(form) # Order of preference # 1. Analytic KL: log_likelihood - KL(q||p) # 2. Analytic entropy: log_likelihood + log p(Z) + H[q], or log_joint + H[q] # 3. Sample: log_likelihood - (log q(Z) - log p(Z)) = # log_likelihood + log p(Z) - log q(Z), or log_joint - log q(Z) def _reduce(val): if keep_batch_dim: return val else: return math_ops.reduce_sum(val) kl_terms = [] entropy_terms = [] prior_terms = [] for q, z, p in [(qz.distribution, qz.value(), pz) for qz, pz in variational_with_prior.items()]: # Analytic KL kl = None if log_joint is None and form in { ELBOForms.default, ELBOForms.analytic_kl }: try: kl = kullback_leibler.kl_divergence(q, p) logging.info("Using analytic KL between q:%s, p:%s", q, p) except NotImplementedError as e: if form == ELBOForms.analytic_kl: raise e if kl is not None: kl_terms.append(-1. * _reduce(kl)) continue # Analytic entropy entropy = None if form in {ELBOForms.default, ELBOForms.analytic_entropy}: try: entropy = q.entropy() logging.info("Using analytic entropy for q:%s", q) except NotImplementedError as e: if form == ELBOForms.analytic_entropy: raise e if entropy is not None: entropy_terms.append(_reduce(entropy)) if log_likelihood is not None: prior = p.log_prob(z) prior_terms.append(_reduce(prior)) continue # Sample if form in {ELBOForms.default, ELBOForms.sample}: entropy = -q.log_prob(z) entropy_terms.append(_reduce(entropy)) if log_likelihood is not None: prior = p.log_prob(z) prior_terms.append(_reduce(prior)) first_term = log_joint if log_joint is not None else log_likelihood return sum([first_term] + kl_terms + entropy_terms + prior_terms)
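The order of preference in `_elbo` (analytic KL, then analytic entropy, then sampling) can be illustrated with two scalar Normals. The following sketch uses the public TF 1.x `tf.distributions` API rather than the module-internal names, and a constant stand-in for the log-likelihood; it is illustrative only, not part of the library.

```python
# Illustrative sketch of the three ELBO forms for a single latent Z.
import tensorflow as tf

q = tf.distributions.Normal(loc=0.5, scale=1.2)   # variational q(Z)
p = tf.distributions.Normal(loc=0., scale=1.)     # prior p(Z)
z = q.sample(seed=42)                             # z ~ q(Z)
log_likelihood = tf.constant(-3.)                 # stand-in for log p(x|z)

# 1. Analytic KL:       log p(x|z) - KL[q || p]
elbo_analytic_kl = log_likelihood - tf.distributions.kl_divergence(q, p)
# 2. Analytic entropy:  log p(x|z) + log p(z) + H[q]
elbo_analytic_entropy = log_likelihood + p.log_prob(z) + q.entropy()
# 3. Sample:            log p(x|z) + log p(z) - log q(z)
elbo_sample = log_likelihood + p.log_prob(z) - q.log_prob(z)
```

All three are estimates of the same ELBO; the analytic forms simply replace Monte Carlo terms with closed-form expressions where those are available.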
def test_score_trick(self): with self.test_session() as sess: d = 5 # Dimension num_draws = int(1e5) seed = 1 p = mvn_full_lib.MultivariateNormalFullCovariance( covariance_matrix=self._tridiag(d, diag_value=1, offdiag_value=0.5)) # Variance is very high when approximating Forward KL, so we make # scale_diag larger than in test_kl_reverse_multidim. This ensures q # "covers" p and thus Var_q[p/q] is smaller. s = array_ops.constant(1.) q = mvn_diag_lib.MultivariateNormalDiag( scale_diag=array_ops.tile([s], [d])) approx_kl = cd.monte_carlo_csiszar_f_divergence( f=cd.kl_reverse, p=p, q=q, num_draws=num_draws, seed=seed) approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence( f=lambda logu: cd.kl_reverse(logu, self_normalized=True), p=p, q=q, num_draws=num_draws, seed=seed) approx_kl_score_trick = cd.monte_carlo_csiszar_f_divergence( f=cd.kl_reverse, p=p, q=q, num_draws=num_draws, use_reparametrization=False, seed=seed) approx_kl_self_normalized_score_trick = ( cd.monte_carlo_csiszar_f_divergence( f=lambda logu: cd.kl_reverse(logu, self_normalized=True), p=p, q=q, num_draws=num_draws, use_reparametrization=False, seed=seed)) exact_kl = kullback_leibler.kl_divergence(q, p) grad = lambda fs: gradients_impl.gradients(fs, s)[0] [ approx_kl_grad_, approx_kl_self_normalized_grad_, approx_kl_score_trick_grad_, approx_kl_self_normalized_score_trick_grad_, exact_kl_grad_, approx_kl_, approx_kl_self_normalized_, approx_kl_score_trick_, approx_kl_self_normalized_score_trick_, exact_kl_, ] = sess.run([ grad(approx_kl), grad(approx_kl_self_normalized), grad(approx_kl_score_trick), grad(approx_kl_self_normalized_score_trick), grad(exact_kl), approx_kl, approx_kl_self_normalized, approx_kl_score_trick, approx_kl_self_normalized_score_trick, exact_kl, ]) # Test average divergence. self.assertAllClose(approx_kl_, exact_kl_, rtol=0.02, atol=0.) self.assertAllClose(approx_kl_self_normalized_, exact_kl_, rtol=0.08, atol=0.) self.assertAllClose(approx_kl_score_trick_, exact_kl_, rtol=0.02, atol=0.) self.assertAllClose(approx_kl_self_normalized_score_trick_, exact_kl_, rtol=0.08, atol=0.) # Test average gradient-divergence. self.assertAllClose(approx_kl_grad_, exact_kl_grad_, rtol=0.007, atol=0.) self.assertAllClose(approx_kl_self_normalized_grad_, exact_kl_grad_, rtol=0.011, atol=0.) self.assertAllClose(approx_kl_score_trick_grad_, exact_kl_grad_, rtol=0.018, atol=0.) self.assertAllClose( approx_kl_self_normalized_score_trick_grad_, exact_kl_grad_, rtol=0.017, atol=0.)
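The test above compares reparameterization gradients against score-function ("score trick") gradients of the same Monte Carlo divergence estimate. The sketch below is a generic, hand-rolled version of the two estimators for KL[q || p] with scalar Normals, written against the public TF 1.x API; the library's `monte_carlo_csiszar_f_divergence(use_reparametrization=False)` may implement the score estimator differently, so treat this only as an illustration of the idea.

```python
# Illustrative sketch: two gradient estimators for
# KL[q || p] = E_q[log q(Z) - log p(Z)] w.r.t. q's scale `s`.
import tensorflow as tf

s = tf.constant(1.0)
q = tf.distributions.Normal(loc=0., scale=s)
p = tf.distributions.Normal(loc=0., scale=2.)
z = q.sample(10000, seed=42)

# Reparameterization: gradients flow through the samples z = s * eps.
kl_reparam = tf.reduce_mean(q.log_prob(z) - p.log_prob(z))
grad_reparam = tf.gradients(kl_reparam, s)[0]

# Score trick: hold samples and integrand fixed, weight by grad log q(z).
# The dropped term has expectation zero since E_q[grad log q(Z)] = 0.
z_fixed = tf.stop_gradient(z)
h = tf.stop_gradient(q.log_prob(z_fixed) - p.log_prob(z_fixed))
score_surrogate = tf.reduce_mean(h * q.log_prob(z_fixed))
grad_score = tf.gradients(score_surrogate, s)[0]
```

Both gradients estimate the same quantity; as the test asserts, the score-trick estimate typically has higher variance, which is why its tolerances are looser.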
def dense_local_reparameterization( inputs, units, activation=None, activity_regularizer=None, trainable=True, kernel_posterior_fn=layers_util.default_mean_field_normal_fn(), kernel_posterior_tensor_fn=lambda d: d.sample(), kernel_prior_fn=lambda dtype, *args: normal_lib.Normal( # pylint: disable=g-long-lambda loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)), kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p), bias_posterior_fn=layers_util.default_mean_field_normal_fn( is_singular=True), bias_posterior_tensor_fn=lambda d: d.sample(), bias_prior_fn=None, bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p), name=None, reuse=None): # pylint: disable=g-doc-args """Densely-connected layer with local reparameterization estimator. This layer implements the Bayesian variational inference analogue to a dense layer by assuming the `kernel` and/or the `bias` are drawn from distributions. By default, the layer implements a stochastic forward pass via sampling from the kernel and bias posteriors, ```none kernel, bias ~ posterior outputs = activation(matmul(inputs, kernel) + bias) ``` It uses the local reparameterization estimator [1], which performs a Monte Carlo approximation of the distribution on the hidden units induced by the `kernel` and `bias`. The arguments permit separate specification of the surrogate posterior (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias` distributions. Args: inputs: Tensor input. @{args} Returns: output: `Tensor` representing the affine-transformed input under a random draw from the surrogate posterior distribution. #### Examples We illustrate a Bayesian neural network with [variational inference]( https://en.wikipedia.org/wiki/Variational_Bayesian_methods), assuming a dataset of `features` and `labels`. ```python tfp = tf.contrib.bayesflow net = tfp.layers.dense_local_reparameterization( features, 512, activation=tf.nn.relu) logits = tfp.layers.dense_local_reparameterization(net, 10) neg_log_likelihood = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( labels=labels, logits=logits)) kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) loss = neg_log_likelihood + kl train_op = tf.train.AdamOptimizer().minimize(loss) ``` Using local reparameterization gradients, this program minimizes an objective equal, up to a constant, to the KL divergence from the surrogate posterior to the true posterior; this objective is also known as the negative Evidence Lower Bound (ELBO). It consists of the sum of two terms: the expected negative log-likelihood, which we approximate via Monte Carlo; and the KL divergence, which is added via regularizer terms which are arguments to the layer. [1]: "Variational Dropout and the Local Reparameterization Trick." Diederik P. Kingma, Tim Salimans, Max Welling. Neural Information Processing Systems, 2015. """ # pylint: enable=g-doc-args layer = DenseLocalReparameterization( units, activation=activation, activity_regularizer=activity_regularizer, trainable=trainable, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, kernel_prior_fn=kernel_prior_fn, kernel_divergence_fn=kernel_divergence_fn, bias_posterior_fn=bias_posterior_fn, bias_posterior_tensor_fn=bias_posterior_tensor_fn, bias_prior_fn=bias_prior_fn, bias_divergence_fn=bias_divergence_fn, name=name, dtype=inputs.dtype.base_dtype, _scope=name, _reuse=reuse) return layer.apply(inputs)
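The local reparameterization trick of [1] samples the Gaussian pre-activations rather than the weights: for a factorized Gaussian posterior over the kernel, `matmul(inputs, kernel)` is itself Gaussian with moments available in closed form. The sketch below is a hypothetical helper written to illustrate that identity; it is not the `DenseLocalReparameterization` layer's implementation.

```python
# Illustrative sketch of the local reparameterization trick: sample the
# Gaussian pre-activations directly instead of sampling a kernel matrix.
import tensorflow as tf

def local_reparam_matmul(inputs, kernel_loc, kernel_scale):
  """inputs: [batch, d_in]; kernel_loc, kernel_scale: [d_in, d_out]."""
  # If kernel[i, j] ~ Normal(loc[i, j], scale[i, j]) independently, then
  # matmul(inputs, kernel)[b, j] is Normal with these moments:
  mean = tf.matmul(inputs, kernel_loc)
  var = tf.matmul(tf.square(inputs), tf.square(kernel_scale))
  return mean + tf.sqrt(var) * tf.random_normal(tf.shape(mean))
```

Sampling per-example pre-activation noise this way typically yields lower-variance gradients than sampling one kernel matrix per minibatch, which is the motivation for the estimator.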