def is_loglikelihood(log_joint, observed, latent, axis=None):
    r"""
    Marginal log likelihood (:math:`\log p(x)`) estimates using
    self-normalized importance sampling.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the outer
        expectation in the importance sampling estimation. If ``None``, no
        dimension is reduced.

    :return: A Tensor. The estimated log likelihood of observed data.
    """
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_w = log_joint(joint_obs) - sum(latent_logpdfs)
    if axis is not None:
        return log_mean_exp(log_w, axis)
    return log_w
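# Illustration only, not part of the original module: a minimal sketch of how
# `is_loglikelihood` might be called. The toy model, the observed value and
# the proposal samples / log-probabilities below are hypothetical placeholders.
def _is_loglikelihood_usage_sketch():
    import math

    def toy_log_joint(observed):
        # Hypothetical model: z ~ N(0, 1), x ~ N(z, 1), both scalar.
        z, x = observed['z'], observed['x']
        log_pz = -0.5 * (z ** 2 + math.log(2 * math.pi))
        log_px_z = -0.5 * ((x - z) ** 2 + math.log(2 * math.pi))
        return log_pz + log_px_z

    x_obs = tf.ones([1])                     # a single observed value
    z_samples = tf.random_normal([1000, 1])  # 1000 samples from q(z) = N(0, 1)
    log_qz = -0.5 * (z_samples ** 2 + math.log(2 * math.pi))

    # Reduce over the sample dimension (axis 0) to get the marginal estimate.
    return is_loglikelihood(toy_log_joint, {'x': x_obs},
                            {'z': (z_samples, log_qz)}, axis=0)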
def rws(log_joint, observed, latent, axis=None):
    """
    Implements Reweighted Wake-Sleep from (Bornschein and Bengio, 2015). This
    works for both continuous and discrete latent `StochasticTensor` s.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the outer
        expectation in the log likelihood and in the cost for adapting
        proposals. If ``None``, no dimension is reduced.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. Estimated log likelihoods.
    """
    warnings.warn(
        "rws(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. Features of the original rws() can be "
        "achieved by two new variational objectives. For learning "
        "model parameters, please use the importance weighted "
        "objective: `zs.variational.iw_objective()`. For adapting "
        "the proposal, the new rws gradient estimator can be "
        "accessed by first constructing the inclusive KL divergence "
        "objective using `zs.variational.klpq` and then calling "
        "its rws() method.",
        category=FutureWarning)
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    log_w = log_joint_value + entropy
    if axis is not None:
        log_w_max = tf.reduce_max(log_w, axis, keep_dims=True)
        w_u = tf.exp(log_w - log_w_max)
        w_tilde = tf.stop_gradient(
            w_u / tf.reduce_sum(w_u, axis, keep_dims=True))
        log_likelihood = log_mean_exp(log_w, axis)
        fake_log_joint_cost = -tf.reduce_sum(w_tilde * log_joint_value, axis)
        fake_proposal_cost = tf.reduce_sum(w_tilde * entropy, axis)
        cost = fake_log_joint_cost + fake_proposal_cost
    else:
        cost = log_w
        log_likelihood = log_w
    return cost, log_likelihood
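# Illustration only, not library code: the self-normalized weights `w_tilde`
# computed above are simply a softmax of `log_w` along the sample axis, made
# numerically stable by subtracting the per-column maximum before
# exponentiating.
def _rws_weight_sketch():
    import numpy as np
    log_w = np.array([[-1.0, -2.0],
                      [-0.5, -3.0],
                      [-0.1, -0.2],
                      [-1.5, -0.7]])                  # [n_samples=4, batch=2]
    log_w_max = log_w.max(axis=0, keepdims=True)
    w_u = np.exp(log_w - log_w_max)
    w_tilde = w_u / w_u.sum(axis=0, keepdims=True)
    assert np.allclose(w_tilde.sum(axis=0), 1.0)      # sums to 1 per column
    return w_tilde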
def vimco(log_joint, observed, latent, axis=None):
    """
    Implements the multi-sample variance reduced score function estimator for
    gradients of the variational lower bound from (Mnih and Rezende, 2016).
    This works for both continuous and discrete latent `StochasticTensor` s.

    .. note::

        :func:`vimco` is a multi-sample objective, the size along `axis` in
        the objective should be larger than 1, else an error is raised.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension to reduce when computing the outer
        expectation in the variational lower bound. Must be specified. If
        ``None``, an error is raised.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. The variational lower bound.
    """
    warnings.warn(
        "vimco(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. The new vimco gradient estimator can be "
        "accessed by first constructing the importance weighted "
        "objective (using `zs.variational.iw_objective`) and then "
        "calling its vimco() method.",
        category=FutureWarning)
    if axis is None:
        raise ValueError("vimco is a multi-sample objective, "
                         "the 'axis' argument must be specified.")

    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    l_signal = log_joint_value + entropy

    # check size along the sample axis
    err_msg = "vimco() is a multi-sample objective, " \
              "size along 'axis' in the objective should be larger than 1."
    if l_signal.get_shape()[axis:axis + 1].is_fully_defined():
        if l_signal.get_shape()[axis].value < 2:
            raise ValueError(err_msg)
    _assert_size_along_axis = tf.assert_greater_equal(
        tf.shape(l_signal)[axis], 2, message=err_msg)
    with tf.control_dependencies([_assert_size_along_axis]):
        l_signal = tf.identity(l_signal)

    # compute the leave-one-out baseline: for every sample, the mean of the
    # other log weights along the sample axis
    mean_except_signal = (
        tf.reduce_sum(l_signal, axis, keep_dims=True) - l_signal) / \
        tf.to_float(tf.shape(l_signal)[axis] - 1)
    x, sub_x = tf.to_float(l_signal), tf.to_float(mean_except_signal)

    # swap `axis` with the last dimension so the per-sample substitution can
    # be expressed with matrix_diag, then tile one copy per sample
    n_dim = tf.rank(x)
    axis_dim_mask = tf.cast(tf.one_hot(axis, n_dim), tf.bool)
    original_mask = tf.cast(tf.one_hot(n_dim - 1, n_dim), tf.bool)
    axis_dim = tf.ones([n_dim], tf.int32) * axis
    originals = tf.ones([n_dim], tf.int32) * (n_dim - 1)
    perm = tf.where(original_mask, axis_dim, tf.range(n_dim))
    perm = tf.where(axis_dim_mask, originals, perm)
    multiples = tf.concat([tf.ones([n_dim], tf.int32), [tf.shape(x)[axis]]], 0)

    x = tf.transpose(x, perm=perm)
    sub_x = tf.transpose(sub_x, perm=perm)
    x_ex = tf.tile(tf.expand_dims(x, n_dim), multiples)
    x_ex = x_ex - tf.matrix_diag(x) + tf.matrix_diag(sub_x)
    control_variate = tf.transpose(log_mean_exp(x_ex, n_dim - 1), perm=perm)

    # variance reduced learning signal and surrogate cost
    l_signal = log_mean_exp(l_signal, axis, keep_dims=True) - control_variate
    fake_term = tf.reduce_sum(-entropy * tf.stop_gradient(l_signal), axis)
    lower_bound = log_mean_exp(log_joint_value + entropy, axis)
    cost = -fake_term - lower_bound

    return cost, lower_bound
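# Illustration only, not library code: the transpose/tile/matrix_diag steps
# above build, for every sample i, a copy of the log weights with log w_i
# replaced by the mean of the other K - 1 log weights, then take log-mean-exp.
# For a single batch element this per-sample control variate is simply:
def _vimco_control_variate_sketch():
    import numpy as np

    def log_mean_exp_np(a):
        a_max = a.max()
        return a_max + np.log(np.mean(np.exp(a - a_max)))

    log_w = np.array([-1.0, -2.0, -0.5, -1.5])            # K = 4 log weights
    k = log_w.shape[0]
    control_variate = np.empty(k)
    for i in range(k):
        replaced = log_w.copy()
        replaced[i] = (log_w.sum() - log_w[i]) / (k - 1)  # mean of the others
        control_variate[i] = log_mean_exp_np(replaced)
    # The learning signal for sample i is then
    # log_mean_exp(log_w) - control_variate[i], as in `l_signal` above.
    return control_variate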
def vimco(self):
    """
    Implements the multi-sample score function gradient estimator for the
    objective, also known as "VIMCO", the name given by the authors of the
    original paper (Mnih and Rezende, 2016). It works for all kinds of latent
    `StochasticTensor` s.

    .. note::

        To use the :meth:`vimco` estimator, the ``is_reparameterized``
        property of each reparameterizable latent `StochasticTensor` must be
        set to False.

    :return: A Tensor. The surrogate cost for TensorFlow optimizers to
        minimize.
    """
    log_w = self._log_joint_term() + self._entropy_term()
    l_signal = log_w

    # check size along the sample axis
    err_msg = "VIMCO is a multi-sample gradient estimator, size along " \
              "`axis` in the objective should be larger than 1."
    if l_signal.get_shape()[self._axis:self._axis + 1].is_fully_defined():
        if l_signal.get_shape()[self._axis].value < 2:
            raise ValueError(err_msg)
    _assert_size_along_axis = tf.assert_greater_equal(
        tf.shape(l_signal)[self._axis], 2, message=err_msg)
    with tf.control_dependencies([_assert_size_along_axis]):
        l_signal = tf.identity(l_signal)

    # compute the leave-one-out baseline: for every sample, the mean of the
    # other log weights along the sample axis
    mean_except_signal = (
        tf.reduce_sum(l_signal, self._axis, keep_dims=True) -
        l_signal) / tf.to_float(tf.shape(l_signal)[self._axis] - 1)
    x, sub_x = tf.to_float(l_signal), tf.to_float(mean_except_signal)

    # swap `self._axis` with the last dimension so the per-sample substitution
    # can be expressed with matrix_diag, then tile one copy per sample
    n_dim = tf.rank(x)
    axis_dim_mask = tf.cast(tf.one_hot(self._axis, n_dim), tf.bool)
    original_mask = tf.cast(tf.one_hot(n_dim - 1, n_dim), tf.bool)
    axis_dim = tf.ones([n_dim], tf.int32) * self._axis
    originals = tf.ones([n_dim], tf.int32) * (n_dim - 1)
    perm = tf.where(original_mask, axis_dim, tf.range(n_dim))
    perm = tf.where(axis_dim_mask, originals, perm)
    multiples = tf.concat(
        [tf.ones([n_dim], tf.int32), [tf.shape(x)[self._axis]]], 0)

    x = tf.transpose(x, perm=perm)
    sub_x = tf.transpose(sub_x, perm=perm)
    x_ex = tf.tile(tf.expand_dims(x, n_dim), multiples)
    x_ex = x_ex - tf.matrix_diag(x) + tf.matrix_diag(sub_x)
    control_variate = tf.transpose(log_mean_exp(x_ex, n_dim - 1), perm=perm)

    # variance reduced learning signal and surrogate cost
    l_signal = log_mean_exp(l_signal, self._axis, keep_dims=True) - \
        control_variate
    fake_term = tf.reduce_sum(
        -self._entropy_term() * tf.stop_gradient(l_signal), self._axis)
    cost = -fake_term - log_mean_exp(log_w, self._axis)
    return cost
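# Hedged usage note (based on the deprecation messages above; exact signatures
# may differ by version): this estimator is reached by first building the
# importance weighted objective and then calling its vimco() method, e.g.
#
#     lower_bound = zs.variational.iw_objective(log_joint, observed, latent,
#                                               axis=0)
#     cost = tf.reduce_mean(lower_bound.vimco())
#     infer_op = tf.train.AdamOptimizer(1e-3).minimize(cost)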
def _objective(self):
    log_w = self._log_joint_term() + self._entropy_term()
    if self._axis is not None:
        return log_mean_exp(log_w, self._axis)
    return log_w
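# Illustration only: `_objective` is the importance weighted bound
# log((1/K) * sum_k exp(log_w_k)); `log_mean_exp` computes it stably by
# shifting with the maximum before exponentiating, as sketched below.
def _log_mean_exp_sketch():
    import numpy as np
    log_w = np.array([-1.0, -2.0, -0.5, -1.5])        # K = 4
    shift = log_w.max()
    stable = shift + np.log(np.mean(np.exp(log_w - shift)))
    naive = np.log(np.mean(np.exp(log_w)))
    assert np.allclose(stable, naive)
    return stable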