Example #1
def is_loglikelihood(log_joint, observed, latent, axis=None):
    """
    Marginal log likelihood (:math:`\log p(x)`) estimates using self-normalized
    importance sampling.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in the importance sampling estimation. If None, no
        dimension is reduced.

    :return: A Tensor. The estimated log likelihood of observed data.
    """
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_w = log_joint(joint_obs) - sum(latent_logpdfs)
    if axis is not None:
        return log_mean_exp(log_w, axis)
    return log_w
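
# A minimal usage sketch (not part of the original source). It assumes the
# helpers referenced above (six, merge_dicts, log_mean_exp) are importable as
# in the original module, and builds a hypothetical toy model x ~ N(z, 1)
# with prior z ~ N(0, 1) and a standard-normal proposal q(z). All names below
# (toy_log_joint, x_obs, z_samples, log_qz, ...) are illustrative only.
import math

import tensorflow as tf

n_particles, batch_size = 50, 16
log_2pi = math.log(2 * math.pi)

x_obs = tf.placeholder(tf.float32, [batch_size])         # observed data x
z_samples = tf.random_normal([n_particles, batch_size])  # draws from q(z)
log_qz = -0.5 * (tf.square(z_samples) + log_2pi)         # log q(z) for N(0, 1)

def toy_log_joint(observed):
    z, x = observed['z'], observed['x']
    log_pz = -0.5 * (tf.square(z) + log_2pi)              # prior: z ~ N(0, 1)
    log_px_z = -0.5 * (tf.square(x - z) + log_2pi)        # likelihood: x ~ N(z, 1)
    return log_pz + log_px_z

# Reduce over the particle axis (axis 0) to get one estimate per data point.
log_likelihood = is_loglikelihood(toy_log_joint, {'x': x_obs},
                                  {'z': (z_samples, log_qz)}, axis=0)
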
def rws(log_joint, observed, latent, axis=None):
    """
    Implements Reweighted Wake-sleep from (Bornschein, 2015). This works for
    both continuous and discrete latent `StochasticTensor` s.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in the log likelihood and in the cost for adapting
        proposals. If `None`, no dimension is reduced.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. Estimated log likelihoods.
    """
    warnings.warn(
        "rws(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. Features of the original rws() can be "
        "achieved by two new variational objectives. For learning "
        "model parameters, please use the importance weighted "
        "objective: `zs.variational.iw_objective()`. For adapting "
        "the proposal, the new rws gradient estimator can be "
        "accessed by first constructing the inclusive KL divergence "
        "objective using `zs.variational.klpq` and then calling "
        "its rws() method.",
        category=FutureWarning)
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    log_w = log_joint_value + entropy
    if axis is not None:
        log_w_max = tf.reduce_max(log_w, axis, keep_dims=True)
        w_u = tf.exp(log_w - log_w_max)
        w_tilde = tf.stop_gradient(w_u /
                                   tf.reduce_sum(w_u, axis, keep_dims=True))
        log_likelihood = log_mean_exp(log_w, axis)
        fake_log_joint_cost = -tf.reduce_sum(w_tilde * log_joint_value, axis)
        fake_proposal_cost = tf.reduce_sum(w_tilde * entropy, axis)
        cost = fake_log_joint_cost + fake_proposal_cost
    else:
        cost = log_w
        log_likelihood = log_w
    return cost, log_likelihood
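
A minimal usage sketch, not part of the original source: it reuses the hypothetical toy_log_joint, x_obs, z_samples and log_qz built in the sketch after is_loglikelihood above, and only shows how the two returned tensors are obtained; in practice the surrogate cost would be averaged and minimized over model and proposal parameters.

# Reweighted wake-sleep on the toy model; axis 0 indexes the particles.
cost, log_likelihood = rws(toy_log_joint, {'x': x_obs},
                           {'z': (z_samples, log_qz)}, axis=0)
mean_cost = tf.reduce_mean(cost)  # quantity typically handed to an optimizer
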
Example #3
def rws(log_joint, observed, latent, axis=None):
    """
    Implements Reweighted Wake-sleep from (Bornschein, 2015). This works for
    both continuous and discrete latent `StochasticTensor` s.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in the log likelihood and in the cost for adapting
        proposals. If `None`, no dimension is reduced.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. Estimated log likelihoods.
    """
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    log_w = log_joint_value + entropy
    if axis is not None:
        log_w_max = tf.reduce_max(log_w, axis, keep_dims=True)
        w_u = tf.exp(log_w - log_w_max)
        w_tilde = tf.stop_gradient(w_u /
                                   tf.reduce_sum(w_u, axis, keep_dims=True))
        log_likelihood = log_mean_exp(log_w, axis)
        fake_log_joint_cost = -tf.reduce_sum(w_tilde * log_joint_value, axis)
        fake_proposal_cost = tf.reduce_sum(w_tilde * entropy, axis)
        cost = fake_log_joint_cost + fake_proposal_cost
    else:
        cost = log_w
        log_likelihood = log_w
    return cost, log_likelihood
def vimco(log_joint, observed, latent, axis=None):
    """
    Implements the multi-sample variance reduced score function estimator for
    gradients of the variational lower bound from (Mnih, 2016). This works for
    both continuous and discrete latent `StochasticTensor` s.

    .. note::

        :func:`vimco` is a multi-sample objective; the size along `axis` in
        the objective must be larger than 1, otherwise an error is raised.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension to reduce when computing the
        outer expectation in the variational lower bound. Must be specified;
        if `None`, an error is raised.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. The variational lower bound.
    """
    warnings.warn(
        "vimco(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. The new vimco gradient estimator can be "
        "accessed by first constructing the importance weighted "
        "objective (using `zs.variational.iw_objective` and then "
        "calling its vimco() method.",
        category=FutureWarning)
    if axis is None:
        raise ValueError("vimco is a multi-sample objective, "
                         "the 'axis' argument must be specified.")

    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    l_signal = log_joint_value + entropy

    # check size along the sample axis
    err_msg = "vimco() is a multi-sample objective, " \
              "size along 'axis' in the objective should be larger than 1."
    if l_signal.get_shape()[axis:axis + 1].is_fully_defined():
        if l_signal.get_shape()[axis].value < 2:
            raise ValueError(err_msg)
    _assert_size_along_axis = tf.assert_greater_equal(tf.shape(l_signal)[axis],
                                                      2,
                                                      message=err_msg)
    with tf.control_dependencies([_assert_size_along_axis]):
        l_signal = tf.identity(l_signal)

    # compute variance reduction term
    mean_except_signal = (
        tf.reduce_sum(l_signal, axis, keep_dims=True) -
        l_signal) / tf.to_float(tf.shape(l_signal)[axis] - 1)
    x, sub_x = tf.to_float(l_signal), tf.to_float(mean_except_signal)

    n_dim = tf.rank(x)
    axis_dim_mask = tf.cast(tf.one_hot(axis, n_dim), tf.bool)
    original_mask = tf.cast(tf.one_hot(n_dim - 1, n_dim), tf.bool)
    axis_dim = tf.ones([n_dim], tf.int32) * axis
    originals = tf.ones([n_dim], tf.int32) * (n_dim - 1)
    perm = tf.where(original_mask, axis_dim, tf.range(n_dim))
    perm = tf.where(axis_dim_mask, originals, perm)
    multiples = tf.concat([tf.ones([n_dim], tf.int32), [tf.shape(x)[axis]]], 0)

    x = tf.transpose(x, perm=perm)
    sub_x = tf.transpose(sub_x, perm=perm)
    x_ex = tf.tile(tf.expand_dims(x, n_dim), multiples)
    x_ex = x_ex - tf.matrix_diag(x) + tf.matrix_diag(sub_x)
    control_variate = tf.transpose(log_mean_exp(x_ex, n_dim - 1), perm=perm)

    # variance reduced objective
    l_signal = log_mean_exp(l_signal, axis, keep_dims=True) - control_variate
    fake_term = tf.reduce_sum(-entropy * tf.stop_gradient(l_signal), axis)
    lower_bound = log_mean_exp(log_joint_value + entropy, axis)
    cost = -fake_term - lower_bound

    return cost, lower_bound
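
A minimal usage sketch, not part of the original source, reusing the same hypothetical toy model from the sketch after is_loglikelihood: vimco requires axis to be specified and at least two samples along it (n_particles = 50 satisfies this).

# axis is mandatory for vimco(); the size along it must be at least 2.
cost, lower_bound = vimco(toy_log_joint, {'x': x_obs},
                          {'z': (z_samples, log_qz)}, axis=0)
mean_cost = tf.reduce_mean(cost)  # surrogate cost to minimize
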
Example #5
    def vimco(self):
        """
        Implements the multi-sample score function gradient estimator for
        the objective, also known as "VIMCO", the name given by the authors
        of the original paper (Mnih, 2016).

        It works for all kinds of latent `StochasticTensor` s.

        .. note::

            To use the :meth:`vimco` estimator, the ``is_reparameterized``
            property of each reparameterizable latent `StochasticTensor` must
            be set to False.

        :return: A Tensor. The surrogate cost for TensorFlow optimizers to
            minimize.
        """
        log_w = self._log_joint_term() + self._entropy_term()
        l_signal = log_w

        # check size along the sample axis
        err_msg = "VIMCO is a multi-sample gradient estimator, size along " \
                  "`axis` in the objective should be larger than 1."
        if l_signal.get_shape()[self._axis:self._axis + 1].is_fully_defined():
            if l_signal.get_shape()[self._axis].value < 2:
                raise ValueError(err_msg)
        _assert_size_along_axis = tf.assert_greater_equal(
            tf.shape(l_signal)[self._axis], 2, message=err_msg)
        with tf.control_dependencies([_assert_size_along_axis]):
            l_signal = tf.identity(l_signal)

        # compute variance reduction term
        mean_except_signal = (
            tf.reduce_sum(l_signal, self._axis, keep_dims=True) -
            l_signal) / tf.to_float(tf.shape(l_signal)[self._axis] - 1)
        x, sub_x = tf.to_float(l_signal), tf.to_float(mean_except_signal)

        n_dim = tf.rank(x)
        axis_dim_mask = tf.cast(tf.one_hot(self._axis, n_dim), tf.bool)
        original_mask = tf.cast(tf.one_hot(n_dim - 1, n_dim), tf.bool)
        axis_dim = tf.ones([n_dim], tf.int32) * self._axis
        originals = tf.ones([n_dim], tf.int32) * (n_dim - 1)
        perm = tf.where(original_mask, axis_dim, tf.range(n_dim))
        perm = tf.where(axis_dim_mask, originals, perm)
        multiples = tf.concat(
            [tf.ones([n_dim], tf.int32), [tf.shape(x)[self._axis]]], 0)

        x = tf.transpose(x, perm=perm)
        sub_x = tf.transpose(sub_x, perm=perm)
        x_ex = tf.tile(tf.expand_dims(x, n_dim), multiples)
        x_ex = x_ex - tf.matrix_diag(x) + tf.matrix_diag(sub_x)
        control_variate = tf.transpose(log_mean_exp(x_ex, n_dim - 1),
                                       perm=perm)

        # variance reduced objective
        l_signal = log_mean_exp(l_signal, self._axis,
                                keep_dims=True) - control_variate
        fake_term = tf.reduce_sum(
            -self._entropy_term() * tf.stop_gradient(l_signal), self._axis)
        cost = -fake_term - log_mean_exp(log_w, self._axis)

        return cost
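
A hedged usage sketch based only on the deprecation notes quoted earlier in this listing: the class whose vimco() method appears above is reached by first constructing the importance weighted objective with zs.variational.iw_objective and then calling its vimco() method. The toy arguments are the same hypothetical placeholders as before, and the keyword names are assumed to mirror the function-style signatures documented above.

import zhusuan as zs

# Build the importance weighted objective, then request the VIMCO surrogate cost
# (argument structure assumed to mirror the function-style API documented above).
iw_obj = zs.variational.iw_objective(toy_log_joint,
                                     observed={'x': x_obs},
                                     latent={'z': (z_samples, log_qz)},
                                     axis=0)
surrogate_cost = iw_obj.vimco()
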
Example #6
    def _objective(self):
        log_w = self._log_joint_term() + self._entropy_term()
        if self._axis is not None:
            return log_mean_exp(log_w, self._axis)
        return log_w