Example #1
def is_loglikelihood(log_joint, observed, latent, axis=None):
    """
    Marginal log likelihood (:math:`\log p(x)`) estimates using self-normalized
    importance sampling.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in the importance sampling estimation. If None, no
        dimension is reduced.

    :return: A Tensor. The estimated log likelihood of observed data.
    """
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_w = log_joint(joint_obs) - sum(latent_logpdfs)
    if axis is not None:
        return log_mean_exp(log_w, axis)
    return log_w
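
A minimal usage sketch for the estimator above, assuming TensorFlow 1.x and that the helpers it relies on (`merge_dicts`, `log_mean_exp`, `six`) are in scope; the toy model and every name below (`toy_log_joint`, `z_samples`, `log_qz`) are hypothetical and not part of the original example.

import numpy as np
import tensorflow as tf

# Toy model: z ~ N(0, 1), x | z ~ N(z, 1), so p(x) = N(x; 0, 2) in closed form.
# The proposal q(z) = N(0, 1) supplies both the samples and their log densities.
x_obs = tf.constant(0.5)
z_samples = tf.random_normal([1000])                   # sample axis is axis 0
log_qz = -0.5 * (z_samples ** 2 + np.log(2 * np.pi))   # log q(z) under N(0, 1)

def toy_log_joint(observed):
    z, x = observed['z'], observed['x']
    log_pz = -0.5 * (z ** 2 + np.log(2 * np.pi))
    log_px_z = -0.5 * ((x - z) ** 2 + np.log(2 * np.pi))
    return log_pz + log_px_z

ll = is_loglikelihood(toy_log_joint, observed={'x': x_obs},
                      latent={'z': (z_samples, log_qz)}, axis=0)
with tf.Session() as sess:
    print(sess.run(ll))   # should be close to log N(0.5; 0, 2) ~= -1.33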
Example #2
    def __init__(self, log_prior, log_joint, prior_sampler, hmc, observed,
                 latent, n_chains, n_temperatures):
        # Shape of latent: [chain_axis * num_data, data dims]
        # Construct the tempered objective
        self.prior_sampler = prior_sampler
        self.latent = latent
        self.n_chains = n_chains
        self.n_temperatures = n_temperatures

        with tf.name_scope("BDMC"):
            self.temperature = tf.placeholder(tf.float32,
                                              shape=[],
                                              name="temperature")

            def log_fn(observed):
                return log_prior(observed) * (1 - self.temperature) + \
                       log_joint(observed) * self.temperature

            self.log_fn = log_fn
            self.log_fn_val = log_fn(merge_dicts(observed, latent))
            self.sample_op, self.hmc_info = hmc.sample(log_fn, observed,
                                                       latent)
            self.init_latent = [
                tf.assign(z, z_s)
                for z, z_s in zip(latent.values(), self.prior_sampler.values())
            ]
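
Both this constructor and the AIS constructor in Example #5 below build the same geometric annealing path: for a temperature t in [0, 1], log_fn evaluates (1 - t) * log_prior + t * log_joint, which coincides with the log prior density at t = 0 and with the unnormalized log posterior (the log joint) at t = 1.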
Example #3
 def _log_joint_term(self):
     if self._meta_bn:
         return self.bn.log_joint()
     elif not hasattr(self, '_log_joint_cache'):
         self._log_joint_cache = self._log_joint(
             merge_dicts(self._v_inputs, self._observed))
     return self._log_joint_cache
Example #4
def sgvb(log_joint, observed, latent, axis=None):
    """
    Implements the stochastic gradient variational Bayes (SGVB) algorithm
    from (Kingma, 2013). This only works for continuous latent
    `StochasticTensor` s that can be reparameterized (Kingma, 2013).

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in the variational lower bound. If `None`, no dimension
        is reduced.

    :return: A Tensor. The variational lower bound.
    """
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    lower_bound = log_joint(joint_obs) - sum(latent_logpdfs)
    if axis is not None:
        lower_bound = tf.reduce_mean(lower_bound, axis)
    return lower_bound
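
A minimal end-to-end sketch of how the returned bound is typically optimized, again on a hypothetical toy Gaussian model under TensorFlow 1.x; `mu`, `rho` and `toy_log_joint` are illustrative names only. The proposal has to be reparameterized so that gradients reach the variational parameters through `z_samples`.

import numpy as np
import tensorflow as tf

x_obs = tf.constant(0.5)
mu = tf.Variable(0.0)                        # variational mean
rho = tf.Variable(0.0)                       # unconstrained scale parameter
std = tf.nn.softplus(rho)
eps = tf.random_normal([100])                # 100 samples along axis 0
z_samples = mu + std * eps                   # reparameterized q(z) = N(mu, std^2)
log_qz = (-0.5 * (((z_samples - mu) / std) ** 2 + np.log(2 * np.pi))
          - tf.log(std))

def toy_log_joint(observed):
    z, x = observed['z'], observed['x']
    return (-0.5 * (z ** 2 + np.log(2 * np.pi))           # z ~ N(0, 1)
            - 0.5 * ((x - z) ** 2 + np.log(2 * np.pi)))   # x | z ~ N(z, 1)

lower_bound = sgvb(toy_log_joint, observed={'x': x_obs},
                   latent={'z': (z_samples, log_qz)}, axis=0)
train_op = tf.train.AdamOptimizer(0.1).minimize(-lower_bound)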
Example #5
    def __init__(self, meta_bn, proposal_meta_bn, hmc, observed, latent,
                 n_temperatures=1000, n_adapt=30, verbose=False):
        # Shape of latent: [chain_axis, num_data, data dims]
        # Construct the tempered objective
        self._n_temperatures = n_temperatures
        self._n_adapt = n_adapt
        self._verbose = verbose

        with tf.name_scope("AIS"):
            if callable(meta_bn):
                log_joint = meta_bn
            else:
                log_joint = lambda obs: meta_bn.observe(**obs).log_joint()

            latent_k, latent_v = zip(*six.iteritems(latent))

            prior_samples = proposal_meta_bn.observe().get(latent_k)
            log_prior = lambda obs: proposal_meta_bn.observe(**obs).log_joint()

            self.temperature = tf.placeholder(tf.float32, shape=[],
                                              name="temperature")

            def log_fn(observed):
                return log_prior(observed) * (1 - self.temperature) + \
                    log_joint(observed) * self.temperature

            self.log_fn = log_fn
            self.log_fn_val = log_fn(merge_dicts(observed, latent))
            self.sample_op, self.hmc_info = hmc.sample(
                log_fn, observed, latent)
            self.init_latent = [tf.assign(z, z_s)
                                for z, z_s in zip(latent_v, prior_samples)]
Example #6
    def run(self, sess, feed_dict):
        """
        Run the AIS loop.

        :param sess: A Tensorflow session.
        :param feed_dict: The `feed_dict` argument for ``session.run``.

        :return: The log marginal likelihood estimate.
        """
        # Help adapt the HMC step size
        adp_num_t = 2 if self._n_temperatures > 1 else 1
        adp_t = self._get_schedule_t(adp_num_t)
        sess.run(self.init_latent, feed_dict=feed_dict)
        for i in range(self._n_adapt):
            _, acc = sess.run([self.sample_op, self.hmc_info.acceptance_rate],
                              feed_dict=merge_dicts(feed_dict,
                                                    {self.temperature: adp_t}))
            if self._verbose:
                print('Adapt iter {}, acc = {:.3f}'.format(i, np.mean(acc)))

        # Draw a sample from the prior
        sess.run(self.init_latent, feed_dict=feed_dict)
        prior_density = sess.run(self.log_fn_val,
                                 feed_dict=merge_dicts(
                                     feed_dict, {self.temperature: 0}))
        log_weights = -prior_density

        for num_t in range(self._n_temperatures):
            # current_temperature = 1.0 / self._n_temperatures * (num_t + 1)
            current_temperature = self._get_schedule_t(num_t + 1)

            _, old_log_p, new_log_p, acc = sess.run(
                [self.sample_op, self.hmc_info.orig_log_prob,
                 self.hmc_info.log_prob, self.hmc_info.acceptance_rate],
                feed_dict=merge_dicts(feed_dict,
                                      {self.temperature: current_temperature}))

            if num_t + 1 < self._n_temperatures:
                log_weights += old_log_p - new_log_p
            else:
                log_weights += old_log_p

            if self._verbose:
                print('Finished step {}, Temperature = {:.4f}, acc = {:.3f}'
                      .format(num_t + 1, current_temperature, np.mean(acc)))

        return np.mean(self._get_lower_bound(log_weights))
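
The running sum above telescopes into the standard AIS importance weight: writing f_t for the tempered density at the t-th temperature and z_{t-1} for the state fed into the t-th transition, log_weights ends up equal to sum_{t=1..T} [log f_t(z_{t-1}) - log f_{t-1}(z_{t-1})]. Because f_0 is the normalized prior and f_T the unnormalized joint, exp(log_weights) is an unbiased estimate of p(x), and the final reduction through self._get_lower_bound (presumably a log-mean-exp over the chain axis; not shown here) gives a stochastic lower bound on log p(x).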
Example #7
    def run(self, sess, feed_dict):
        # Draw a sample from the prior
        sess.run(self.init_latent, feed_dict=feed_dict)
        prior_density = sess.run(self.log_fn_val,
                                 feed_dict=merge_dicts(feed_dict,
                                                       {self.temperature: 0}))
        log_weights = -prior_density

        # Forward AIS
        for num_t in range(self.n_temperatures):
            current_temperature = 1.0 / self.n_temperatures * (num_t + 1)
            new_feed_dict = feed_dict.copy()
            new_feed_dict[self.temperature] = current_temperature
            _, old_log_p, new_log_p = sess.run(
                [self.sample_op, self.hmc_info.orig_log_prob,
                 self.hmc_info.log_prob],
                feed_dict=new_feed_dict)
            if num_t + 1 < self.n_temperatures:
                log_weights += old_log_p - new_log_p
            else:
                log_weights += old_log_p

        ll_lb = np.mean(self.get_lower_bound(log_weights))

        # Backward AIS
        log_weights = -new_log_p
        for num_t in range(self.n_temperatures):
            current_temperature = 1.0 - 1.0 / self.n_temperatures * (num_t + 1)
            _, old_log_p, new_log_p = sess.run(
                [
                    self.sample_op, self.hmc_info.orig_log_prob,
                    self.hmc_info.log_prob
                ],
                feed_dict=merge_dicts(feed_dict,
                                      {self.temperature: current_temperature}))
            if num_t + 1 < self.n_temperatures:
                log_weights += old_log_p - new_log_p
            else:
                log_weights += old_log_p

        ll_ub = -np.mean(self.get_lower_bound(log_weights))

        return ll_lb, ll_ub
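
The two returned values form the bidirectional Monte Carlo (BDMC) sandwich: the forward annealing run yields ll_lb, a stochastic lower bound on log p(x), while the backward run, which starts from the final forward state and anneals the temperature from 1 back to 0, yields ll_ub, a stochastic upper bound. The upper bound is only trustworthy to the extent that the backward chain is initialized close to a true posterior sample.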
Example #8
    def run(self, sess, feed_dict):
        # Help adapt the HMC step size
        n_adp = 30
        adp_num_t = 2 if self.n_temperatures > 1 else 1
        adp_t = self.get_schedule_t(adp_num_t)
        sess.run(self.init_latent, feed_dict=feed_dict)
        for i in range(n_adp):
            _, acc = sess.run([self.sample_op, self.hmc_info.acceptance_rate],
                              feed_dict=merge_dicts(feed_dict,
                                                    {self.temperature: adp_t}))
            if self.verbose:
                print('Adapt iter {}, acc = {:.3f}'.format(i, np.mean(acc)))

        # Draw a sample from the prior
        sess.run(self.init_latent, feed_dict=feed_dict)
        prior_density = sess.run(self.log_fn_val,
                                 feed_dict=merge_dicts(feed_dict,
                                                       {self.temperature: 0}))
        log_weights = -prior_density

        for num_t in range(self.n_temperatures):
            # current_temperature = 1.0 / self.n_temperatures * (num_t + 1)
            current_temperature = self.get_schedule_t(num_t + 1)

            _, old_log_p, new_log_p, acc = sess.run(
                [
                    self.sample_op, self.hmc_info.orig_log_prob,
                    self.hmc_info.log_prob, self.hmc_info.acceptance_rate
                ],
                feed_dict=merge_dicts(feed_dict,
                                      {self.temperature: current_temperature}))

            if num_t + 1 < self.n_temperatures:
                log_weights += old_log_p - new_log_p
            else:
                log_weights += old_log_p

            if self.verbose:
                print('Finished step {}, Temperature = {:.4f}, acc = {:.3f}'
                      .format(num_t + 1, current_temperature, np.mean(acc)))

        return np.mean(self.get_lower_bound(log_weights))
Example #9
def rws(log_joint, observed, latent, axis=None):
    """
    Implements Reweighted Wake-sleep from (Bornschein, 2015). This works for
    both continuous and discrete latent `StochasticTensor` s.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in log likelihood and in the cost for adapting
        proposals. If `None`, no dimension is reduced.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. Estimated log likelihoods.
    """
    warnings.warn(
        "rws(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. Features of the original rws() can be "
        "achieved by two new variational objectives. For learning "
        "model parameters, please use the importance weighted "
        "objective: `zs.variational.iw_objective()`. For adapting "
        "the proposal, the new rws gradient estimator can be "
        "accessed by first constructing the inclusive KL divergence "
        "objective using `zs.variational.klpq` and then calling "
        "its rws() method.",
        category=FutureWarning)
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    log_w = log_joint_value + entropy
    if axis is not None:
        log_w_max = tf.reduce_max(log_w, axis, keep_dims=True)
        w_u = tf.exp(log_w - log_w_max)
        w_tilde = tf.stop_gradient(w_u /
                                   tf.reduce_sum(w_u, axis, keep_dims=True))
        log_likelihood = log_mean_exp(log_w, axis)
        fake_log_joint_cost = -tf.reduce_sum(w_tilde * log_joint_value, axis)
        fake_proposal_cost = tf.reduce_sum(w_tilde * entropy, axis)
        cost = fake_log_joint_cost + fake_proposal_cost
    else:
        cost = log_w
        log_likelihood = log_w
    return cost, log_likelihood
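
For reference, the surrogate cost assembled above is -sum_k(w_tilde_k * log p(x, z_k)) - sum_k(w_tilde_k * log q(z_k)), with the self-normalized weights w_tilde held constant by tf.stop_gradient; its gradient therefore reproduces the wake-phase model update and the wake-phase proposal update of reweighted wake-sleep. The second return value, log_likelihood = log_mean_exp(log_w, axis), is the usual importance-weighted estimate of log p(x).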
Example #10
 def __init__(self, log_joint, observed, latent):
     self._log_joint = log_joint
     self._observed = observed
     self._latent = latent
     # TODO: Add input name check by matching them
     latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
     self._latent_outputs = dict(
         zip(latent_k, map(lambda x: x[0], latent_v)))
     self._latent_logpdfs = dict(
         zip(latent_k, map(lambda x: x[1], latent_v)))
     self._joint_obs = merge_dicts(observed, self._latent_outputs)
     try:
         self._dict_key = (log_joint, frozenset(latent_k),
                           frozenset(map(tuple, latent_v)),
                           frozenset(six.iteritems(observed)))
     except TypeError:
         # Unhashable type
         self._dict_key = None
Example #11
def rws(log_joint, observed, latent, axis=None):
    """
    Implements Reweighted Wake-sleep from (Bornschein, 2015). This works for
    both continuous and discrete latent `StochasticTensor` s.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in log likelihood and in the cost for adapting
        proposals. If `None`, no dimension is reduced.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. Estimated log likelihoods.
    """
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    log_w = log_joint_value + entropy
    if axis is not None:
        log_w_max = tf.reduce_max(log_w, axis, keep_dims=True)
        w_u = tf.exp(log_w - log_w_max)
        w_tilde = tf.stop_gradient(w_u /
                                   tf.reduce_sum(w_u, axis, keep_dims=True))
        log_likelihood = log_mean_exp(log_w, axis)
        fake_log_joint_cost = -tf.reduce_sum(w_tilde * log_joint_value, axis)
        fake_proposal_cost = tf.reduce_sum(w_tilde * entropy, axis)
        cost = fake_log_joint_cost + fake_proposal_cost
    else:
        cost = log_w
        log_likelihood = log_w
    return cost, log_likelihood
Example #12
 def log_joint(observed):
     log_pz = []
     log_ph_z = []
     log_px_h = None
     for i in range(n_z):
         z_obs_onehot = tf.one_hot(
             i * tf.ones([n_particles, batch_size], dtype=tf.int32),
             depth=n_z,
             dtype=tf.int32)  # (n_particles, batch_size, n_z)
         ob_dict = merge_dicts(
             observed, {'z': z_obs_onehot})  # sum over all possible z
         model, _, _ = vae(ob_dict, n, n_x, n_h, n_z, n_particles)
         log_pz_i, log_ph_z_i, log_px_h = model.local_log_prob(
             ['z', 'h', 'x'])
         log_pz.append(log_pz_i)
         log_ph_z.append(log_ph_z_i)
     log_pz = tf.stack(log_pz, axis=-1)
     log_ph_z = tf.stack(log_ph_z, axis=-1)
     # P(X,H) = P(X|H) * sum[(P(H|z_i) * P(z_i))]
     return log_px_h + tf.reduce_logsumexp(log_pz + log_ph_z, axis=-1)
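
In log space, the return line above computes log p(x, h) = log p(x | h) + logsumexp_i [log p(z_i) + log p(h | z_i)], i.e. the discrete latent z is marginalized out exactly by enumerating all n_z one-hot values, which is what the comment's identity P(X,H) = P(X|H) * sum_i P(H|z_i) * P(z_i) expresses.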
Example #13
    def bn(self):
        """
        The :class:`~zhusuan.framework.bn.BayesianNet` constructed by
        observing the :attr:`meta_bn` with samples from the variational
        posterior distributions. ``None`` if the log joint probability
        function is provided instead of :attr:`meta_bn`.

        .. note::

            This :class:`~zhusuan.framework.bn.BayesianNet` instance is
            useful when computing predictions with the approximate posterior
            distribution.
        """
        if self._meta_bn:
            if not hasattr(self, "_bn"):
                self._bn = self._meta_bn.observe(
                    **merge_dicts(self._v_inputs, self._observed))
                self._validate_variational_inputs(self._bn)
            return self._bn
        else:
            return None
Example #14
def sgvb(log_joint, observed, latent, axis=None):
    """
    Implements the stochastic gradient variational Bayes (SGVB) algorithm
    from (Kingma, 2013). This only works for continuous latent
    `StochasticTensor` s that can be reparameterized (Kingma, 2013).

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in the variational lower bound. If `None`, no dimension
        is reduced.

    :return: A Tensor. The variational lower bound.
    """
    warnings.warn(
        "sgvb(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. The new sgvb gradient estimator can be "
        "accessed by first constructing the elbo objective (using "
        "`zs.variational.elbo`) and then calling its sgvb() method.",
        category=FutureWarning)
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    lower_bound = log_joint(joint_obs) - sum(latent_logpdfs)
    if axis is not None:
        lower_bound = tf.reduce_mean(lower_bound, axis)
    return lower_bound
Example #15
    def _define_model(self, x_dim):
        y_logstd = -1.

        @zs.meta_bayesian_net(scope="bnn", reuse_variables=True)
        def build_bnn(x, layer_sizes, logstds, n_particles):
            bn = zs.BayesianNet()
            h = tf.tile(x[None, ...], [n_particles, 1, 1])
            for i, (n_in, n_out) in enumerate(
                    zip(layer_sizes[:-1], layer_sizes[1:])):
                w = bn.normal("w" + str(i),
                              tf.zeros([n_out, n_in + 1]),
                              logstd=logstds[i],
                              group_ndims=2,
                              n_samples=n_particles)
                h = tf.concat([h, tf.ones(tf.shape(h)[:-1])[..., None]], -1)
                h = tf.einsum("imk,ijk->ijm", w, h) / tf.sqrt(
                    tf.cast(tf.shape(h)[2], tf.float32))
                if i < len(layer_sizes) - 2:
                    h = tf.nn.relu(h)

            y_mean = bn.deterministic("y_mean", tf.squeeze(h, 2))
            bn.normal("y", y_mean, logstd=y_logstd)
            return bn

        update_logstd = self.update_logstd

        x = tf.placeholder(tf.float32, shape=[None, x_dim])
        self.x = x
        y = tf.placeholder(tf.float32, shape=[None])
        time_decay = tf.placeholder(tf.float32, shape=[None])
        layer_sizes = [x_dim] + self.hidden_layer_sizes + [1]
        w_names = ["w" + str(i) for i in range(len(layer_sizes) - 1)]
        wv = []
        logstds = []
        for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            wv.append(
                tf.Variable(
                    tf.random_uniform([self.n_particles, n_out, n_in + 1]) *
                    4 - 2))

        if update_logstd:
            for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
                logstds.append(tf.Variable(tf.zeros([n_out, n_in + 1])))
        else:
            logstds = [20., 20., 20., 20.]
            print("No prior!")

        model = build_bnn(x, layer_sizes, logstds, self.n_particles)

        def log_joint(bn):
            log_pws = bn.cond_log_prob(w_names)
            log_py_xw = bn.cond_log_prob('y')
            return tf.add_n(log_pws) + tf.reduce_sum(time_decay * log_py_xw, 1)

        model.log_joint = log_joint

        if self.optimizer == 'sgld':
            sgmcmc = zs_sgmcmc.SGLD(learning_rate=self.learning_rate,
                                    add_noise=True)
        elif self.optimizer == 'sghmc':
            sgmcmc = zs_sgmcmc.SGHMC(learning_rate=self.learning_rate,
                                     friction=0.2,
                                     n_iter_resample_v=1000,
                                     second_order=True)
        latent = dict(zip(w_names, wv))
        observed = {'y': y}

        # E step: Sample the parameters
        sample_op, sgmcmc_info = sgmcmc.sample(model,
                                               observed=observed,
                                               latent=latent)

        if update_logstd:
            # M step: Update the logstd hyperparameters
            esti_logstds = [
                0.5 * tf.log(tf.reduce_mean(w * w, axis=0)) for w in wv
            ]
            output_logstds = dict(
                zip(w_names,
                    [0.5 * tf.log(tf.reduce_mean(w * w)) for w in wv]))
            assign_ops = [
                logstds[i].assign(logstd)
                for (i, logstd) in enumerate(esti_logstds)
            ]
            assign_op = tf.group(assign_ops)

        # prediction: rmse & log likelihood
        bn = model.observe(**merge_dicts(latent, observed))
        y_mean = bn["y_mean"]
        self.y_pred = tf.reduce_mean(y_mean, 0)
        self.a_std = np.exp(y_logstd)
        self.e_std = tf.sqrt(tf.reduce_mean((y_mean - self.y_pred)**2, 0))
        self.nll = tf.reduce_mean((self.y_pred - y)**2)

        self.sample_op = sample_op
        self.x = x
        self.y = y
        self.time_decay = time_decay
        if update_logstd:
            self.assign_op = assign_op
        self.update_logstd = update_logstd
Example #16
def vimco(log_joint, observed, latent, axis=None):
    """
    Implements the multi-sample variance reduced score function estimator for
    gradients of the variational lower bound from (Mnih, 2016). This works for
    both continuous and discrete latent `StochasticTensor` s.

    .. note::

        :func:`vimco` is a multi-sample objective; the size along `axis` in
        the objective must be larger than 1, otherwise an error is raised.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param axis: The sample dimension to reduce when computing the
        outer expectation in the variational lower bound. Must be specified. If
        `None`, an error is raised.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. The variational lower bound.
    """
    warnings.warn(
        "vimco(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. The new vimco gradient estimator can be "
        "accessed by first constructing the importance weighted "
        "objective (using `zs.variational.iw_objective`) and then "
        "calling its vimco() method.",
        category=FutureWarning)
    if axis is None:
        raise ValueError("vimco is a multi-sample objective, "
                         "the 'axis' argument must be specified.")

    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    l_signal = log_joint_value + entropy

    # check size along the sample axis
    err_msg = "vimco() is a multi-sample objective, " \
              "size along 'axis' in the objective should be larger than 1."
    if l_signal.get_shape()[axis:axis + 1].is_fully_defined():
        if l_signal.get_shape()[axis].value < 2:
            raise ValueError(err_msg)
    _assert_size_along_axis = tf.assert_greater_equal(tf.shape(l_signal)[axis],
                                                      2,
                                                      message=err_msg)
    with tf.control_dependencies([_assert_size_along_axis]):
        l_signal = tf.identity(l_signal)

    # compute variance reduction term
    mean_except_signal = (
        tf.reduce_sum(l_signal, axis, keep_dims=True) -
        l_signal) / tf.to_float(tf.shape(l_signal)[axis] - 1)
    x, sub_x = tf.to_float(l_signal), tf.to_float(mean_except_signal)

    n_dim = tf.rank(x)
    axis_dim_mask = tf.cast(tf.one_hot(axis, n_dim), tf.bool)
    original_mask = tf.cast(tf.one_hot(n_dim - 1, n_dim), tf.bool)
    axis_dim = tf.ones([n_dim], tf.int32) * axis
    originals = tf.ones([n_dim], tf.int32) * (n_dim - 1)
    perm = tf.where(original_mask, axis_dim, tf.range(n_dim))
    perm = tf.where(axis_dim_mask, originals, perm)
    multiples = tf.concat([tf.ones([n_dim], tf.int32), [tf.shape(x)[axis]]], 0)

    x = tf.transpose(x, perm=perm)
    sub_x = tf.transpose(sub_x, perm=perm)
    x_ex = tf.tile(tf.expand_dims(x, n_dim), multiples)
    x_ex = x_ex - tf.matrix_diag(x) + tf.matrix_diag(sub_x)
    control_variate = tf.transpose(log_mean_exp(x_ex, n_dim - 1), perm=perm)

    # variance reduced objective
    l_signal = log_mean_exp(l_signal, axis, keep_dims=True) - control_variate
    fake_term = tf.reduce_sum(-entropy * tf.stop_gradient(l_signal), axis)
    lower_bound = log_mean_exp(log_joint_value + entropy, axis)
    cost = -fake_term - log_mean_exp(log_joint_value + entropy, axis)

    return cost, lower_bound
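
A note on the control variate built above: for the k-th sample, the learning signal is the full multi-sample bound minus the same bound recomputed with log w_k replaced by the arithmetic mean of the other samples' values (mean_except_signal), a leave-one-out baseline evaluated inside the log-mean-exp. This is the VIMCO estimator, and it is also why `axis` must index a sample dimension of size at least 2.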
Example #17
def nvil(log_joint,
         observed,
         latent,
         baseline=None,
         decay=0.8,
         variance_normalization=False,
         axis=None):
    """
    Implements the variance reduced score function estimator for gradients
    of the variational lower bound from (Mnih, 2014). This algorithm is also
    called "REINFORCE" or "baseline". This works for both continuous and
    discrete latent `StochasticTensor` s.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint likelihood
        of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples and
        log probabilities.
    :param baseline: A Tensor that can broadcast to match the shape returned
        by `log_joint`. A trainable estimation for the scale of the
        variational lower bound, which is typically dependent on observed
        values, e.g., a neural network with observed values as inputs.
    :param variance_normalization: Whether to use variance normalization.
    :param decay: Float. The moving average decay for variance normalization.
    :param axis: The sample dimension(s) to reduce when computing the
        outer expectation in the variational lower bound. If `None`, no dimension
        is reduced.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. The variational lower bound.
    """
    warnings.warn(
        "nvil(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. The new nvil gradient estimator can be "
        "accessed by first constructing the elbo objective (using "
        "`zs.variational.elbo`) and then calling its reinforce() "
        "method.",
        category=FutureWarning)
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    l_signal = log_joint_value + entropy
    cost = 0.

    if baseline is not None:
        baseline_cost = 0.5 * tf.square(tf.stop_gradient(l_signal) - baseline)
        l_signal = l_signal - baseline
        cost += baseline_cost

    if variance_normalization is True:
        # TODO: extend to non-scalar
        bc = tf.reduce_mean(l_signal)
        bv = tf.reduce_mean(tf.square(l_signal - bc))
        moving_mean = tf.get_variable('moving_mean',
                                      shape=[],
                                      initializer=tf.constant_initializer(0.),
                                      trainable=False)
        moving_variance = tf.get_variable(
            'moving_variance',
            shape=[],
            initializer=tf.constant_initializer(1.),
            trainable=False)

        update_mean = moving_averages.assign_moving_average(moving_mean,
                                                            bc,
                                                            decay=decay)
        update_variance = moving_averages.assign_moving_average(
            moving_variance, bv, decay=decay)
        l_signal = (l_signal - moving_mean) / tf.maximum(
            1., tf.sqrt(moving_variance))
        with tf.control_dependencies([update_mean, update_variance]):
            l_signal = tf.identity(l_signal)

    fake_log_joint_cost = -log_joint_value
    fake_variational_cost = tf.stop_gradient(l_signal) * entropy
    cost += fake_log_joint_cost + fake_variational_cost
    lower_bound = log_joint_value + entropy
    if axis is not None:
        cost = tf.reduce_mean(cost, axis)
        lower_bound = tf.reduce_mean(lower_bound, axis)
    return cost, lower_bound
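
Two remarks grounded in the code above: the optional baseline enters twice, once subtracted from the learning signal to reduce its variance and once through the regression term 0.5 * (stop_gradient(l_signal) - baseline)**2, which trains the baseline itself toward the current signal; and with variance_normalization=True the centered signal is additionally standardized by exponential moving estimates of its mean and (lower-bounded) standard deviation before it multiplies the score-function term.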
Example #18
 def _get_log_posterior(var_list, observed):
     joint_obs = merge_dicts(dict(zip(latent_k, var_list)), observed)
     return self._log_joint(joint_obs)
Example #19
 def get_log_posterior(var_list):
     joint_obs = merge_dicts(dict(zip(latent_k, var_list)), observed)
     log_p = log_joint(joint_obs)
     return log_p
Example #20
def main():
    tf.set_random_seed(1237)
    np.random.seed(2345)

    # Load UCI protein data
    data_path = os.path.join(conf.data_dir, "protein.data")
    x_train, y_train, x_valid, y_valid, x_test, y_test = \
        dataset.load_uci_protein_data(data_path)
    x_train = np.vstack([x_train, x_valid])
    y_train = np.hstack([y_train, y_valid])
    n_train, x_dim = x_train.shape

    # Standardize data
    x_train, x_test, _, _ = dataset.standardize(x_train, x_test)
    y_train, y_test, mean_y_train, std_y_train = dataset.standardize(
        y_train, y_test)

    # Define model parameters
    n_hiddens = [50]

    # Build the computation graph
    n_particles = 20
    x = tf.placeholder(tf.float32, shape=[None, x_dim])
    y = tf.placeholder(tf.float32, shape=[None])
    layer_sizes = [x_dim] + n_hiddens + [1]
    w_names = ["w" + str(i) for i in range(len(layer_sizes) - 1)]
    wv = []
    logstds = []
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        wv.append(
            tf.Variable(
                tf.random_uniform([n_particles, n_out, n_in + 1]) * 4 - 2))
        logstds.append(tf.Variable(tf.zeros([n_out, n_in + 1])))

    model = build_bnn(x, layer_sizes, logstds, n_particles)

    def log_joint(bn):
        log_pws = bn.cond_log_prob(w_names)
        log_py_xw = bn.cond_log_prob('y')
        return tf.add_n(log_pws) + tf.reduce_mean(log_py_xw, 1) * n_train

    model.log_joint = log_joint

    # sgmcmc = zs.SGLD(learning_rate=4e-6)
    sgmcmc = zs.SGHMC(learning_rate=2e-6,
                      friction=0.2,
                      n_iter_resample_v=1000,
                      second_order=True)
    # sgmcmc = zs.SGNHT(learning_rate=1e-5, variance_extra=0., tune_rate=50.,
    #                   second_order=True)
    latent = dict(zip(w_names, wv))
    observed = {'y': y}

    # E step: Sample the parameters
    sample_op, sgmcmc_info = sgmcmc.sample(model,
                                           observed=observed,
                                           latent=latent)
    mean_k = sgmcmc_info.mean_k

    # M step: Update the logstd hyperparameters
    esti_logstds = [0.5 * tf.log(tf.reduce_mean(w * w, axis=0)) for w in wv]
    output_logstds = dict(
        zip(w_names, [0.5 * tf.log(tf.reduce_mean(w * w)) for w in wv]))
    assign_ops = [
        logstds[i].assign(logstd) for (i, logstd) in enumerate(esti_logstds)
    ]
    assign_op = tf.group(assign_ops)

    # prediction: rmse & log likelihood
    bn = model.observe(**merge_dicts(latent, observed))
    y_mean = bn["y_mean"]
    y_pred = tf.reduce_mean(y_mean, 0)

    # Define training/evaluation parameters
    epochs = 500
    batch_size = 100
    iters = (n_train - 1) // batch_size + 1

    preds = []
    epochs_ave_pred = 1

    # Run the inference
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, epochs + 1):
            perm = np.random.permutation(x_train.shape[0])
            x_train = x_train[perm, :]
            y_train = y_train[perm]
            for t in range(iters):
                x_batch = x_train[t * batch_size:(t + 1) * batch_size]
                y_batch = y_train[t * batch_size:(t + 1) * batch_size]
                _, mean_k_value = sess.run([sample_op, mean_k],
                                           feed_dict={
                                               x: x_batch,
                                               y: y_batch
                                           })
            # print("Epoch {} mean_k = {}".format(epoch, mean_k_value))
            sess.run(assign_op)

            test_pred = sess.run(y_pred, feed_dict={x: x_test})
            preds.append(test_pred)
            pred = np.mean(preds[-epochs_ave_pred:], axis=0)

            test_rmse = np.sqrt(np.mean((pred - y_test)**2)) * std_y_train
            print('>> Epoch {} Test = {} logstds = {}'.format(
                epoch, test_rmse, sess.run(output_logstds)))