def is_loglikelihood(log_joint, observed, latent, axis=None):
    r"""
    Marginal log likelihood (:math:`\log p(x)`) estimates using
    self-normalized importance sampling.

    The docstring is a raw string so that the ``\log`` in the math role is
    not parsed as an (invalid) escape sequence.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint
        likelihood of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples
        and log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the outer
        expectation in the importance sampling estimation. If None, no
        dimension is reduced.

    :return: A Tensor. The estimated log likelihood of observed data.
    """
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    # Each latent value is a (sample, log_prob) pair; split the two parts.
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    # Log importance weight: log joint minus the summed latent log probs.
    log_w = log_joint(joint_obs) - sum(latent_logpdfs)
    if axis is not None:
        return log_mean_exp(log_w, axis)
    return log_w
def __init__(self, log_prior, log_joint, prior_sampler, hmc, observed, latent,
             n_chains, n_temperatures):
    """
    Build the tempered objective and the graph ops used by the sampler.

    :param log_prior: A function mapping a dictionary of observations to the
        log density used at temperature 0.
    :param log_joint: A function mapping a dictionary of observations to the
        log density used at temperature 1.
    :param prior_sampler: A mapping whose values are assigned to the latent
        variables by ``self.init_latent``.
    :param hmc: An object whose ``sample(log_fn, observed, latent)`` returns
        a ``(sample_op, info)`` pair.
    :param observed: A dictionary of observed values.
    :param latent: A mapping from names to the latent variables that are
        assigned to and sampled.
    :param n_chains: Stored as ``self.n_chains``.
    :param n_temperatures: Stored as ``self.n_temperatures``.
    """
    # Shape of latent: [chain_axis * num_data, data dims]
    # Construct the tempered objective
    self.prior_sampler = prior_sampler
    self.latent = latent
    self.n_chains = n_chains
    self.n_temperatures = n_temperatures

    with tf.name_scope("BDMC"):
        self.temperature = tf.placeholder(tf.float32, shape=[],
                                          name="temperature")

        def log_fn(observed):
            # Linear interpolation between the two log densities.
            return log_prior(observed) * (1 - self.temperature) + \
                log_joint(observed) * self.temperature

        self.log_fn = log_fn
        self.log_fn_val = log_fn(merge_dicts(observed, latent))
        self.sample_op, self.hmc_info = hmc.sample(log_fn, observed, latent)

        # Pair each latent variable with its prior sample by key, so the
        # result does not depend on the iteration order of two independent
        # dicts. If the key sets differ, keep the positional pairing that
        # this constructor has always used.
        if set(latent) == set(self.prior_sampler):
            pairs = [(latent[name], self.prior_sampler[name])
                     for name in latent]
        else:
            pairs = zip(latent.values(), self.prior_sampler.values())
        self.init_latent = [tf.assign(z, z_s) for z, z_s in pairs]
def _log_joint_term(self):
    """
    Return the log joint term of the objective.

    When ``self._meta_bn`` is truthy the value comes from
    ``self.bn.log_joint()``. Otherwise ``self._log_joint`` is called once on
    the merged variational inputs and observations, and the result is cached
    on the instance as ``_log_joint_cache``.
    """
    if self._meta_bn:
        return self.bn.log_joint()
    if hasattr(self, '_log_joint_cache'):
        return self._log_joint_cache
    all_inputs = merge_dicts(self._v_inputs, self._observed)
    self._log_joint_cache = self._log_joint(all_inputs)
    return self._log_joint_cache
def sgvb(log_joint, observed, latent, axis=None):
    """
    Stochastic gradient variational Bayes (SGVB) lower bound from
    (Kingma, 2013).

    Only applicable to continuous latent `StochasticTensor` s that can be
    reparameterized (Kingma, 2013).

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, mapping all `StochasticTensor` names in
        the model to their observed values, and returns a Tensor holding the
        log joint likelihood of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples
        and log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the outer
        expectation in the variational lower bound. If `None`, no dimension
        is reduced.

    :return: A Tensor. The variational lower bound.
    """
    names, pairs = map(list, zip(*six.iteritems(latent)))
    samples = {name: pair[0] for name, pair in zip(names, pairs)}
    log_qs = [pair[1] for pair in pairs]
    everything_observed = merge_dicts(observed, samples)
    bound = log_joint(everything_observed) - sum(log_qs)
    if axis is None:
        return bound
    return tf.reduce_mean(bound, axis)
def __init__(self, meta_bn, proposal_meta_bn, hmc, observed, latent,
             n_temperatures=1000, n_adapt=30, verbose=False):
    """
    Build the tempered objective and the graph ops used by the AIS loop.

    :param meta_bn: Either a callable used directly as the log joint
        function, or an object whose ``observe(**obs).log_joint()`` gives
        the log joint.
    :param proposal_meta_bn: An object whose ``observe()`` provides both the
        initial latent samples (via ``.get(names)``) and the log density used
        at temperature 0 (via ``.log_joint()``).
    :param hmc: An object whose ``sample(log_fn, observed, latent)`` returns
        a ``(sample_op, info)`` pair.
    :param observed: A dictionary of observed values.
    :param latent: A dictionary from names to the latent variables that are
        assigned to and sampled.
    :param n_temperatures: Stored as ``self._n_temperatures``.
    :param n_adapt: Stored as ``self._n_adapt``.
    :param verbose: Stored as ``self._verbose``.
    """
    # Shape of latent: [chain_axis, num_data, data dims]
    self._n_temperatures = n_temperatures
    self._n_adapt = n_adapt
    self._verbose = verbose

    with tf.name_scope("AIS"):
        if callable(meta_bn):
            log_joint = meta_bn
        else:
            def log_joint(obs):
                return meta_bn.observe(**obs).log_joint()

        names, variables = zip(*six.iteritems(latent))
        initial_samples = proposal_meta_bn.observe().get(names)

        def log_prior(obs):
            return proposal_meta_bn.observe(**obs).log_joint()

        self.temperature = tf.placeholder(
            tf.float32, shape=[], name="temperature")

        def log_fn(observed):
            # Tempered objective: interpolate proposal and target densities.
            return log_prior(observed) * (1 - self.temperature) + \
                log_joint(observed) * self.temperature

        self.log_fn = log_fn
        self.log_fn_val = log_fn(merge_dicts(observed, latent))
        self.sample_op, self.hmc_info = hmc.sample(log_fn, observed, latent)

        assign_ops = []
        for variable, sample in zip(variables, initial_samples):
            assign_ops.append(tf.assign(variable, sample))
        self.init_latent = assign_ops
def run(self, sess, feed_dict):
    """
    Run the AIS loop.

    :param sess: A Tensorflow session.
    :param feed_dict: The `feed_dict` argument for ``session.run``.

    :return: The log marginal likelihood estimate.
    """
    total_steps = self._n_temperatures

    # Warm-up phase: run the sampler at a fixed early temperature so that
    # the HMC step size can adapt before the real annealing run.
    warmup_index = 2 if total_steps > 1 else 1
    warmup_temperature = self._get_schedule_t(warmup_index)
    sess.run(self.init_latent, feed_dict=feed_dict)
    for it in range(self._n_adapt):
        warmup_feed = merge_dicts(feed_dict,
                                  {self.temperature: warmup_temperature})
        _, acc = sess.run(
            [self.sample_op, self.hmc_info.acceptance_rate],
            feed_dict=warmup_feed)
        if self._verbose:
            print('Adapt iter {}, acc = {:.3f}'.format(it, np.mean(acc)))

    # Draw a sample from the prior
    sess.run(self.init_latent, feed_dict=feed_dict)
    prior_density = sess.run(
        self.log_fn_val,
        feed_dict=merge_dicts(feed_dict, {self.temperature: 0}))
    log_weights = -prior_density

    fetches = [self.sample_op,
               self.hmc_info.orig_log_prob,
               self.hmc_info.log_prob,
               self.hmc_info.acceptance_rate]
    for step in range(1, total_steps + 1):
        current_temperature = self._get_schedule_t(step)
        _, old_log_p, new_log_p, acc = sess.run(
            fetches,
            feed_dict=merge_dicts(
                feed_dict, {self.temperature: current_temperature}))
        if step < total_steps:
            log_weights += old_log_p - new_log_p
        else:
            # The final step only adds the density before the transition.
            log_weights += old_log_p
        if self._verbose:
            print('Finished step {}, Temperature = {:.4f}, acc = {:.3f}'
                  .format(step, current_temperature, np.mean(acc)))

    return np.mean(self._get_lower_bound(log_weights))
def run(self, sess, feed_dict):
    """
    Run a forward annealing pass followed by a backward one.

    :param sess: A Tensorflow session.
    :param feed_dict: The `feed_dict` argument for ``session.run``.

    :return: A ``(ll_lb, ll_ub)`` pair, computed from the forward and the
        backward pass respectively.
    """
    total_steps = self.n_temperatures
    fetches = [self.sample_op,
               self.hmc_info.orig_log_prob,
               self.hmc_info.log_prob]

    # Draw a sample from the prior
    sess.run(self.init_latent, feed_dict=feed_dict)
    prior_density = sess.run(
        self.log_fn_val,
        feed_dict=merge_dicts(feed_dict, {self.temperature: 0}))
    log_weights = -prior_density

    # Forward AIS: temperature rises linearly to 1.
    for step in range(1, total_steps + 1):
        current_temperature = 1.0 / total_steps * step
        new_feed_dict = feed_dict.copy()
        new_feed_dict[self.temperature] = current_temperature
        _, old_log_p, new_log_p = sess.run(fetches, feed_dict=new_feed_dict)
        if step < total_steps:
            log_weights += old_log_p - new_log_p
        else:
            log_weights += old_log_p
    ll_lb = np.mean(self.get_lower_bound(log_weights))

    # Backward AIS: start from the last forward density and cool to 0.
    log_weights = -new_log_p
    for step in range(1, total_steps + 1):
        current_temperature = 1.0 - 1.0 / total_steps * step
        _, old_log_p, new_log_p = sess.run(
            fetches,
            feed_dict=merge_dicts(
                feed_dict, {self.temperature: current_temperature}))
        if step < total_steps:
            log_weights += old_log_p - new_log_p
        else:
            log_weights += old_log_p
    ll_ub = -np.mean(self.get_lower_bound(log_weights))

    return ll_lb, ll_ub
def run(self, sess, feed_dict):
    """
    Run the AIS loop, preceded by 30 sampler warm-up iterations.

    :param sess: A Tensorflow session.
    :param feed_dict: The `feed_dict` argument for ``session.run``.

    :return: The mean of ``self.get_lower_bound`` applied to the accumulated
        log weights.
    """
    total_steps = self.n_temperatures

    # Help adapt the hmc size
    n_adp = 30
    warmup_index = 2 if total_steps > 1 else 1
    warmup_temperature = self.get_schedule_t(warmup_index)
    sess.run(self.init_latent, feed_dict=feed_dict)
    for it in range(n_adp):
        warmup_feed = merge_dicts(feed_dict,
                                  {self.temperature: warmup_temperature})
        _, acc = sess.run(
            [self.sample_op, self.hmc_info.acceptance_rate],
            feed_dict=warmup_feed)
        if self.verbose:
            print('Adapt iter {}, acc = {:.3f}'.format(it, np.mean(acc)))

    # Draw a sample from the prior
    sess.run(self.init_latent, feed_dict=feed_dict)
    prior_density = sess.run(
        self.log_fn_val,
        feed_dict=merge_dicts(feed_dict, {self.temperature: 0}))
    log_weights = -prior_density

    fetches = [self.sample_op,
               self.hmc_info.orig_log_prob,
               self.hmc_info.log_prob,
               self.hmc_info.acceptance_rate]
    for step in range(1, total_steps + 1):
        current_temperature = self.get_schedule_t(step)
        _, old_log_p, new_log_p, acc = sess.run(
            fetches,
            feed_dict=merge_dicts(
                feed_dict, {self.temperature: current_temperature}))
        if step < total_steps:
            log_weights += old_log_p - new_log_p
        else:
            # The final step only adds the density before the transition.
            log_weights += old_log_p
        if self.verbose:
            print('Finished step {}, Temperature = {:.4f}, acc = {:.3f}'
                  .format(step, current_temperature, np.mean(acc)))

    return np.mean(self.get_lower_bound(log_weights))
def rws(log_joint, observed, latent, axis=None):
    """
    Reweighted Wake-sleep from (Bornschein, 2015). Works for both continuous
    and discrete latent `StochasticTensor` s.

    Calling this function emits a `FutureWarning` pointing to the
    replacements in `zs.variational`.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, mapping all `StochasticTensor` names in
        the model to their observed values, and returns a Tensor holding the
        log joint likelihood of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples
        and log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the outer
        expectation in log likelihood and in the cost for adapting
        proposals. If `None`, no dimension is reduced.

    :return: A ``(cost, log_likelihood)`` pair of Tensors: the surrogate
        cost to minimize and the estimated log likelihoods.
    """
    warnings.warn(
        "rws(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. Features of the original rws() can be "
        "achieved by two new variational objectives. For learning "
        "model parameters, please use the importance weighted "
        "objective: `zs.variational.iw_objective()`. For adapting "
        "the proposal, the new rws gradient estimator can be "
        "accessed by first constructing the inclusive KL divergence "
        "objective using `zs.variational.klpq` and then calling "
        "its rws() method.", category=FutureWarning)

    names, pairs = map(list, zip(*six.iteritems(latent)))
    samples = {name: pair[0] for name, pair in zip(names, pairs)}
    log_qs = [pair[1] for pair in pairs]
    log_joint_value = log_joint(merge_dicts(observed, samples))
    entropy = -sum(log_qs)
    log_w = log_joint_value + entropy

    if axis is None:
        # Nothing to reduce: both outputs are the raw log weights.
        return log_w, log_w

    # Self-normalized weights, shifted by the max before exponentiating and
    # treated as constants in the gradient.
    log_w_max = tf.reduce_max(log_w, axis, keep_dims=True)
    w_u = tf.exp(log_w - log_w_max)
    w_tilde = tf.stop_gradient(
        w_u / tf.reduce_sum(w_u, axis, keep_dims=True))

    log_likelihood = log_mean_exp(log_w, axis)
    fake_log_joint_cost = -tf.reduce_sum(w_tilde * log_joint_value, axis)
    fake_proposal_cost = tf.reduce_sum(w_tilde * entropy, axis)
    cost = fake_log_joint_cost + fake_proposal_cost
    return cost, log_likelihood
def __init__(self, log_joint, observed, latent):
    """
    Store the model inputs and split the latent ``(sample, log_prob)`` pairs.

    :param log_joint: The log joint function; stored as ``self._log_joint``.
    :param observed: A dictionary of observed values.
    :param latent: A dictionary mapping each latent name to a
        ``(sample, log_prob)`` pair.
    """
    self._log_joint = log_joint
    self._observed = observed
    self._latent = latent
    # TODO: Add input name check by matching them
    names, pairs = map(list, zip(*six.iteritems(latent)))
    self._latent_outputs = {
        name: pair[0] for name, pair in zip(names, pairs)}
    self._latent_logpdfs = {
        name: pair[1] for name, pair in zip(names, pairs)}
    self._joint_obs = merge_dicts(observed, self._latent_outputs)
    # A hashable summary of the inputs; None when any part is unhashable.
    try:
        self._dict_key = (
            log_joint,
            frozenset(names),
            frozenset(map(tuple, pairs)),
            frozenset(six.iteritems(observed)),
        )
    except TypeError:
        # Unhashable type
        self._dict_key = None
def rws(log_joint, observed, latent, axis=None):
    """
    Reweighted Wake-sleep from (Bornschein, 2015). Works for both continuous
    and discrete latent `StochasticTensor` s.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, mapping all `StochasticTensor` names in
        the model to their observed values, and returns a Tensor holding the
        log joint likelihood of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples
        and log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the outer
        expectation in log likelihood and in the cost for adapting
        proposals. If `None`, no dimension is reduced.

    :return: A ``(cost, log_likelihood)`` pair of Tensors: the surrogate
        cost to minimize and the estimated log likelihoods.
    """
    names, pairs = map(list, zip(*six.iteritems(latent)))
    samples = {name: pair[0] for name, pair in zip(names, pairs)}
    log_qs = [pair[1] for pair in pairs]
    log_joint_value = log_joint(merge_dicts(observed, samples))
    entropy = -sum(log_qs)
    log_w = log_joint_value + entropy

    if axis is None:
        # Nothing to reduce: both outputs are the raw log weights.
        return log_w, log_w

    # Self-normalized weights, shifted by the max before exponentiating and
    # treated as constants in the gradient.
    log_w_max = tf.reduce_max(log_w, axis, keep_dims=True)
    w_u = tf.exp(log_w - log_w_max)
    w_tilde = tf.stop_gradient(
        w_u / tf.reduce_sum(w_u, axis, keep_dims=True))

    log_likelihood = log_mean_exp(log_w, axis)
    fake_log_joint_cost = -tf.reduce_sum(w_tilde * log_joint_value, axis)
    fake_proposal_cost = tf.reduce_sum(w_tilde * entropy, axis)
    cost = fake_log_joint_cost + fake_proposal_cost
    return cost, log_likelihood
def log_joint(observed):
    """
    Log joint with the discrete variable ``z`` summed out.

    For each of the ``n_z`` values, the model is rebuilt with ``z`` observed
    as the corresponding one-hot tensor, and the local log probabilities of
    ``z``, ``h`` and ``x`` are collected. The ``x`` term is taken from the
    last model built.

    :param observed: A dictionary of observed values, extended with ``'z'``
        on every iteration.

    :return: ``log_px_h`` plus the log-sum-exp over ``z`` of
        ``log_pz + log_ph_z``.
    """
    per_z_log_pz = []
    per_z_log_ph_z = []
    log_px_h = None
    for z_index in range(n_z):
        # (n_particles, batch_size, n_z)
        z_obs_onehot = tf.one_hot(
            z_index * tf.ones([n_particles, batch_size], dtype=tf.int32),
            depth=n_z, dtype=tf.int32)
        # sum over all possible z
        ob_dict = merge_dicts(observed, {'z': z_obs_onehot})
        model, _, _ = vae(ob_dict, n, n_x, n_h, n_z, n_particles)
        log_pz_i, log_ph_z_i, log_px_h = model.local_log_prob(
            ['z', 'h', 'x'])
        per_z_log_pz.append(log_pz_i)
        per_z_log_ph_z.append(log_ph_z_i)

    stacked_log_pz = tf.stack(per_z_log_pz, axis=-1)
    stacked_log_ph_z = tf.stack(per_z_log_ph_z, axis=-1)
    # P(X,H) = P(X|H) * sum[(P(H|z_i) * P(z_i))]
    marginal_log_ph = tf.reduce_logsumexp(
        stacked_log_pz + stacked_log_ph_z, axis=-1)
    return log_px_h + marginal_log_ph
def bn(self):
    """
    The :class:`~zhusuan.framework.bn.BayesianNet` constructed by observing
    the :attr:`meta_bn` with samples from the variational posterior
    distributions. ``None`` if the log joint probability function is provided
    instead of :attr:`meta_bn`.

    The net is built and validated on first access, then cached on the
    instance as ``_bn``.

    .. note::

        This :class:`~zhusuan.framework.bn.BayesianNet` instance is useful
        when computing predictions with the approximate posterior
        distribution.
    """
    if not self._meta_bn:
        return None
    if hasattr(self, "_bn"):
        return self._bn
    all_inputs = merge_dicts(self._v_inputs, self._observed)
    self._bn = self._meta_bn.observe(**all_inputs)
    self._validate_variational_inputs(self._bn)
    return self._bn
def sgvb(log_joint, observed, latent, axis=None):
    """
    Stochastic gradient variational Bayes (SGVB) lower bound from
    (Kingma, 2013).

    Only applicable to continuous latent `StochasticTensor` s that can be
    reparameterized (Kingma, 2013). Calling this function emits a
    `FutureWarning` pointing to the replacement in `zs.variational`.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, mapping all `StochasticTensor` names in
        the model to their observed values, and returns a Tensor holding the
        log joint likelihood of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples
        and log probabilities.
    :param axis: The sample dimension(s) to reduce when computing the outer
        expectation in the variational lower bound. If `None`, no dimension
        is reduced.

    :return: A Tensor. The variational lower bound.
    """
    warnings.warn(
        "sgvb(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. The new sgvb gradient estimator can be "
        "accessed by first constructing the elbo objective (using "
        "`zs.variational.elbo` and then calling its sgvb() method.",
        category=FutureWarning)

    names, pairs = map(list, zip(*six.iteritems(latent)))
    samples = {name: pair[0] for name, pair in zip(names, pairs)}
    log_qs = [pair[1] for pair in pairs]
    everything_observed = merge_dicts(observed, samples)
    bound = log_joint(everything_observed) - sum(log_qs)
    if axis is None:
        return bound
    return tf.reduce_mean(bound, axis)
def _define_model(self, x_dim):
    """
    Build the Bayesian neural network graph and attach its ops to ``self``.

    Attributes set on ``self``: ``x``, ``y``, ``time_decay`` (placeholders),
    ``sample_op``, ``y_pred``, ``a_std``, ``e_std``, ``nll``,
    ``update_logstd`` and, when ``self.update_logstd`` is truthy,
    ``assign_op``.

    :param x_dim: Size of the last dimension of the input placeholder.

    :raises ValueError: If ``self.optimizer`` is neither ``'sgld'`` nor
        ``'sghmc'``.
    """
    y_logstd = -1.

    @zs.meta_bayesian_net(scope="bnn", reuse_variables=True)
    def build_bnn(x, layer_sizes, logstds, n_particles):
        bn = zs.BayesianNet()
        h = tf.tile(x[None, ...], [n_particles, 1, 1])
        for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1],
                                              layer_sizes[1:])):
            w = bn.normal("w" + str(i), tf.zeros([n_out, n_in + 1]),
                          logstd=logstds[i], group_ndims=2,
                          n_samples=n_particles)
            # Append a column of ones to h; w has n_in + 1 input columns.
            h = tf.concat([h, tf.ones(tf.shape(h)[:-1])[..., None]], -1)
            h = tf.einsum("imk,ijk->ijm", w, h) / tf.sqrt(
                tf.cast(tf.shape(h)[2], tf.float32))
            if i < len(layer_sizes) - 2:
                h = tf.nn.relu(h)

        y_mean = bn.deterministic("y_mean", tf.squeeze(h, 2))
        bn.normal("y", y_mean, logstd=y_logstd)
        return bn

    update_logstd = self.update_logstd

    x = tf.placeholder(tf.float32, shape=[None, x_dim])
    y = tf.placeholder(tf.float32, shape=[None])
    time_decay = tf.placeholder(tf.float32, shape=[None])
    layer_sizes = [x_dim] + self.hidden_layer_sizes + [1]
    w_names = ["w" + str(i) for i in range(len(layer_sizes) - 1)]

    wv = []
    logstds = []
    for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        wv.append(
            tf.Variable(
                tf.random_uniform([self.n_particles, n_out, n_in + 1]) * 4 -
                2))

    if update_logstd:
        for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            logstds.append(tf.Variable(tf.zeros([n_out, n_in + 1])))
    else:
        # One fixed log-std per weight layer, so networks with more than
        # four layers no longer index past the end of the list.
        logstds = [20.] * len(w_names)
        print("No prior!")

    model = build_bnn(x, layer_sizes, logstds, self.n_particles)

    def log_joint(bn):
        log_pws = bn.cond_log_prob(w_names)
        log_py_xw = bn.cond_log_prob('y')
        # Per-example log likelihoods are weighted by time_decay.
        return tf.add_n(log_pws) + tf.reduce_sum(time_decay * log_py_xw, 1)

    model.log_joint = log_joint

    if self.optimizer == 'sgld':
        sgmcmc = zs_sgmcmc.SGLD(learning_rate=self.learning_rate,
                                add_noise=True)
    elif self.optimizer == 'sghmc':
        sgmcmc = zs_sgmcmc.SGHMC(learning_rate=self.learning_rate,
                                 friction=0.2,
                                 n_iter_resample_v=1000,
                                 second_order=True)
    else:
        # Fail with a clear message instead of an unbound-local error below.
        raise ValueError(
            "Unsupported optimizer {!r}; expected 'sgld' or 'sghmc'."
            .format(self.optimizer))

    latent = dict(zip(w_names, wv))
    observed = {'y': y}

    # E step: Sample the parameters
    sample_op, sgmcmc_info = sgmcmc.sample(model,
                                           observed=observed,
                                           latent=latent)

    if update_logstd:
        # M step: Update the logstd hyperparameters
        esti_logstds = [
            0.5 * tf.log(tf.reduce_mean(w * w, axis=0)) for w in wv
        ]
        # Not used within this method; kept so the ops created here stay
        # the same as before.
        output_logstds = dict(
            zip(w_names,
                [0.5 * tf.log(tf.reduce_mean(w * w)) for w in wv]))
        assign_ops = [
            logstds[i].assign(logstd)
            for (i, logstd) in enumerate(esti_logstds)
        ]
        assign_op = tf.group(assign_ops)

    # prediction: rmse & log likelihood
    bn = model.observe(**merge_dicts(latent, observed))
    y_mean = bn["y_mean"]
    self.y_pred = tf.reduce_mean(y_mean, 0)
    self.a_std = np.exp(y_logstd)
    self.e_std = tf.sqrt(tf.reduce_mean((y_mean - self.y_pred)**2, 0))
    self.nll = tf.reduce_mean((self.y_pred - y)**2)

    self.sample_op = sample_op
    self.x = x
    self.y = y
    self.time_decay = time_decay
    if update_logstd:
        self.assign_op = assign_op
    self.update_logstd = update_logstd
def vimco(log_joint, observed, latent, axis=None):
    """
    Implements the multi-sample variance reduced score function estimator for
    gradients of the variational lower bound from (Mnih, 2016). This works
    for both continuous and discrete latent `StochasticTensor` s.

    Calling this function emits a `FutureWarning` pointing to the
    replacement in `zs.variational`.

    .. note::

        :func:`vimco` is a multi-sample objective, size along `axis` in the
        objective should be larger than 1, else an error is raised.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, which are mappings from all
        `StochasticTensor` names in the model to their observed values. The
        function should return a Tensor, representing the log joint
        likelihood of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples
        and log probabilities.
    :param axis: The sample dimension to reduce when computing the outer
        expectation in variational lower bound. Must be specified. If `None`,
        an error is raised.

    :return: A Tensor. The surrogate cost to minimize.
    :return: A Tensor. The variational lower bound.

    :raises ValueError: If `axis` is `None`, or if the static size along
        `axis` is known and smaller than 2.
    """
    warnings.warn(
        "vimco(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. The new vimco gradient estimator can be "
        "accessed by first constructing the importance weighted "
        "objective (using `zs.variational.iw_objective` and then "
        "calling its vimco() method.", category=FutureWarning)
    if axis is None:
        raise ValueError("vimco is a multi-sample objective, "
                         "the 'axis' argument must be specified.")

    # Split each latent (sample, log_prob) pair.
    latent_k, latent_v = map(list, zip(*six.iteritems(latent)))
    latent_outputs = dict(zip(latent_k, map(lambda x: x[0], latent_v)))
    latent_logpdfs = map(lambda x: x[1], latent_v)
    joint_obs = merge_dicts(observed, latent_outputs)
    log_joint_value = log_joint(joint_obs)
    entropy = -sum(latent_logpdfs)
    l_signal = log_joint_value + entropy

    # check size along the sample axis
    err_msg = "vimco() is a multi-sample objective, " \
              "size along 'axis' in the objective should be larger than 1."
    # Static check when the size along `axis` is known at graph build time.
    if l_signal.get_shape()[axis:axis + 1].is_fully_defined():
        if l_signal.get_shape()[axis].value < 2:
            raise ValueError(err_msg)
    # Runtime check, attached to l_signal through a control dependency.
    _assert_size_along_axis = tf.assert_greater_equal(
        tf.shape(l_signal)[axis], 2, message=err_msg)
    with tf.control_dependencies([_assert_size_along_axis]):
        l_signal = tf.identity(l_signal)

    # compute variance reduction term
    # For each sample: the mean of the other samples along `axis`.
    mean_except_signal = (
        tf.reduce_sum(l_signal, axis, keep_dims=True) - l_signal
    ) / tf.to_float(tf.shape(l_signal)[axis] - 1)
    x, sub_x = tf.to_float(l_signal), tf.to_float(mean_except_signal)

    # Build a permutation that swaps `axis` with the last dimension.
    n_dim = tf.rank(x)
    axis_dim_mask = tf.cast(tf.one_hot(axis, n_dim), tf.bool)
    original_mask = tf.cast(tf.one_hot(n_dim - 1, n_dim), tf.bool)
    axis_dim = tf.ones([n_dim], tf.int32) * axis
    originals = tf.ones([n_dim], tf.int32) * (n_dim - 1)
    perm = tf.where(original_mask, axis_dim, tf.range(n_dim))
    perm = tf.where(axis_dim_mask, originals, perm)
    multiples = tf.concat([tf.ones([n_dim], tf.int32),
                           [tf.shape(x)[axis]]], 0)

    x = tf.transpose(x, perm=perm)
    sub_x = tf.transpose(sub_x, perm=perm)
    # Tile x along a new trailing dimension, then replace the diagonal
    # entries (taken from x) with the corresponding entries of sub_x.
    x_ex = tf.tile(tf.expand_dims(x, n_dim), multiples)
    x_ex = x_ex - tf.matrix_diag(x) + tf.matrix_diag(sub_x)
    # Reduce over dimension n_dim - 1, then undo the swap.
    control_variate = tf.transpose(log_mean_exp(x_ex, n_dim - 1), perm=perm)

    # variance reduced objective
    l_signal = log_mean_exp(l_signal, axis, keep_dims=True) - control_variate
    # The learning signal is held constant when differentiating.
    fake_term = tf.reduce_sum(-entropy * tf.stop_gradient(l_signal), axis)
    lower_bound = log_mean_exp(log_joint_value + entropy, axis)
    cost = -fake_term - log_mean_exp(log_joint_value + entropy, axis)

    return cost, lower_bound
def nvil(log_joint, observed, latent, baseline=None, decay=0.8,
         variance_normalization=False, axis=None):
    """
    Variance reduced score function estimator for gradients of the
    variational lower bound from (Mnih, 2014), also called "REINFORCE" or
    "baseline". Works for both continuous and discrete latent
    `StochasticTensor` s.

    Calling this function emits a `FutureWarning` pointing to the
    replacement in `zs.variational`.

    :param log_joint: A function that accepts a dictionary argument of
        ``(string, Tensor)`` pairs, mapping all `StochasticTensor` names in
        the model to their observed values, and returns a Tensor holding the
        log joint likelihood of the model.
    :param observed: A dictionary of ``(string, Tensor)`` pairs. Mapping from
        names of observed `StochasticTensor` s to their values.
    :param latent: A dictionary of ``(string, (Tensor, Tensor))`` pairs.
        Mapping from names of latent `StochasticTensor` s to their samples
        and log probabilities.
    :param baseline: A Tensor that can broadcast to match the shape returned
        by `log_joint`. A trainable estimation for the scale of the
        variational lower bound, which is typically dependent on observed
        values, e.g., a neural network with observed values as inputs.
    :param decay: Float. The moving average decay for variance
        normalization.
    :param variance_normalization: Whether to use variance normalization.
        Only the literal ``True`` enables it.
    :param axis: The sample dimension(s) to reduce when computing the outer
        expectation in variational lower bound. If `None`, no dimension is
        reduced.

    :return: A ``(cost, lower_bound)`` pair of Tensors: the surrogate cost
        to minimize and the variational lower bound.
    """
    warnings.warn(
        "nvil(): This function will be deprecated in the coming "
        "version (0.3.1). Variational utilities are moving to "
        "`zs.variational`. The new nvil gradient estimator can be "
        "accessed by first constructing the elbo objective (using "
        "`zs.variational.elbo` and then calling its reinforce() "
        "method.", category=FutureWarning)

    names, pairs = map(list, zip(*six.iteritems(latent)))
    samples = {name: pair[0] for name, pair in zip(names, pairs)}
    log_qs = [pair[1] for pair in pairs]
    log_joint_value = log_joint(merge_dicts(observed, samples))
    entropy = -sum(log_qs)
    l_signal = log_joint_value + entropy
    cost = 0.

    if baseline is not None:
        # Squared error that trains the baseline toward the learning signal;
        # the signal itself is treated as a constant here.
        baseline_cost = 0.5 * tf.square(
            tf.stop_gradient(l_signal) - baseline)
        l_signal = l_signal - baseline
        cost += baseline_cost

    if variance_normalization is True:
        # TODO: extend to non-scalar
        batch_center = tf.reduce_mean(l_signal)
        batch_variance = tf.reduce_mean(tf.square(l_signal - batch_center))
        moving_mean = tf.get_variable(
            'moving_mean', shape=[],
            initializer=tf.constant_initializer(0.),
            trainable=False)
        moving_variance = tf.get_variable(
            'moving_variance', shape=[],
            initializer=tf.constant_initializer(1.),
            trainable=False)
        update_mean = moving_averages.assign_moving_average(
            moving_mean, batch_center, decay=decay)
        update_variance = moving_averages.assign_moving_average(
            moving_variance, batch_variance, decay=decay)
        l_signal = (l_signal - moving_mean) / tf.maximum(
            1., tf.sqrt(moving_variance))
        # Tie the moving-average updates to the normalized signal.
        with tf.control_dependencies([update_mean, update_variance]):
            l_signal = tf.identity(l_signal)

    fake_log_joint_cost = -log_joint_value
    fake_variational_cost = tf.stop_gradient(l_signal) * entropy
    cost += fake_log_joint_cost + fake_variational_cost
    lower_bound = log_joint_value + entropy

    if axis is None:
        return cost, lower_bound
    cost = tf.reduce_mean(cost, axis)
    lower_bound = tf.reduce_mean(lower_bound, axis)
    return cost, lower_bound
def _get_log_posterior(var_list, observed):
    """
    Evaluate ``self._log_joint`` with the latent names from the enclosing
    scope (``latent_k``) bound to ``var_list`` and merged with ``observed``.
    """
    latent_values = dict(zip(latent_k, var_list))
    return self._log_joint(merge_dicts(latent_values, observed))
def get_log_posterior(var_list):
    """
    Evaluate ``log_joint`` with the latent names from the enclosing scope
    (``latent_k``) bound to ``var_list`` and merged with the enclosing
    ``observed``.
    """
    latent_values = dict(zip(latent_k, var_list))
    return log_joint(merge_dicts(latent_values, observed))
def main():
    """
    Train a Bayesian neural network on the UCI protein data with SGHMC and
    print the test RMSE after every epoch.

    Each epoch runs the sampler over shuffled mini-batches (E step) and then
    re-estimates the prior log-stds from the current particles (M step).
    """
    tf.set_random_seed(1237)
    np.random.seed(2345)

    # Load UCI protein data
    data_path = os.path.join(conf.data_dir, "protein.data")
    (x_train, y_train, x_valid, y_valid,
     x_test, y_test) = dataset.load_uci_protein_data(data_path)
    # The validation split is folded into the training set.
    x_train = np.vstack([x_train, x_valid])
    y_train = np.hstack([y_train, y_valid])
    n_train, x_dim = x_train.shape

    # Standardize data
    x_train, x_test, _, _ = dataset.standardize(x_train, x_test)
    y_train, y_test, mean_y_train, std_y_train = dataset.standardize(
        y_train, y_test)

    # Define model parameters
    n_hiddens = [50]

    # Build the computation graph
    n_particles = 20
    x = tf.placeholder(tf.float32, shape=[None, x_dim])
    y = tf.placeholder(tf.float32, shape=[None])
    layer_sizes = [x_dim] + n_hiddens + [1]
    w_names = ["w" + str(i) for i in range(len(layer_sizes) - 1)]

    wv = []
    logstds = []
    for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        init_w = tf.random_uniform([n_particles, n_out, n_in + 1]) * 4 - 2
        wv.append(tf.Variable(init_w))
        logstds.append(tf.Variable(tf.zeros([n_out, n_in + 1])))

    model = build_bnn(x, layer_sizes, logstds, n_particles)

    def log_joint(bn):
        log_pws = bn.cond_log_prob(w_names)
        log_py_xw = bn.cond_log_prob('y')
        # Mini-batch mean of the log likelihood, scaled by n_train.
        return tf.add_n(log_pws) + tf.reduce_mean(log_py_xw, 1) * n_train

    model.log_joint = log_joint

    # Alternative samplers that were tried:
    #   zs.SGLD(learning_rate=4e-6)
    #   zs.SGNHT(learning_rate=1e-5, variance_extra=0., tune_rate=50.,
    #            second_order=True)
    sgmcmc = zs.SGHMC(learning_rate=2e-6, friction=0.2,
                      n_iter_resample_v=1000, second_order=True)

    latent = dict(zip(w_names, wv))
    observed = {'y': y}

    # E step: Sample the parameters
    sample_op, sgmcmc_info = sgmcmc.sample(model, observed=observed,
                                           latent=latent)
    mean_k = sgmcmc_info.mean_k

    # M step: Update the logstd hyperparameters
    esti_logstds = [0.5 * tf.log(tf.reduce_mean(w * w, axis=0)) for w in wv]
    output_logstds = dict(
        zip(w_names, [0.5 * tf.log(tf.reduce_mean(w * w)) for w in wv]))
    assign_ops = [
        logstds[i].assign(logstd) for (i, logstd) in enumerate(esti_logstds)
    ]
    assign_op = tf.group(assign_ops)

    # prediction: rmse & log likelihood
    bn = model.observe(**merge_dicts(latent, observed))
    y_mean = bn["y_mean"]
    y_pred = tf.reduce_mean(y_mean, 0)

    # Define training/evaluation parameters
    epochs = 500
    batch_size = 100
    iters = (n_train - 1) // batch_size + 1

    preds = []
    epochs_ave_pred = 1

    # Run the inference
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, epochs + 1):
            perm = np.random.permutation(x_train.shape[0])
            x_train = x_train[perm, :]
            y_train = y_train[perm]
            for t in range(iters):
                batch = slice(t * batch_size, (t + 1) * batch_size)
                _, mean_k_value = sess.run(
                    [sample_op, mean_k],
                    feed_dict={x: x_train[batch], y: y_train[batch]})
            sess.run(assign_op)

            test_pred = sess.run(y_pred, feed_dict={x: x_test})
            preds.append(test_pred)
            # Average the predictions of the last epochs_ave_pred epochs.
            pred = np.mean(preds[-epochs_ave_pred:], axis=0)
            test_rmse = np.sqrt(np.mean((pred - y_test)**2)) * std_y_train
            print('>> Epoch {} Test = {} logstds = {}'.format(
                epoch, test_rmse, sess.run(output_logstds)))