def latent_gaussian_x_bernoulli(z0, zk, z0_mu, z0_log_var, logdet_J_list,
                                x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z0: (batch_size*eq_samples*iw_samples, num_latent)
    zk: (batch_size*eq_samples*iw_samples, num_latent)
    z0_mu: (batch_size, num_latent)
    z0_log_var: (batch_size, num_latent)
    logdet_J_list: list of `nflows` elements, each with shape
        (batch_size*eq_samples*iw_samples)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z0 = z0.reshape((-1, eq_samples, iw_samples, latent_size))
    zk = zk.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))
    for i in range(len(logdet_J_list)):
        logdet_J_list[i] = logdet_J_list[i].reshape((-1, eq_samples, iw_samples))

    # dimshuffle x, z0_mu and z0_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z0_mu = z0_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z0_log_var = z0_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components; the log_xyz() functions return log prob. for each independent
    # component separately, so we sum over the feature/latent dimension for the multivariate pdfs
    log_q0z0_given_x = log_normal2(z0, z0_mu, z0_log_var).sum(axis=3)
    log_pzk = log_stdnormal(zk).sum(axis=3)
    log_px_given_zk = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # normalizing flow term: sum of log|det J_k| over all flow steps
    sum_logdet_J = 0
    for logdet_J_k in logdet_J_list:
        sum_logdet_J += logdet_J_k

    # Calculate the LL using log-mean-exp to avoid underflow.
    # All log_*** terms have shape (batch_size, eq_samples, iw_samples).
    LL = log_mean_exp(log_pzk + log_px_given_zk - log_q0z0_given_x + sum_logdet_J,
                      axis=2)  # log-mean-exp over iw_samples dimension -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)            # average over eq_samples, batch_size dimensions -> shape: ()

    return LL, T.mean(log_q0z0_given_x), T.mean(sum_logdet_J), \
        T.mean(log_pzk), T.mean(log_px_given_zk)
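# The bounds in this file assume a handful of helpers that are not defined in
# these snippets (presumably provided elsewhere in the repository, e.g. by the
# parmesan library). Below is a minimal sketch of the behaviour the bounds
# rely on: each log_* helper returns elementwise log-probabilities (one value
# per independent component), and log_mean_exp / log_sum_exp are numerically
# stable reductions along one axis. The clipping and stability details of the
# real implementations may differ; treat these as illustrative assumptions,
# not the library code.
import numpy as np
import theano.tensor as T

_c = -0.5 * np.log(2 * np.pi)

def log_normal2(x, mean, log_var):
    # elementwise log N(x; mean, exp(log_var)), parameterized by the log-variance
    return _c - log_var / 2.0 - (x - mean) ** 2 / (2.0 * T.exp(log_var))

def log_stdnormal(x):
    # elementwise log N(x; 0, 1)
    return _c - x ** 2 / 2.0

def log_bernoulli(x, p, eps=0.0):
    # elementwise log Bernoulli(x; p), with p clipped away from 0 and 1
    p = T.clip(p, eps, 1.0 - eps)
    return x * T.log(p) + (1.0 - x) * T.log(1.0 - p)

def log_mean_exp(x, axis):
    # numerically stable log(mean(exp(x))) along `axis` (the axis is removed)
    x_max = T.max(x, axis=axis, keepdims=True)
    return T.log(T.mean(T.exp(x - x_max), axis=axis)) + T.max(x, axis=axis)

def log_sum_exp(x, axis):
    # numerically stable log(sum(exp(x))) along `axis` (the axis is removed)
    x_max = T.max(x, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(x - x_max), axis=axis)) + T.max(x, axis=axis)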
def lower_bound(z, z_mu, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : bernoulli with a fixed Bernoulli(0.5) prior
    decoder output : bernoulli

    Returns the importance-weighted bound together with VIMCO-style gradients
    for the generative parameters (p_params) and the inference-network
    parameters (q_params).

    The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Mnih & Rezende 2016 "Variational Inference for Monte Carlo Objectives"
    """
    from theano.gradient import disconnected_grad as dg

    # reshape the variables so batch_size, eq_samples and iw_samples are
    # separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # prepare x and z_mu for broadcasting
    # size: (batch_size, eq_samples, iw_samples, num_features)
    x = x.dimshuffle(0, 'x', 'x', 1)
    # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)

    log_qz_given_x = log_bernoulli(z, z_mu, eps=epsilon).sum(axis=3)
    z_prior = T.ones_like(z) * np.float32(0.5)
    log_pz = log_bernoulli(z, z_prior).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, eps=epsilon).sum(axis=3)

    # Calculate the LL using log-sum-exp to avoid underflow
    log_pxz = log_pz + log_px_given_z

    # L has shape (batch_size, eq_samples); see the definition of L in appendix eq. 14
    L = log_sum_exp(log_pxz - log_qz_given_x, axis=2) + \
        T.log(1.0 / T.cast(iw_samples, 'float32'))
    grads_model = T.grad(-L.mean(), p_params)

    # L_corr should correspond to equation 10 in the paper
    L_corr = L.dimshuffle(0, 1, 'x') - get_vimco_baseline(log_pxz - log_qz_given_x)
    g_lb_inference = T.mean(T.sum(dg(L_corr) * log_qz_given_x) + L)
    grads_inference = T.grad(-g_lb_inference, q_params)

    grads = grads_model + grads_inference

    LL = log_mean_exp(log_pz + log_px_given_z - log_qz_given_x, axis=2)
    return (LL, T.mean(log_qz_given_x), T.mean(log_pz),
            T.mean(log_px_given_z), grads)
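# `get_vimco_baseline` is not defined in this snippet. The sketch below is a
# guess at what it computes, based on the per-sample control variate in the
# VIMCO paper (Mnih & Rezende 2016, eq. 10): for every importance sample j,
# replace its log-weight with the arithmetic mean of the other samples'
# log-weights (i.e. the log of their geometric mean weight) and take
# log-mean-exp over samples. The repository's actual implementation may
# differ in details; `log_mean_exp` is the helper sketched earlier.
import theano.tensor as T

def get_vimco_baseline(log_w):
    # log_w: (batch_size, eq_samples, iw_samples) log importance weights
    K = log_w.shape[2]
    # for each sample j, the arithmetic mean of the other K-1 log-weights
    log_w_others = (T.sum(log_w, axis=2, keepdims=True) - log_w) / (K - 1)
    # build a (batch, eq, K, K) tensor whose row j equals log_w with the
    # j-th entry replaced by log_w_others[..., j]
    eye = T.eye(K)
    log_w_minus_j = (log_w.dimshuffle(0, 1, 'x', 2) * (1.0 - eye) +
                     log_w_others.dimshuffle(0, 1, 2, 'x') * eye)
    # baseline for sample j: log-mean-exp over the modified set of log-weights
    return log_mean_exp(log_w_minus_j, axis=3)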
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli then the output from the decoder
    should be sigmoid.

    The sizes of the inputs are
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """
    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                  # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components; the log_xyz() functions return log prob. for each independent
    # component separately, so we sum over the feature/latent dimension for the multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # Calculate the LL using log-mean-exp to avoid underflow.
    # All log_*** terms have shape (batch_size, eq_samples, iw_samples).
    LL = log_mean_exp(log_pz + log_px_given_z - log_qz_given_x,
                      axis=2)  # log-mean-exp over iw_samples dimension -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)            # average over eq_samples, batch_size dimensions -> shape: ()

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
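# A minimal end-to-end sketch of how this bound can be wired up and evaluated.
# Everything below is illustrative: the one-layer encoder/decoder, the weight
# initialisation, and the tiny sizes are assumptions made for the example, not
# part of the original model; only the call signature of
# latent_gaussian_x_bernoulli comes from the code above. It also assumes the
# log-density helpers sketched earlier (or their library equivalents) and the
# module-level `latent_size`/`num_features` that the bound reads are in scope.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

floatX = theano.config.floatX
latent_size, num_features = 2, 8   # tiny illustrative sizes
eq_samples, iw_samples = 1, 5

srng = RandomStreams(seed=1234)
sym_x = T.matrix('x')

# one-layer "encoder" producing the mean and log-variance of q(z|x)
W_enc = theano.shared((0.01 * np.random.randn(num_features, 2 * latent_size)).astype(floatX), name='W_enc')
h = T.dot(sym_x, W_enc)
z_mu, z_log_var = h[:, :latent_size], h[:, latent_size:]

# draw eq_samples*iw_samples posterior samples per data point (reparameterization trick)
z_mu_rep = T.repeat(z_mu, eq_samples * iw_samples, axis=0)
z_log_var_rep = T.repeat(z_log_var, eq_samples * iw_samples, axis=0)
z = z_mu_rep + T.exp(0.5 * z_log_var_rep) * srng.normal(z_mu_rep.shape)

# one-layer "decoder" producing Bernoulli means
W_dec = theano.shared((0.01 * np.random.randn(latent_size, num_features)).astype(floatX), name='W_dec')
x_mu = T.nnet.sigmoid(T.dot(z, W_dec))

LL, log_qz_given_x, log_pz, log_px_given_z = latent_gaussian_x_bernoulli(
    z, z_mu, z_log_var, x_mu, sym_x, eq_samples, iw_samples)

# maximizing the bound = minimizing its negation; compile a function that
# returns the bound and the gradients w.r.t. the toy parameters
params = [W_enc, W_dec]
grads = T.grad(-LL, params)
eval_bound = theano.function([sym_x], [LL] + grads)

x_batch = (np.random.rand(4, num_features) > 0.5).astype(floatX)
print(eval_bound(x_batch)[0])  # scalar importance-weighted bound estimate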