Example #1
def latent_gaussian_x_bernoulli(z0, zk, z0_mu, z0_log_var, logdet_J_list, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli, the output from the decoder
    should be sigmoid. The sizes of the inputs are:
    z0: (batch_size*eq_samples*iw_samples, num_latent)
    zk: (batch_size*eq_samples*iw_samples, num_latent)
    z0_mu: (batch_size, num_latent)
    z0_log_var: (batch_size, num_latent)
    logdet_J_list: list of `nflows` elements, each with shape (batch_size*eq_samples*iw_samples)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z0 = z0.reshape((-1, eq_samples, iw_samples, latent_size))
    zk = zk.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    for i in range(len(logdet_J_list)):
        logdet_J_list[i] = logdet_J_list[i].reshape((-1, eq_samples, iw_samples))

    # dimshuffle x, z0_mu and z0_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z0_mu = z0_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z0_log_var = z0_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_q0z0_given_x = log_normal2(z0, z0_mu, z0_log_var).sum(axis=3)
    log_pzk = log_stdnormal(zk).sum(axis=3)
    log_px_given_zk = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # normalizing flow term: sum of the log-determinants of the flow Jacobians
    sum_logdet_J = 0
    for logdet_J_k in logdet_J_list:
        sum_logdet_J += logdet_J_k

    # Calculate the LL using log-sum-exp to avoid underflow                                       all log_***                                       -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pzk + log_px_given_zk - log_q0z0_given_x + sum_logdet_J, axis=2)      # log-mean-exp over iw_samples dimension            -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)                                                                             # average over eq_samples, batch_size dimensions    -> shape: ()

    return LL, T.mean(log_q0z0_given_x), T.mean(sum_logdet_J), T.mean(log_pzk), T.mean(log_px_given_zk)
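
The logdet_J_list consumed above comes from the normalizing-flow layers of the encoder, which are not shown. Purely as an illustration, the sketch below shows how such a list could be produced by a chain of planar flows (Rezende & Mohamed 2015); the planar_flow_step helper and the flow_params it iterates over are hypothetical and not part of the example code.

import theano.tensor as T

def planar_flow_step(z, u, w, b):
    # z: (n, latent_size), u and w: (latent_size,), b: scalar
    a = T.dot(z, w) + b                                        # (n,)
    h = T.tanh(a)                                              # (n,)
    z_new = z + u.dimshuffle('x', 0) * h.dimshuffle(0, 'x')    # (n, latent_size)
    # psi(z) = h'(a) * w, used in the log-det of the flow Jacobian
    psi = (1 - h ** 2).dimshuffle(0, 'x') * w.dimshuffle('x', 0)
    logdet_J = T.log(T.abs_(1 + T.dot(psi, u)) + 1e-6)         # (n,)
    return z_new, logdet_J

# accumulating the per-step log-determinants into logdet_J_list
# (z0 and flow_params are assumed to exist; each element of flow_params is (u, w, b)):
# logdet_J_list = []
# zk = z0
# for u, w, b in flow_params:
#     zk, logdet_J = planar_flow_step(zk, u, w, b)
#     logdet_J_list.append(logdet_J)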
Example #2
def lower_bound(z, z_mu, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    from theano.gradient import disconnected_grad as dg
    # reshape the variables so batch_size, eq_samples and iw_samples are
    # separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # prepare x, z for broadcasting
    # size: (batch_size, eq_samples, iw_samples, num_features)
    x = x.dimshuffle(0, 'x', 'x', 1)

    # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)

    log_qz_given_x = log_bernoulli(z, z_mu, eps=epsilon).sum(axis=3)
    z_prior = T.ones_like(z)*np.float32(0.5)
    log_pz = log_bernoulli(z, z_prior).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, eps=epsilon).sum(axis=3)

    # Calculate the LL using log-sum-exp to avoid underflow
    log_pxz = log_pz + log_px_given_z

    # L is (bs, mc); see the definition of L in appendix eq. 14
    L = log_sum_exp(log_pxz - log_qz_given_x, axis=2) + \
        T.log(1.0/T.cast(iw_samples, 'float32'))

    grads_model = T.grad(-L.mean(), p_params)

    # L_corr should correspond to equation 10 in the paper
    L_corr = L.dimshuffle(0, 1, 'x') - get_vimco_baseline(
        log_pxz - log_qz_given_x)
    g_lb_inference = T.mean(T.sum(dg(L_corr) * log_qz_given_x) + L)
    grads_inference = T.grad(-g_lb_inference, q_params)

    grads = grads_model + grads_inference
    LL = log_mean_exp(log_pz + log_px_given_z - log_qz_given_x, axis=2)
    return (LL,
            T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z),
            grads)
Example #3
def lower_bound(z, z_mu, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    from theano.gradient import disconnected_grad as dg
    # reshape the variables so batch_size, eq_samples and iw_samples are
    # separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # prepare x, z for broadcasting
    # size: (batch_size, eq_samples, iw_samples, num_features)
    x = x.dimshuffle(0, 'x', 'x', 1)

    # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)

    log_qz_given_x = log_bernoulli(z, z_mu, eps=epsilon).sum(axis=3)
    z_prior = T.ones_like(z) * np.float32(0.5)
    log_pz = log_bernoulli(z, z_prior).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, eps=epsilon).sum(axis=3)

    # Calculate the LL using log-sum-exp to avoid underflow
    log_pxz = log_pz + log_px_given_z

    # L is (bs, mc); see the definition of L in appendix eq. 14
    L = log_sum_exp(log_pxz - log_qz_given_x, axis=2) + \
        T.log(1.0/T.cast(iw_samples, 'float32'))

    grads_model = T.grad(-L.mean(), p_params)

    # L_corr should correspond to equation 10 in the paper
    L_corr = L.dimshuffle(0, 1,
                          'x') - get_vimco_baseline(log_pxz - log_qz_given_x)
    g_lb_inference = T.mean(T.sum(dg(L_corr) * log_qz_given_x) + L)
    grads_inference = T.grad(-g_lb_inference, q_params)

    grads = grads_model + grads_inference
    LL = log_mean_exp(log_pz + log_px_given_z - log_qz_given_x, axis=2)
    return (LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z),
            grads)
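
Both variants above call get_vimco_baseline, whose implementation is not shown. As a rough sketch only, a standard VIMCO leave-one-out baseline (Mnih & Rezende 2016) could look like the following; it is written here with an explicit iw_samples argument for clarity (the original presumably closes over it), and it assumes the log importance weights arrive with shape (batch_size, eq_samples, iw_samples) and that iw_samples is a plain Python int. The actual helper used by this code may differ.

import numpy as np
import theano.tensor as T

def get_vimco_baseline(log_w, iw_samples):
    # log_w: (batch_size, eq_samples, iw_samples) log importance weights
    K = iw_samples
    # leave-one-out arithmetic mean of the log-weights (geometric mean of the weights)
    loo_mean = (T.sum(log_w, axis=2, keepdims=True) - log_w) / np.float32(K - 1)
    eye = T.eye(K)                                             # (K, K)
    # for each held-out sample j, replace log_w[..., j] with its leave-one-out mean
    repl = log_w.dimshuffle(0, 1, 'x', 2) * (1 - eye) \
        + loo_mean.dimshuffle(0, 1, 2, 'x') * eye              # (batch, eq, K, K)
    # numerically stable log-mean-exp over the last axis gives the baseline for each j
    r_max = T.max(repl, axis=3, keepdims=True)
    baseline = T.log(T.mean(T.exp(repl - r_max), axis=3)) + T.max(repl, axis=3)
    return baseline                                            # (batch, eq, K)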
Example #4
def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, eq_samples, iw_samples, epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli, the output from the decoder
    should be sigmoid. The sizes of the inputs are:
    z: (batch_size*eq_samples*iw_samples, num_latent)
    z_mu: (batch_size, num_latent)
    z_log_var: (batch_size, num_latent)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z = z.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)              # size: (batch_size, eq_samples, iw_samples, num_latent)
    z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)    # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
    log_pz = log_stdnormal(z).sum(axis=3)
    log_px_given_z = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # Calculate the LL using log-sum-exp to avoid underflow                   all log_***                                       -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pz + log_px_given_z - log_qz_given_x, axis=2)     # log-mean-exp over iw_samples dimension            -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)                                                         # average over eq_samples, batch_size dimensions    -> shape: ()

    return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
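
All of these examples lean on the elementwise log-density helpers log_normal2, log_stdnormal and log_bernoulli, which are imported from elsewhere. The sketch below gives their conventional definitions; the versions actually used here may differ in details such as epsilon handling.

import math
import theano.tensor as T

c = -0.5 * math.log(2 * math.pi)

def log_stdnormal(x):
    # elementwise log N(x; 0, 1)
    return c - x ** 2 / 2

def log_normal2(x, mean, log_var):
    # elementwise log N(x; mean, exp(log_var)), parameterised by the log-variance
    return c - log_var / 2 - (x - mean) ** 2 / (2 * T.exp(log_var))

def log_bernoulli(x, p, eps=0.0):
    # elementwise log Bernoulli(x; p); p is clipped away from 0 and 1 for stability
    p = T.clip(p, eps, 1.0 - eps)
    return x * T.log(p) + (1 - x) * T.log(1 - p)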
Example #5
def latent_gaussian_x_bernoulli(z0,
                                zk,
                                z0_mu,
                                z0_log_var,
                                logdet_J_list,
                                x_mu,
                                x,
                                eq_samples,
                                iw_samples,
                                epsilon=1e-6):
    """
    Latent z       : gaussian with standard normal prior
    decoder output : bernoulli

    When the output is bernoulli, the output from the decoder
    should be sigmoid. The sizes of the inputs are:
    z0: (batch_size*eq_samples*iw_samples, num_latent)
    zk: (batch_size*eq_samples*iw_samples, num_latent)
    z0_mu: (batch_size, num_latent)
    z0_log_var: (batch_size, num_latent)
    logdet_J_list: list of `nflows` elements, each with shape (batch_size*eq_samples*iw_samples)
    x_mu: (batch_size*eq_samples*iw_samples, num_features)
    x: (batch_size, num_features)

    Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
    """

    # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
    z0 = z0.reshape((-1, eq_samples, iw_samples, latent_size))
    zk = zk.reshape((-1, eq_samples, iw_samples, latent_size))
    x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))

    for i in range(len(logdet_J_list)):
        logdet_J_list[i] = logdet_J_list[i].reshape(
            (-1, eq_samples, iw_samples))

    # dimshuffle x, z0_mu and z0_log_var since we need to broadcast them when calculating the pdfs
    x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
    z0_mu = z0_mu.dimshuffle(0, 'x', 'x', 1)            # size: (batch_size, eq_samples, iw_samples, num_latent)
    z0_log_var = z0_log_var.dimshuffle(0, 'x', 'x', 1)  # size: (batch_size, eq_samples, iw_samples, num_latent)

    # calculate LL components, note that the log_xyz() functions return log prob. for independent components separately
    # so we sum over feature/latent dimensions for multivariate pdfs
    log_q0z0_given_x = log_normal2(z0, z0_mu, z0_log_var).sum(axis=3)
    log_pzk = log_stdnormal(zk).sum(axis=3)
    log_px_given_zk = log_bernoulli(x, x_mu, epsilon).sum(axis=3)

    # normalizing flow term: sum of the log-determinants of the flow Jacobians
    sum_logdet_J = 0
    for logdet_J_k in logdet_J_list:
        sum_logdet_J += logdet_J_k

    # Calculate the LL using log-sum-exp to avoid underflow                                       all log_***                                       -> shape: (batch_size, eq_samples, iw_samples)
    LL = log_mean_exp(log_pzk + log_px_given_zk - log_q0z0_given_x + sum_logdet_J, axis=2)      # log-mean-exp over iw_samples dimension            -> shape: (batch_size, eq_samples)
    LL = T.mean(LL)                                                                             # average over eq_samples, batch_size dimensions    -> shape: ()

    return LL, T.mean(log_q0z0_given_x), T.mean(sum_logdet_J), T.mean(log_pzk), T.mean(log_px_given_zk)
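
Finally, the examples reduce over the iw_samples axis with log_mean_exp (and, in the VIMCO examples, log_sum_exp), which are assumed to be numerically stable reductions along a single axis. A minimal sketch of both, assuming only Theano:

import theano.tensor as T

def log_sum_exp(a, axis):
    # numerically stable log(sum(exp(a))) along `axis`:
    # shift by the per-slice maximum before exponentiating
    a_max = T.max(a, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(a - a_max), axis=axis)) + T.max(a, axis=axis)

def log_mean_exp(a, axis):
    # numerically stable log(mean(exp(a))) along `axis`
    a_max = T.max(a, axis=axis, keepdims=True)
    return T.log(T.mean(T.exp(a - a_max), axis=axis)) + T.max(a, axis=axis)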