Example #1
    def log_p_y_z(self):

        if self.continuous:
            h_decoder = softplus(dot(self.W_zh, self.z.T) + self.b_zh)
            if self.numHiddenLayers_decoder == 2:
                h_decoder = softplus(dot(self.W_hh, h_decoder) + self.b_hh)
            mu_decoder = dot(self.W_hy1, h_decoder) + self.b_hy1
            log_sigma_decoder = 0.5 * (dot(self.W_hy2, h_decoder) + self.b_hy2)
            log_pyz    = T.sum( -(0.5 * np.log(2 * np.pi) + log_sigma_decoder) \
                                - 0.5 * ((self.y_miniBatch.T - mu_decoder) / T.exp(log_sigma_decoder))**2 )

            log_sigma_decoder.name = 'log_sigma_decoder'
            mu_decoder.name = 'mu_decoder'
            h_decoder.name = 'h_decoder'
            log_pyz.name = 'log_p_y_z'
        else:
            h_decoder = tanh(dot(self.W_zh, self.z) + self.b_zh)
            if self.numHiddenLayers_decoder == 2:
                h_decoder = softplus(dot(self.W_hh, h_decoder) + self.b_hh)
            y_hat = sigmoid(dot(self.W_hy1, h_decoder) + self.b_hy1)
            log_pyz = -T.nnet.binary_crossentropy(y_hat,
                                                  self.y_miniBatch).sum()
            h_decoder.name = 'h_decoder'
            y_hat.name = 'y_hat'
            log_pyz.name = 'log_p_y_z'

        return log_pyz
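
For reference, the continuous branch above is just a sum of independent Gaussian log-densities with mean mu_decoder and log standard deviation log_sigma_decoder. A minimal NumPy sketch of that term (a standalone illustration, not part of the original class):

import numpy as np

def gaussian_log_density(y, mu, log_sigma):
    # sum of elementwise log N(y | mu, exp(log_sigma)**2)
    return np.sum(-(0.5 * np.log(2 * np.pi) + log_sigma)
                  - 0.5 * ((y - mu) / np.exp(log_sigma)) ** 2)
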
 def forward_propagation(self):
     # layer 1 -> layer 2
     self.z1 = self.linear_line_with_self(self.x, self.w1, self.b1)
     self.z2 = self.linear_line_with_self(self.x, self.w2, self.b2)
     self.a1 = softplus(self.z1)
     self.a2 = softplus(self.z2)
     # layer 2 -> layer 3
     self.z3 = self.linear_line_with_self(self.a1, self.w3, 0)
     self.z4 = self.linear_line_with_self(self.a2, self.w4, 0)
     self.h = self.z3 + self.z4 + self.b3
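
All of these examples rely on a softplus activation. Where a snippet does not show its definition, a numerically stable NumPy version is assumed to look roughly like this (a sketch, not any repository's own implementation):

import numpy as np

def softplus(x):
    # log(1 + exp(x)) computed stably; its derivative is the logistic sigmoid
    return np.logaddexp(0.0, x)
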
Example #5
    def create_new_data_function(self):
        # self.z_test = sharedZeroMatrix(self.Q,1,'z_test')
        h_decoder = softplus(dot(self.W_zh, self.z_test.T) + self.b_zh)
        if self.numHiddenLayers_decoder == 2:
            h_decoder = softplus(dot(self.W_hh, h_decoder) + self.b_hh)
        mu_decoder = dot(self.W_hy1, h_decoder) + self.b_hy1
        self.new_data_function = th.function([],
                                             mu_decoder,
                                             no_default_updates=True)

        return mu_decoder
def forward_propagation(Teta):
    # layer 1 -> layer 2
    z1 = linear_line(x, Teta[3], Teta[0])
    z2 = linear_line(x, Teta[4], Teta[1])
    a1 = softplus(z1)
    a2 = softplus(z2)
    # layer 2 -> layer 3
    z3 = linear_line(a1, Teta[5], 0)
    z4 = linear_line(a2, Teta[6], 0)
    h = z3 + z4 + Teta[2]
    return h
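
The helper linear_line is not included in the snippet; a hypothetical definition consistent with how it is called above (an affine map applied to the input) might be:

def linear_line(x, w, b):
    # hypothetical helper: one-dimensional affine transform w * x + b
    return w * x + b
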
Example #7
 def moment_match(self,
                  y,
                  cav_mean,
                  cav_cov,
                  hyp=None,
                  power=1.0,
                  cubature_func=None):
     """
     Closed form Gaussian moment matching.
     Calculates the log partition function of the EP tilted distribution:
          logZₙ = log ∫ 𝓝ᵃ(yₙ|fₙ,σ²) 𝓝(fₙ|mₙ,vₙ) dfₙ = log E[𝓝ᵃ(yₙ|fₙ,σ²)]
     and its derivatives w.r.t. mₙ, which are required for moment matching.
     :param y: observed data (yₙ) [scalar]
     :param cav_mean: cavity mean (mₙ) [scalar]
     :param cav_cov: cavity variance (vₙ) [scalar]
     :param hyp: observation noise variance (σ²) [scalar]
     :param power: EP power / fraction (a) - this is never required for the Gaussian likelihood [scalar]
     :param cubature_func: not used
     :return:
         lZ: the log partition function, logZₙ [scalar]
         dlZ: first derivative of logZₙ w.r.t. mₙ (if derivatives=True) [scalar]
         d2lZ: second derivative of logZₙ w.r.t. mₙ (if derivatives=True) [scalar]
     """
     hyp = softplus(self.hyp) if hyp is None else hyp
     return gaussian_moment_match(y, cav_mean, cav_cov, hyp)
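
gaussian_moment_match itself is not shown. Under the stated assumptions (Gaussian likelihood, Gaussian cavity), the closed form it computes is standard and can be sketched as:

import numpy as np

def gaussian_moment_match_sketch(y, cav_mean, cav_cov, noise_var):
    # log ∫ N(y|f,σ²) N(f|m,v) df = log N(y | m, v + σ²), plus derivatives w.r.t. m
    s = cav_cov + noise_var
    lZ = -0.5 * np.log(2 * np.pi * s) - 0.5 * (y - cav_mean) ** 2 / s
    dlZ = (y - cav_mean) / s       # ∂logZ/∂m
    d2lZ = -1.0 / s                # ∂²logZ/∂m²
    return lZ, dlZ, d2lZ
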
Example #9
    def reconstruct_test_datum(self):
        self.y_test = self.y[np.random.choice(self.N, 1)]

        h_qX = softplus(plus(dot(self.W1_qX, self.y_test.T), self.b1_qX))
        mu_qX = plus(dot(self.W2_qX, h_qX), self.b2_qX)
        log_sigma_qX = mul(0.5, plus(dot(self.W3_qX, h_qX), self.b3_qX))

        self.phi_test = mu_qX.T  # [BxR]
        (self.Phi_test,self.cPhi_test,self.iPhi_test,self.logDetPhi_test) \
            = diagCholInvLogDet_fromLogDiag(log_sigma_qX)

        self.Xz_test = plus(self.phi_test, dot(self.cPhi_test, self.xi[0, :]))

        self.Kzz_test = kfactory.kernel(self.Xz_test, None, self.log_theta)
        self.Kzu_test = kfactory.kernel(self.Xz_test, self.Xu, self.log_theta)

        self.A_test = dot(self.Kzu_test, self.iKuu)
        self.C_test = minus(self.Kzz_test, dot(self.A_test, self.Kzu_test.T))
        self.cC_test, self.iC_test, self.logDetC_test = cholInvLogDet(
            self.C_test, self.B, self.jitter)

        self.u_test = plus(self.kappa, (dot(self.cKappa, self.alpha)))

        self.mu_test = dot(self.A_test, self.u_test)

        self.z_test = plus(self.mu_test, (dot(self.cC_test, self.beta[0, :])))
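
The quantities A_test, C_test and mu_test above implement the usual sparse-GP conditional of the latent function at Xz_test given the inducing values u. A compact NumPy sketch of that conditional (illustrative only, not the repository's helper):

import numpy as np

def sparse_gp_conditional(Kzz, Kzu, Kuu, u, jitter=1e-4):
    # mean = Kzu Kuu⁻¹ u,  cov = Kzz − Kzu Kuu⁻¹ Kuz
    M = Kuu.shape[0]
    A = Kzu @ np.linalg.inv(Kuu + jitter * np.eye(M))
    return A @ u, Kzz - A @ Kzu.T
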
Example #10
 def negative_log_predictive_density(self, t=None, y=None, r=None):
     """
     Compute the (normalised) negative log predictive density (NLPD) of the test data yₙ*:
         NLPD = - ∑ₙ log ∫ p(yₙ*|fₙ*) 𝓝(fₙ*|mₙ*,vₙ*) dfₙ*
     where fₙ* is the function value at the test location.
     The above can be computed using the EP moment matching method, which we vectorise using vmap.
     :param t: test time steps [M, 1]
     :param y: test observations [M, 1]
     :param r: test spatial locations [M, R]
     :return:
         NLPD: the negative log predictive density for the test data
     """
     if t is None:
         t, y, r = self.t, self.y, self.r
     (t, y, r, r_test, dt, train_id, test_id,
      mask) = test_input_admin(self.t, self.y, self.r, t, y, r)
     return_full = r_test.shape[1] != r.shape[
         1]  # are spatial test locations different size to training locations?
     # run the filter and smooth across both train and test points
     posterior_mean, posterior_cov, _ = self.predict_everywhere(
         y, r, dt, train_id, mask, sampling=False, return_full=return_full)
     test_mean, test_cov = posterior_mean[test_id], posterior_cov[test_id]
     hyp_prior, hyp_lik = softplus_list(self.prior.hyp), softplus(
         self.likelihood.hyp)
     if return_full:
         measure_func = vmap(self.compute_measurement, (0, 0, 0, None))
         test_mean, test_cov = measure_func(r_test, test_mean, test_cov,
                                            hyp_prior)
     # vectorise the EP moment matching method
     lpd_func = vmap(self.likelihood.moment_match,
                     (0, 0, 0, None, None, None))
     log_predictive_density, _, _ = lpd_func(y[test_id], test_mean,
                                             test_cov, hyp_lik, 1, None)
     return -np.mean(log_predictive_density)  # mean = normalised sum
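
For a Gaussian likelihood the moment-matching call above reduces to evaluating the Gaussian predictive density at the test points, so the NLPD can be sketched directly in NumPy (assuming scalar outputs; a simplification of the vmapped version above):

import numpy as np

def gaussian_nlpd(y_test, pred_mean, pred_var, noise_var):
    # NLPD = -(1/M) Σₙ log N(yₙ* | mₙ*, vₙ* + σ²)
    s = pred_var + noise_var
    log_pred = -0.5 * np.log(2 * np.pi * s) - 0.5 * (y_test - pred_mean) ** 2 / s
    return -np.mean(log_pred)
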
Example #11
 def _free_energy_with_z(self, z):
     """Return binary rbm style free energy in shape: [batch_size]"""
     zbias_term = tf.matmul(z, self.zbias, transpose_b=True)
      zbias_term = tf.reshape(zbias_term, [-1])  # flatten
     h_total_input = tf.matmul(z, self.weights) + self.hbias
     softplus_term = utils.softplus(h_total_input)
     sum_softplus = tf.reduce_sum(softplus_term, 1)
     return -zbias_term - sum_softplus
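
The same free energy can be written in plain NumPy; a minimal sketch, assuming z has shape [batch_size, num_z], weights has shape [num_z, num_hid], and the biases are 1-D arrays:

import numpy as np

def binary_rbm_free_energy(z, weights, zbias, hbias):
    softplus = lambda x: np.logaddexp(0.0, x)        # stable log(1 + exp(x))
    # F(z) = -z·zbias - Σⱼ softplus((z W + hbias)ⱼ), one value per row of z
    return -(z @ zbias) - softplus(z @ weights + hbias).sum(axis=1)
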
Example #12
 def save_tensorboard_embeddings(self, u, v, embedding_dim, name_u, name_v,
                                 writer, global_step, matrix_bin):
     u = softplus(u.weight.detach().cpu().numpy().reshape(
         (self.vert, self.horz, embedding_dim)))
     u = np.expand_dims(np.stack([u, u, u], axis=0), axis=0)
     writer.add_images(name_u, refactor(u), global_step=global_step)
     v = softplus(v.weight.detach().cpu().numpy())
     fig = plt.figure()
     plt.plot(v)
     writer.add_figure(name_v, fig, global_step=global_step)
     dot_product = np.dot(u, v.T)[0, 0, ...]
     myocardium_dot_prod = self.get_video(dot_product,
                                          matrix_bin,
                                          cmap='rainbow')
     writer.add_video(name_u + '_' + name_v + '_dotprod',
                      myocardium_dot_prod,
                      global_step=global_step)
Example #13
 def __init__(self, variance=0.1):
     """
      :param variance: observation noise variance
     """
     super().__init__(hyp=variance)
     self.name = 'Audio Amplitude Demodulation'
     self.link_fn = lambda f: softplus(f)
     self.dlink_fn = lambda f: sigmoid(f)  # derivative of the link function
Example #14
 def conditional_moments(self, f, hyp=None):
     """
     The first two conditional moments of a Gaussian are the mean and variance:
         E[y|f] = f
         Var[y|f] = σ²
     """
     hyp = softplus(self.hyp) if hyp is None else hyp
     return f, hyp.reshape(-1, 1)
Example #15
    def produce(self, controller):
        """
            writeHead.recurrence(controller, previous_weight) -> key           (batchsize x N),
                                                                 add           (batchsize x N),
                                                                 erase         (batchsize x N),
                                                                 shift         (batchsize x 3),
                                                                 sharpen       (batchsize x 1),
                                                                 strengthen    (batchsize x 1),
                                                                 interpolation (batchsize x 1)
            
            produces controller parameters to manipulate/write memory

            @param controller: a batchsize x controller_size matrix, representing the output of the controller
        """

        # key, add, erase -> batchsize x N
        key = T.dot(controller, self.weights["controller->key"])
        add = T.tanh(T.dot(controller, self.weights["controller->add"]))
        erase = T.nnet.sigmoid(T.dot(controller, self.weights["controller->erase"]))        # SIGMOID

        # shift -> batchsize x 3 
        shift = T.nnet.softmax(T.dot(controller, self.weights["controller->shift"]))        # SOFTMAX

        backward_shift = shift[:, 0]
        stay_forward_shift = shift[:, 1:3]      # represents the shift values for STAY and FORWARD

        zeros_size = self.memory_slots - 3

        # We concatenate along the second axis, moving the first element (which represents the backward shift) to the end
        # ex:
        # There are 7 memory slots
        # Unused zeros are wrapped in [] for clarity.
        # 0.2 0.9 0.1 [0.0 0.0 0.0 0.0] -> 0.9 0.1 [0.0 0.0 0.0 0.0] 0.2
        true_shift = T.concatenate([stay_forward_shift, T.zeros([self.batch_size, zeros_size]), backward_shift.reshape([self.batch_size, 1])], axis = 1)      # WRAP

        # sharpen, strengthen, interpolation -> batchsize x 1
        # sharpen and strengthen must both be positive, so we apply the softplus function (Graves et al., 2016)
        sharpen = softplus(T.dot(controller, self.weights["controller->sharpen"]))      # SOFTPLUS                         
        strengthen = softplus(T.dot(controller, self.weights["controller->strengthen"]))        # SOFTPLUS 

        interpolation = T.nnet.sigmoid(T.dot(controller, self.weights["controller->interpolation"]))        # SIGMOID

        return key, add, erase, true_shift, T.addbroadcast(sharpen, 1), T.addbroadcast(strengthen, 1), T.addbroadcast(interpolation, 1)
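
The concatenation that builds true_shift can be checked in isolation; a small NumPy example reproducing the 7-slot case from the comment above:

import numpy as np

shift = np.array([[0.2, 0.9, 0.1]])          # [backward, stay, forward]
zeros = np.zeros((1, 7 - 3))                 # memory_slots - 3 unused positions
true_shift = np.concatenate(
    [shift[:, 1:3], zeros, shift[:, 0].reshape(1, 1)], axis=1)
print(true_shift)                            # [[0.9 0.1 0.  0.  0.  0.  0.2]]
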
Example #16
    def free_energy(self, vis_samples):
        """Compute the free energy defined on visibles.

        return: free energy of shape: [batch_size, 1]
        """
        vbias_term = tf.matmul(vis_samples, self.vbias, transpose_b=True)
        vbias_term = tf.reshape(vbias_term, [-1])  # flatten
        h_total_input = tf.matmul(vis_samples, self.weights) + self.hbias
        softplus_term = utils.softplus(h_total_input)
        sum_softplus = tf.reduce_sum(softplus_term, 1)
        return -vbias_term - sum_softplus
Example #17
 def evaluate_log_likelihood(self, y, f, hyp=None):
     """
     Evaluate the log-Gaussian function log𝓝(yₙ|fₙ,σ²).
     Can be used to evaluate Q cubature points.
     :param y: observed data yₙ [scalar]
     :param f: mean, i.e. the latent function value fₙ [Q, 1]
     :param hyp: likelihood variance σ² [scalar]
     :return:
         log𝓝(yₙ|fₙ,σ²), where σ² is the observation noise [Q, 1]
     """
     hyp = softplus(self.hyp) if hyp is None else hyp
     return -0.5 * np.log(2 * pi * hyp) - 0.5 * (y - f)**2 / hyp
Example #18
 def __init__(self, link='exp'):
     """
     :param link: link function, either 'exp' or 'logistic'
     """
     super().__init__(hyp=None)
     if link == 'exp':
         self.link_fn = lambda mu: np.exp(mu)
         self.dlink_fn = lambda mu: np.exp(mu)
     elif link == 'logistic':
         self.link_fn = lambda mu: softplus(mu)
         self.dlink_fn = lambda mu: sigmoid(mu)
     else:
         raise NotImplementedError('link function not implemented')
     self.name = 'Poisson'
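
Such a link function is typically consumed by the Poisson log-likelihood log p(y|f) = y·log λ − λ − log(y!), with intensity λ = link_fn(f). A small sketch of that evaluation (illustrative, not the class's own method):

import numpy as np
from scipy.special import gammaln

def poisson_log_likelihood(y, f, link_fn):
    lam = link_fn(f)                          # intensity λ > 0
    return y * np.log(lam) - lam - gammaln(y + 1.0)
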
Example #19
 def __init__(self, link='softplus'):
     """
     :param link: link function, either 'exp' or 'softplus' (note that the link is modified with an offset)
     """
     super().__init__(hyp=None)
     if link == 'exp':
         self.link_fn = lambda mu: np.exp(mu - 0.5)
         self.dlink_fn = lambda mu: np.exp(mu - 0.5)
     elif link == 'softplus':
         self.link_fn = lambda mu: softplus(mu - 0.5) + 1e-10
         self.dlink_fn = lambda mu: sigmoid(mu - 0.5)
     else:
         raise NotImplementedError('link function not implemented')
     self.name = 'Heteroscedastic Noise'
Example #20
def gradient_step(i, state, mod):
    params = get_params(state)
    mod.prior.hyp = params[0]
    mod.likelihood.hyp = params[1]

    # grad(Filter) + Smoother:
    # neg_log_marg_lik, gradients = mod.run()
    neg_log_marg_lik, gradients = mod.run_two_stage()

    prior_params = softplus_list(params[0])
    # print('iter %2d: var1=%1.2f len1=%1.2f om1=%1.2f var2=%1.2f len2=%1.2f om2=%1.2f var3=%1.2f len3=%1.2f om3=%1.2f '
    #       'var4=%1.2f len4=%1.2f var5=%1.2f len5=%1.2f var6=%1.2f len6=%1.2f '
    #       'vary=%1.2f, nlml=%2.2f' %
    #       (i, prior_params[0][0], prior_params[0][1], prior_params[0][2],
    #        prior_params[1][0], prior_params[1][1], prior_params[1][2],
    #        prior_params[2][0], prior_params[2][1], prior_params[2][2],
    #        prior_params[3][0], prior_params[3][1],
    #        prior_params[4][0], prior_params[4][1],
    #        prior_params[5][0], prior_params[5][1],
    #        softplus(params[1]), neg_log_marg_lik))
    # print('iter %2d: len1=%1.2f om1=%1.2f len2=%1.2f om2=%1.2f len3=%1.2f om3=%1.2f '
    #       'var4=%1.2f len4=%1.2f var5=%1.2f len5=%1.2f var6=%1.2f len6=%1.2f '
    #       'vary=%1.2f, nlml=%2.2f' %
    #       (i, prior_params[0][0], prior_params[0][1],
    #        prior_params[1][0], prior_params[1][1],
    #        prior_params[2][0], prior_params[2][1],
    #        prior_params[3][0], prior_params[3][1],
    #        prior_params[4][0], prior_params[4][1],
    #        prior_params[5][0], prior_params[5][1],
    #        softplus(params[1]), neg_log_marg_lik))
    print(
        'iter %2d: len1=%1.2f om1=%1.2f len2=%1.2f om2=%1.2f len3=%1.2f om3=%1.2f '
        'len4=%1.2f len5=%1.2f len6=%1.2f '
        'vary=%1.2f, nlml=%2.2f' %
        (i, prior_params[0][0], prior_params[0][1], prior_params[1][0],
         prior_params[1][1], prior_params[2][0], prior_params[2][1],
         prior_params[3], prior_params[4], prior_params[5], softplus(
             params[1]), neg_log_marg_lik))

    if plot_intermediate:
        plot(mod, i)

    return opt_update(i, gradients, state)
Example #21
for _ in range(args.maxIter):

    # Maximize ELBO
    grads = elementwise_grad(elbo)(
        (lambda_pi, lambda_phi, lambda_m, lambda_beta, lambda_nu, lambda_w))

    # Variational parameter updates (gradient ascent)
    lambda_pi -= ps['lambda_pi'] * grads[0]
    lambda_phi -= ps['lambda_phi'] * grads[1]
    lambda_m -= ps['lambda_m'] * grads[2]
    lambda_beta -= ps['lambda_beta'] * grads[3]
    lambda_nu -= ps['lambda_nu'] * grads[4]
    lambda_w -= ps['lambda_w'] * grads[5]

    lambda_phi = agnp.array([softmax(lambda_phi[i]) for i in range(N)])
    lambda_beta = softplus(lambda_beta)
    lambda_nu = softplus(lambda_nu)
    lambda_pi = softplus(lambda_pi)
    lambda_w = agnp.array(
        [agnp.dot(lambda_w[k], lambda_w[k].T) for k in range(K)])

    # ELBO computation
    lb = elbo(
        (lambda_pi, lambda_phi, lambda_m, lambda_beta, lambda_nu, lambda_w))
    lbs.append(lb)

    if VERBOSE:
        print('\n******* ITERATION {} *******'.format(n_iters))
        print('lambda_pi: {}'.format(lambda_pi))
        print('lambda_beta: {}'.format(lambda_beta))
        print('lambda_nu: {}'.format(lambda_nu))
Example #22
 def __init__(self, num_ds_dim=4):
     super(SigmoidFlow, self).__init__()
     self.num_ds_dim = num_ds_dim
     self.act_a = lambda x: utils.softplus(x)
     self.act_b = lambda x: x
     self.act_w = lambda x: utils.softmax(x, dim=2)
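
These three activations (softplus for a, identity for b, softmax for w) parameterise a monotone mixture-of-sigmoids transform, as in deep sigmoidal flows. A rough per-dimension sketch under that assumption (not this repository's forward pass):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_flow_step(x, pre_a, pre_b, pre_w):
    a = np.logaddexp(0.0, pre_a)               # softplus keeps slopes positive
    w = np.exp(pre_w) / np.sum(np.exp(pre_w))  # softmax gives a convex combination
    y = np.dot(w, sigmoid(a * x + pre_b))      # monotone in x, lies in (0, 1)
    return np.log(y) - np.log1p(-y)            # map back to the real line
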
Example #23
 def predict(self, y=None, dt=None, mask=None, site_params=None, sampling=False,
             r=None, return_full=False, compute_nlpd=True):
     """
     Calculate posterior predictive distribution p(f*|f,y) by filtering and smoothing across the
     training & test locations.
      This function is also used during posterior sampling to smooth the auxiliary data sampled from the prior.
     The output shapes depend on return_full
     :param y: observations (nans at test locations) [M, 1]
     :param dt: step sizes Δtₙ = tₙ - tₙ₋₁ [M, 1]
     :param mask: a boolean array signifying which elements are observed and which are nan [M, 1]
      :param site_params: the sites computed during a previous inference procedure [2, M, obs_dim]
     :param sampling: notify whether we are doing posterior sampling
     :param r: spatial locations [M, R]
     :param return_full: flag to notify if we are handling the case where spatial test locations are a different
                         size to training locations
     :param compute_nlpd: flag to notify whether to compute the negative log predictive density of the test data
     :return:
         posterior_mean: the posterior predictive mean [M, state_dim] or [M, obs_dim]
         posterior_cov: the posterior predictive (co)variance [M, M, state_dim] or [M, obs_dim]
         site_params: the site parameters. If none are provided then new sites are computed [2, M, obs_dim]
     """
     y = self.y_all if y is None else y
     r = self.r_all if r is None else r
     dt = self.dt_all if dt is None else dt
     mask = self.mask if mask is None else mask
     params = [self.prior.hyp.copy(), self.likelihood.hyp.copy()]
     site_params = self.sites.site_params if site_params is None else site_params
     if site_params is not None and not sampling:
         # construct a vector of site parameters that is the full size of the test data
         # test site parameters are 𝓝(0,∞), and will not be used
         site_mean = np.zeros([dt.shape[0], self.func_dim, 1])
         site_cov = 1e5 * np.tile(np.eye(self.func_dim), (dt.shape[0], 1, 1))
         # replace parameters at training locations with the supplied sites
         site_mean = index_add(site_mean, index[self.train_id], site_params[0])
         site_cov = index_update(site_cov, index[self.train_id], site_params[1])
         site_params = (site_mean, site_cov)
     _, (filter_mean, filter_cov, site_params) = self.kalman_filter(y, dt, params, True, mask, site_params, r)
     _, posterior_mean, posterior_cov = self.rauch_tung_striebel_smoother(params, filter_mean, filter_cov, dt,
                                                                          True, return_full, None, None, r)
     if compute_nlpd:
         nlpd_test = self.negative_log_predictive_density(self.t_all[self.test_id], self.y_all[self.test_id],
                                                          posterior_mean[self.test_id],
                                                          posterior_cov[self.test_id],
                                                          softplus_list(params[0]), softplus(params[1]),
                                                          return_full)
     else:
         nlpd_test = np.nan
     # in the spatial model, the train and test points may be of different size. This deals with that situation:
     if return_full:
         measure_func = vmap(
             self.compute_measurement, (0, 0, 0, None)
         )
         posterior_mean, posterior_cov = measure_func(self.r_test,
                                                      posterior_mean[self.test_id], posterior_cov[self.test_id],
                                                      softplus_list(self.prior.hyp))
     return posterior_mean, posterior_cov, site_params, nlpd_test
Example #24
 def variance(self):
     return softplus(self.hyp)
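
Storing hyp unconstrained and exposing the variance through softplus is the usual pattern in these snippets; the matching inverse transform (assumed here, not shown in the snippet) recovers the unconstrained value from a positive variance:

import numpy as np

def softplus(x):
    return np.logaddexp(0.0, x)

def softplus_inv(x):
    # unconstrained value whose softplus equals x, valid for x > 0
    return np.log(np.expm1(x))
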
Example #25
              "wb") as fp:
        pickle.dump(nlpd, fp)

    # with open("output/" + str(method) + "_" + str(fold) + "_nlpd.txt", "rb") as fp:
    #     nlpd_show = pickle.load(fp)
    # print(nlpd_show)

if plot_final:
    x_pred = model.t_all[:, 0]
    # link = model.likelihood.link_fn
    # lb = posterior_mean[:, 0, 0] - np.sqrt(posterior_var[:, 0, 0]) * 1.96
    # ub = posterior_mean[:, 0, 0] + np.sqrt(posterior_var[:, 0, 0]) * 1.96
    test_id = model.test_id

    posterior_mean_subbands = posterior_mean[:, :3, 0]
    posterior_mean_modulators = softplus(posterior_mean[:, 3:, 0])
    posterior_mean_sig = np.sum(posterior_mean_subbands *
                                posterior_mean_modulators,
                                axis=-1)
    posterior_var_subbands = posterior_var[:, :3, 0]
    posterior_var_modulators = softplus(posterior_var[:, 3:, 0])

    print('plotting ...')
    plt.figure(1, figsize=(12, 5))
    plt.clf()
    plt.plot(x, y, 'k', label='signal', linewidth=0.6)
    plt.plot(x_test, y_test, 'g.', label='test', markersize=4)
    plt.plot(x_pred,
             posterior_mean_sig,
             'r',
             label='posterior mean',
Example #26
 def sparsity_cost(self, vis):
     p_target = tf.constant(0.01, dtype=tf.float32, shape=[1, self.num_hid])
     h_total_input = tf.matmul(vis, self.weights) + self.hbias
     penalty = (-tf.matmul(p_target, h_total_input, transpose_b=True) +
                tf.reduce_sum(utils.softplus(h_total_input), 1))
     return tf.reduce_mean(penalty)
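
The penalty above is, term by term, the Bernoulli cross-entropy between the target sparsity p and the hidden activation probability sigmoid(a), since -p·a + softplus(a) = -p·log σ(a) - (1-p)·log(1-σ(a)). A quick NumPy check of that identity:

import numpy as np

softplus = lambda x: np.logaddexp(0.0, x)
sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))

a, p = 0.3, 0.01                              # hidden pre-activation, target sparsity
lhs = -p * a + softplus(a)
rhs = -p * np.log(sigmoid(a)) - (1 - p) * np.log(1 - sigmoid(a))
assert np.isclose(lhs, rhs)
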
Example #27
 def kalman_filter(self,
                   y,
                   dt,
                   params,
                   store=False,
                   mask=None,
                   site_params=None,
                   r=None):
     """
     Run the Kalman filter to get p(fₙ|y₁,...,yₙ).
      The Kalman update step involves some control flow to work out whether we are
         i) initialising the sites
         ii) using supplied sites
         iii) performing a Gaussian update with fixed parameters (e.g. in posterior sampling or ELBO calc.)
     If store is True then we compute and return the intermediate filtering distributions
     p(fₙ|y₁,...,yₙ) and sites sₙ(fₙ), otherwise we do not store the intermediates and simply
     return the energy / negative log-marginal likelihood, -log p(y).
     :param y: observed data [N, obs_dim]
     :param dt: step sizes Δtₙ = tₙ - tₙ₋₁ [N, 1]
     :param params: the model parameters, i.e the hyperparameters of the prior & likelihood
     :param store: flag to notify whether to store the intermediates
     :param mask: boolean array signifying which elements of y are observed [N, obs_dim]
     :param site_params: the Gaussian approximate likelihoods [2, N, obs_dim]
     :param r: spatial input locations
     :return:
         if store is True:
             neg_log_marg_lik: the filter energy, i.e. negative log-marginal likelihood -log p(y),
                               used for hyperparameter optimisation (learning) [scalar]
             filtered_mean: intermediate filtering means [N, state_dim, 1]
             filtered_cov: intermediate filtering covariances [N, state_dim, state_dim]
             site_mean: mean of the approximate likelihood sₙ(fₙ) [N, obs_dim]
             site_cov: variance of the approximate likelihood sₙ(fₙ) [N, obs_dim]
         otherwise:
             neg_log_marg_lik: the filter energy, i.e. negative log-marginal likelihood -log p(y),
                               used for hyperparameter optimisation (learning) [scalar]
     """
     theta_prior, theta_lik = softplus_list(params[0]), softplus(params[1])
     self.update_model(
         theta_prior
     )  # all model components that are not static must be computed inside the function
     N = dt.shape[0]
     with loops.Scope() as s:
         s.neg_log_marg_lik = 0.0  # negative log-marginal likelihood
         s.m, s.P = self.minf, self.Pinf
         if store:
             s.filtered_mean = np.zeros([N, self.state_dim, 1])
             s.filtered_cov = np.zeros([N, self.state_dim, self.state_dim])
             s.site_mean = np.zeros([N, self.func_dim, 1])
             s.site_cov = np.zeros([N, self.func_dim, self.func_dim])
         for n in s.range(N):
             y_n = y[n][..., np.newaxis]
             # -- KALMAN PREDICT --
             #  mₙ⁻ = Aₙ mₙ₋₁
             #  Pₙ⁻ = Aₙ Pₙ₋₁ Aₙ' + Qₙ, where Qₙ = Pinf - Aₙ Pinf Aₙ'
             A = self.prior.state_transition(dt[n], theta_prior)
             m_ = A @ s.m
             P_ = A @ (s.P - self.Pinf) @ A.T + self.Pinf
             # --- KALMAN UPDATE ---
             # Given previous predicted mean mₙ⁻ and cov Pₙ⁻, incorporate yₙ to get filtered mean mₙ &
             # cov Pₙ and compute the marginal likelihood p(yₙ|y₁,...,yₙ₋₁)
             H = self.prior.measurement_model(r[n], theta_prior)
             predict_mean = H @ m_
             predict_cov = H @ P_ @ H.T
             if mask is not None:  # note: this is a bit redundant but may come in handy in multi-output problems
                 y_n = np.where(mask[n][..., np.newaxis],
                                predict_mean[:y_n.shape[0]],
                                y_n)  # fill in masked obs with expectation
             log_lik_n, site_mean, site_cov = self.sites.update(
                 self.likelihood, y_n, predict_mean, predict_cov, theta_lik,
                 None)
             if site_params is not None:  # use supplied site parameters to perform the update
                 site_mean, site_cov = site_params[0][n], site_params[1][n]
              # modified Kalman update (see Nickisch et al., ICML 2018, or Wilkinson et al., ICML 2019):
             S = predict_cov + site_cov
             HP = H @ P_
             K = solve(S, HP).T  # PH'(S^-1)
             s.m = m_ + K @ (site_mean - predict_mean)
             s.P = P_ - K @ HP
             if mask is not None:  # note: this is a bit redundant but may come in handy in multi-output problems
                 s.m = np.where(np.any(mask[n]), m_, s.m)
                 s.P = np.where(np.any(mask[n]), P_, s.P)
                 log_lik_n = np.where(mask[n][..., 0],
                                      np.zeros_like(log_lik_n), log_lik_n)
             s.neg_log_marg_lik -= np.sum(log_lik_n)
             if store:
                 s.filtered_mean = index_add(s.filtered_mean, index[n, ...],
                                             s.m)
                 s.filtered_cov = index_add(s.filtered_cov, index[n, ...],
                                            s.P)
                 s.site_mean = index_add(s.site_mean, index[n, ...],
                                         site_mean)
                 s.site_cov = index_add(s.site_cov, index[n, ...], site_cov)
     if store:
         return s.neg_log_marg_lik, (s.filtered_mean, s.filtered_cov,
                                     (s.site_mean, s.site_cov))
     return s.neg_log_marg_lik
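
For comparison, a single textbook Kalman predict/update step for a linear-Gaussian observation model looks as follows (a generic sketch; the filter above instead updates against site parameters and uses the stationary-prior form of Q):

import numpy as np

def kalman_step(m, P, y, A, Q, H, R):
    # predict: m⁻ = A m,  P⁻ = A P Aᵀ + Q
    m_pred = A @ m
    P_pred = A @ P @ A.T + Q
    # update with yₙ, observation noise covariance R
    S = H @ P_pred @ H.T + R
    K = np.linalg.solve(S, H @ P_pred).T      # Kalman gain P⁻Hᵀ S⁻¹
    resid = y - H @ m_pred
    m_new = m_pred + K @ resid
    P_new = P_pred - K @ H @ P_pred
    # marginal-likelihood contribution log p(yₙ | y₁,...,yₙ₋₁)
    log_lik = -0.5 * (resid.T @ np.linalg.solve(S, resid)
                      + np.linalg.slogdet(2.0 * np.pi * S)[1])
    return m_new, P_new, np.squeeze(log_lik)
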
Example #28
 def rauch_tung_striebel_smoother(self,
                                  params,
                                  m_filtered,
                                  P_filtered,
                                  dt,
                                  store=False,
                                  return_full=False,
                                  y=None,
                                  site_params=None,
                                  r=None):
     """
     Run the RTS smoother to get p(fₙ|y₁,...,y_N),
     i.e. compute p(f)𝚷ₙsₙ(fₙ) where sₙ(fₙ) are the sites (approx. likelihoods).
     If sites are provided, then it is assumed they are to be updated, which is done by
     calling the site-specific update() method.
     :param params: the model parameters, i.e the hyperparameters of the prior & likelihood
     :param m_filtered: the intermediate distribution means computed during filtering [N, state_dim, 1]
     :param P_filtered: the intermediate distribution covariances computed during filtering [N, state_dim, state_dim]
     :param dt: step sizes Δtₙ = tₙ - tₙ₋₁ [N, 1]
     :param store: a flag determining whether to store and return state mean and covariance
     :param return_full: a flag determining whether to return the full state distribution or just the function(s)
     :param y: observed data [N, obs_dim]
     :param site_params: the Gaussian approximate likelihoods [2, N, obs_dim]
     :param r: spatial input locations
     :return:
         var_exp: the sum of the variational expectations [scalar]
         smoothed_mean: the posterior marginal means [N, obs_dim]
         smoothed_var: the posterior marginal variances [N, obs_dim]
         site_params: the updated sites [2, N, obs_dim]
     """
     theta_prior, theta_lik = softplus_list(params[0]), softplus(params[1])
     self.update_model(
         theta_prior
     )  # all model components that are not static must be computed inside the function
     N = dt.shape[0]
     dt = np.concatenate([dt[1:], np.array([0.0])], axis=0)
     with loops.Scope() as s:
         s.m, s.P = m_filtered[-1, ...], P_filtered[-1, ...]
         if return_full:
             s.smoothed_mean = np.zeros([N, self.state_dim, 1])
             s.smoothed_cov = np.zeros([N, self.state_dim, self.state_dim])
         else:
             s.smoothed_mean = np.zeros([N, self.func_dim, 1])
             s.smoothed_cov = np.zeros([N, self.func_dim, self.func_dim])
         if site_params is not None:
             s.site_mean = np.zeros([N, self.func_dim, 1])
             s.site_var = np.zeros([N, self.func_dim, self.func_dim])
         for n in s.range(N - 1, -1, -1):
             # --- First compute the smoothing distribution: ---
             A = self.prior.state_transition(
                 dt[n], theta_prior
             )  # closed form integration of transition matrix
             m_predicted = A @ m_filtered[n, ...]
             tmp_gain_cov = A @ P_filtered[n, ...]
             P_predicted = A @ (P_filtered[n, ...] -
                                self.Pinf) @ A.T + self.Pinf
             # backward Kalman gain:
             # G = F * A' * P^{-1}
             # since both F(iltered) and P(redictive) are cov matrices, thus self-adjoint, we can take the transpose:
             #   = (P^{-1} * A * F)'
             G_transpose = solve(P_predicted, tmp_gain_cov)  # (P^-1)AF
             s.m = m_filtered[n, ...] + G_transpose.T @ (s.m - m_predicted)
             s.P = P_filtered[
                 n, ...] + G_transpose.T @ (s.P - P_predicted) @ G_transpose
             H = self.prior.measurement_model(r[n], theta_prior)
             if store:
                 if return_full:
                     s.smoothed_mean = index_add(s.smoothed_mean,
                                                 index[n, ...], s.m)
                     s.smoothed_cov = index_add(s.smoothed_cov,
                                                index[n, ...], s.P)
                 else:
                     s.smoothed_mean = index_add(s.smoothed_mean,
                                                 index[n, ...], H @ s.m)
                     s.smoothed_cov = index_add(s.smoothed_cov, index[n,
                                                                      ...],
                                                H @ s.P @ H.T)
             # --- Now update the site parameters: ---
             if site_params is not None:
                 # extract mean and var from state:
                 post_mean, post_cov = H @ s.m, H @ s.P @ H.T
                 # calculate the new sites
                 _, site_mu, site_cov = self.sites.update(
                     self.likelihood, y[n][...,
                                           np.newaxis], post_mean, post_cov,
                     theta_lik, (site_params[0][n], site_params[1][n]))
                 s.site_mean = index_add(s.site_mean, index[n, ...],
                                         site_mu)
                 s.site_var = index_add(s.site_var, index[n, ...], site_cov)
     if site_params is not None:
         site_params = (s.site_mean, s.site_var)
     if store:
         return site_params, s.smoothed_mean, s.smoothed_cov
     return site_params
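
The backward recursion above follows the standard Rauch-Tung-Striebel form; one generic smoothing step can be sketched as (illustrative, without the site updates):

import numpy as np

def rts_step(m_filt, P_filt, m_next_smoothed, P_next_smoothed, A, Q):
    m_pred = A @ m_filt
    P_pred = A @ P_filt @ A.T + Q
    G = np.linalg.solve(P_pred, A @ P_filt).T          # smoother gain P Aᵀ (P⁻)⁻¹
    m_smoothed = m_filt + G @ (m_next_smoothed - m_pred)
    P_smoothed = P_filt + G @ (P_next_smoothed - P_pred) @ G.T
    return m_smoothed, P_smoothed
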
Example #29
    with open("output/" + str(method) + "_" + str(fold) + "_nlpd.txt",
              "wb") as fp:
        pickle.dump(nlpd, fp)

    # with open("output/" + str(method) + "_" + str(fold) + "_nlpd.txt", "rb") as fp:
    #     nlpd_show = pickle.load(fp)
    # print(nlpd_show)

if plot_final:

    def diag(Q):
        vectorised_diag = vmap(jnp.diag, 0)
        return vectorised_diag(Q)

    posterior_mean_subbands = posterior_mean[:, :3]
    posterior_mean_modulators = softplus(posterior_mean[:, 3:])
    posterior_mean_sig = np.sum(posterior_mean_subbands *
                                posterior_mean_modulators,
                                axis=-1)
    posterior_var_subbands = diag(posterior_var[:, :3, :3])
    posterior_var_modulators = softplus(diag(posterior_var[:, 3:, 3:]))
    lb_subbands = posterior_mean_subbands - np.sqrt(
        posterior_var_subbands) * 1.96
    ub_subbands = posterior_mean_subbands + np.sqrt(
        posterior_var_subbands) * 1.96
    lb_modulators = softplus(posterior_mean_modulators -
                             np.sqrt(posterior_var_modulators) * 1.96)
    ub_modulators = softplus(posterior_mean_modulators +
                             np.sqrt(posterior_var_modulators) * 1.96)

    color1 = [0.2667, 0.4471, 0.7098]  # blue
    def __init__(self,
                 numberOfInducingPoints,  # Number of inducing points in sparse GP
                 batchSize,              # Size of mini batch
                 dimX,                   # Dimensionality of the latent co-ordinates
                 dimZ,                   # Dimensionality of the latent variables
                 data,                   # [NxP] matrix of observations
                 kernelType='ARD',
                 encoderType_qX='FreeForm2',  # 'MLP', 'Kernel'.
                 encoderType_rX='FreeForm2',  # 'MLP', 'Kernel'
                 Xu_optimise=False,
                 numberOfEncoderHiddenUnits=10
                 ):

        self.numTestSamples = 5000

        # set the data
        data = np.asarray(data, dtype=precision)
        self.N = data.shape[0]  # Number of observations
        self.P = data.shape[1]  # Dimension of each observation
        self.M = numberOfInducingPoints
        self.B = batchSize
        self.R = dimX
        self.Q = dimZ
        self.H = numberOfEncoderHiddenUnits

        self.encoderType_qX = encoderType_qX
        self.encoderType_rX = encoderType_rX
        self.Xu_optimise = Xu_optimise

        self.y = th.shared(data)
        self.y.name = 'y'

        if kernelType == 'RBF':
            self.numberOfKernelParameters = 2
        elif kernelType == 'RBFnn':
            self.numberOfKernelParameters = 1
        elif kernelType == 'ARD':
            self.numberOfKernelParameters = self.R + 1
        else:
            raise RuntimeError('Unrecognised kernel type')

        self.lowerBound = -np.inf  # Lower bound

        self.numberofBatchesPerEpoch = int(np.ceil(np.float32(self.N) / self.B))
        numPad = self.numberofBatchesPerEpoch * self.B - self.N

        self.batchStream = srng.permutation(n=self.N)
        self.padStream   = srng.choice(size=(numPad,), a=self.N,
                                       replace=False, p=None, ndim=None, dtype='int32')

        self.batchStream.name = 'batchStream'
        self.padStream.name = 'padStream'

        self.iterator = th.shared(0)
        self.iterator.name = 'iterator'

        self.allBatches = T.reshape(T.concatenate((self.batchStream, self.padStream)), [self.numberofBatchesPerEpoch, self.B])
        self.currentBatch = T.flatten(self.allBatches[self.iterator, :])

        self.allBatches.name = 'allBatches'
        self.currentBatch.name = 'currentBatch'

        self.y_miniBatch = self.y[self.currentBatch, :]
        self.y_miniBatch.name = 'y_miniBatch'

        self.jitterDefault = np.float64(0.0001)
        self.jitterGrowthFactor = np.float64(1.1)
        self.jitter = th.shared(np.asarray(self.jitterDefault, dtype='float64'), name='jitter')

        kfactory = kernelFactory(kernelType)

        # kernel parameters
        self.log_theta = sharedZeroMatrix(1, self.numberOfKernelParameters, 'log_theta', broadcastable=(True,False)) # parameters of Kuu, Kuf, Kff
        self.log_omega = sharedZeroMatrix(1, self.numberOfKernelParameters, 'log_omega', broadcastable=(True,False)) # parameters of the kernel used for Tau_r in the 'Kernel' encoder for r(X|z)
        self.log_gamma = sharedZeroMatrix(1, self.numberOfKernelParameters, 'log_gamma', broadcastable=(True,False)) # parameters of the kernel used for Phi in the 'Kernel' encoder for q(X)

        # Random variables
        self.xi    = srng.normal(size=(self.B, self.R), avg=0.0, std=1.0, ndim=None)
        self.alpha = srng.normal(size=(self.M, self.Q), avg=0.0, std=1.0, ndim=None)
        self.beta  = srng.normal(size=(self.B, self.Q), avg=0.0, std=1.0, ndim=None)
        self.xi.name    = 'xi'
        self.alpha.name = 'alpha'
        self.beta.name  = 'beta'

        self.sample_xi    = th.function([], self.xi)
        self.sample_alpha = th.function([], self.alpha)
        self.sample_beta  = th.function([], self.beta)

        self.sample_batchStream = th.function([], self.batchStream)
        self.sample_padStream   = th.function([], self.padStream)

        self.getCurrentBatch = th.function([], self.currentBatch, no_default_updates=True)

        # Compute parameters of q(X)
        if self.encoderType_qX == 'FreeForm1' or self.encoderType_qX == 'FreeForm2':
            # Have a normal variational distribution over location of latent co-ordinates

            self.phi_full = sharedZeroMatrix(self.N, self.R, 'phi_full')
            self.phi = self.phi_full[self.currentBatch, :]
            self.phi.name = 'phi'

            if encoderType_qX == 'FreeForm1':

                self.Phi_full_sqrt = sharedZeroMatrix(self.N, self.N, 'Phi_full_sqrt')

                Phi_batch_sqrt = self.Phi_full_sqrt[self.currentBatch][:, self.currentBatch]
                Phi_batch_sqrt.name = 'Phi_batch_sqrt'

                self.Phi = dot(Phi_batch_sqrt, Phi_batch_sqrt.T, 'Phi')

                self.cPhi, _, self.logDetPhi = cholInvLogDet(self.Phi, self.B, 0)

                self.qX_vars = [self.Phi_full_sqrt, self.phi_full]

            else:

                self.Phi_full_logdiag = sharedZeroArray(self.N, 'Phi_full_logdiag')

                Phi_batch_logdiag = self.Phi_full_logdiag[self.currentBatch]
                Phi_batch_logdiag.name = 'Phi_batch_logdiag'

                self.Phi, self.cPhi, _, self.logDetPhi \
                    = diagCholInvLogDet_fromLogDiag(Phi_batch_logdiag, 'Phi')

                self.qX_vars = [self.Phi_full_logdiag, self.phi_full]

        elif self.encoderType_qX == 'MLP':

            # Auto encode
            self.W1_qX = sharedZeroMatrix(self.H, self.P, 'W1_qX')
            self.W2_qX = sharedZeroMatrix(self.R, self.H, 'W2_qX')
            self.W3_qX = sharedZeroMatrix(1, self.H, 'W3_qX')
            self.b1_qX = sharedZeroVector(self.H, 'b1_qX', broadcastable=(False, True))
            self.b2_qX = sharedZeroVector(self.R, 'b2_qX', broadcastable=(False, True))
            self.b3_qX = sharedZeroVector(1, 'b3_qX', broadcastable=(False, True))

            # [HxB] = softplus( [HxP] . [BxP]^T + repmat([Hx1],[1,B]) )
            h_qX = softplus(plus(dot(self.W1_qX, self.y_miniBatch.T), self.b1_qX), 'h_qX' )
            # [RxB] = [RxH] . [HxB] + repmat([Rx1],[1,B])
            mu_qX = plus(dot(self.W2_qX, h_qX), self.b2_qX, 'mu_qX')
            # [1xB] = 0.5 * ( [1xH] . [HxB] + repmat([1x1],[1,B]) )
            log_sigma_qX = mul( 0.5, plus(dot(self.W3_qX, h_qX), self.b3_qX), 'log_sigma_qX')

            self.phi  = mu_qX.T  # [BxR]
            self.Phi, self.cPhi, self.iPhi,self.logDetPhi \
                = diagCholInvLogDet_fromLogDiag(log_sigma_qX, 'Phi')

            self.qX_vars = [self.W1_qX, self.W2_qX, self.W3_qX, self.b1_qX, self.b2_qX, self.b3_qX]

        elif self.encoderType_qX == 'Kernel':

            # Draw the latent coordinates from a GP with data co-ordinates
            self.Phi = kfactory.kernel(self.y_miniBatch, None, self.log_gamma, 'Phi')
            self.phi = sharedZeroMatrix(self.B, self.R, 'phi')
            (self.cPhi, self.iPhi, self.logDetPhi) = cholInvLogDet(self.Phi, self.B, self.jitter)

            self.qX_vars = [self.log_gamma]

        else:
            raise RuntimeError('Unrecognised encoding for q(X): ' + self.encoderType_qX)

        # Variational distribution q(u)
        self.kappa = sharedZeroMatrix(self.M, self.Q, 'kappa')
        self.Kappa_sqrt = sharedZeroMatrix(self.M, self.M, 'Kappa_sqrt')
        self.Kappa = dot(self.Kappa_sqrt, self.Kappa_sqrt.T, 'Kappa')

        (self.cKappa, self.iKappa, self.logDetKappa) \
                    = cholInvLogDet(self.Kappa, self.M, 0)
        self.qu_vars = [self.Kappa_sqrt, self.kappa]

        # Calculate latent co-ordinates Xf
        # [BxR]  = [BxR] + [BxB] . [BxR]
        self.Xz = plus( self.phi, dot(self.cPhi, self.xi), 'Xf' )
        # Inducing points co-ordinates
        self.Xu = sharedZeroMatrix(self.M, self.R, 'Xu')

        # Kernels
        self.Kzz = kfactory.kernel(self.Xz, None,    self.log_theta, 'Kff')
        self.Kuu = kfactory.kernel(self.Xu, None,    self.log_theta, 'Kuu')
        self.Kzu = kfactory.kernel(self.Xz, self.Xu, self.log_theta, 'Kfu')
        self.cKuu, self.iKuu, self.logDetKuu = cholInvLogDet(self.Kuu, self.M, self.jitter)

        # Variational distribution
        # A has dims [BxM] = [BxM] . [MxM]
        self.A = dot(self.Kzu, self.iKuu, 'A')
        # L is the covariance of conditional distribution q(z|u,Xf)
        self.C = minus( self.Kzz, dot(self.A, self.Kzu.T), 'C')
        self.cC, self.iC, self.logDetC = cholInvLogDet(self.C, self.B, self.jitter)

        # Sample u_q from q(u_q) = N(u_q; kappa_q, Kappa )  [MxQ]
        self.u  = plus(self.kappa, (dot(self.cKappa, self.alpha)), 'u')
        # compute mean of z [QxB]
        # [BxQ] = [BxM] * [MxQ]
        self.mu = dot(self.A, self.u, 'mu')
        # Sample f from q(f|u,X) = N( mu_q, C )
        # [BxQ] =
        self.z  = plus(self.mu, (dot(self.cC, self.beta)), 'z')

        self.qz_vars = [self.log_theta]

        self.iUpsilon = plus(self.iKappa, dot(self.A.T, dot(self.iC, self.A) ), 'iUpsilon')
        _, self.Upsilon, self.negLogDetUpsilon = cholInvLogDet(self.iUpsilon, self.M, self.jitter)

        if self.encoderType_rX == 'MLP':

            self.W1_rX = sharedZeroMatrix(self.H, self.Q+self.P, 'W1_rX')
            self.W2_rX = sharedZeroMatrix(self.R, self.H, 'W2_rX')
            self.W3_rX = sharedZeroMatrix(self.R, self.H, 'W3_rX')
            self.b1_rX = sharedZeroVector(self.H, 'b1_rX', broadcastable=(False, True))
            self.b2_rX = sharedZeroVector(self.R, 'b2_rX', broadcastable=(False, True))
            self.b3_rX = sharedZeroVector(self.R, 'b3_rX', broadcastable=(False, True))

            # [HxB] = softplus( [Hx(Q+P)] . [(Q+P)xB] + repmat([Hx1], [1,B]) )
            h_rX = softplus(plus(dot(self.W1_rX, T.concatenate((self.z.T, self.y_miniBatch.T))), self.b1_rX), 'h_rX')
            # [RxB] = [RxH] . [HxB] + repmat([Rx1], [1,B])
            mu_rX = plus(dot(self.W2_rX, h_rX), self.b2_rX, 'mu_rX')
            # [RxB] = 0.5*( [RxH] . [HxB] + repmat([Rx1], [1,B]) )
            log_sigma_rX = mul( 0.5, plus(dot(self.W3_rX, h_rX), self.b3_rX), 'log_sigma_rX')

            self.tau = mu_rX.T

            # Diagonal optimisation of Tau
            self.Tau_isDiagonal = True
            self.Tau = T.reshape(log_sigma_rX, [self.B * self.R, 1])
            self.logDetTau = T.sum(log_sigma_rX)
            self.Tau.name = 'Tau'
            self.logDetTau.name = 'logDetTau'

            self.rX_vars = [self.W1_rX, self.W2_rX, self.W3_rX, self.b1_rX, self.b2_rX, self.b3_rX]

        elif self.encoderType_rX == 'Kernel':

            self.tau = sharedZeroMatrix(self.B, self.R, 'tau')

            # Tau_r [BxB] = kernel( [[BxQ]^T,[BxP]^T].T )
            Tau_r = kfactory.kernel(T.concatenate((self.z.T, self.y_miniBatch.T)).T, None, self.log_omega, 'Tau_r')
            (cTau_r, iTau_r, logDetTau_r) = cholInvLogDet(Tau_r, self.B, self.jitter)

            # self.Tau  = slinalg.kron(T.eye(self.R), Tau_r)
            self.cTau = slinalg.kron(cTau_r, T.eye(self.R))
            self.iTau = slinalg.kron(iTau_r, T.eye(self.R))

            self.logDetTau = logDetTau_r * self.R
            self.tau.name  = 'tau'
            # self.Tau.name  = 'Tau'
            self.cTau.name = 'cTau'
            self.iTau.name = 'iTau'
            self.logDetTau.name = 'logDetTau'

            self.Tau_isDiagonal = False
            self.rX_vars = [self.log_omega]

        else:
            raise RuntimeError('Unrecognised encoding for r(X|z)')

        # Gradient variables - should be all the th.shared variables
        # We always want to optimise these variables
        if self.Xu_optimise:
            self.gradientVariables = [self.Xu]
        else:
            self.gradientVariables = []

        self.gradientVariables.extend(self.qu_vars)
        self.gradientVariables.extend(self.qz_vars)
        self.gradientVariables.extend(self.qX_vars)
        self.gradientVariables.extend(self.rX_vars)

        self.lowerBounds = []

        self.condKappa = myCond()(self.Kappa)
        self.condKappa.name = 'condKappa'
        self.Kappa_conditionNumber = th.function([], self.condKappa, no_default_updates=True)

        self.condKuu = myCond()(self.Kuu)
        self.condKuu.name = 'condKuu'
        self.Kuu_conditionNumber = th.function([], self.condKuu, no_default_updates=True)

        self.condC = myCond()(self.C)
        self.condC.name = 'condC'
        self.C_conditionNumber = th.function([], self.condC, no_default_updates=True)

        self.condUpsilon = myCond()(self.Upsilon)
        self.condUpsilon.name = 'condUpsilon'
        self.Upsilon_conditionNumber = th.function([], self.condUpsilon, no_default_updates=True)

        self.Xz_get_value = th.function([], self.Xz, no_default_updates=True)